[SCSI] aacraid: Reset adapter in recovery timeout

Received from Mark Salyzyn

If the adapter is in blinkled (Firmware Assert) when error recovery
timeout actions have been triggered, perform an adapter warm reset and
restart the initialization.

Signed-off-by: Mark Haverkamp <markh@osdl.org>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index 699351c..37c55dd 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -175,7 +175,7 @@
  *
  *	Query config status, and commit the configuration if needed.
  */
-int aac_get_config_status(struct aac_dev *dev)
+int aac_get_config_status(struct aac_dev *dev, int commit_flag)
 {
 	int status = 0;
 	struct fib * fibptr;
@@ -219,7 +219,7 @@
 	aac_fib_complete(fibptr);
 	/* Send a CT_COMMIT_CONFIG to enable discovery of devices */
 	if (status >= 0) {
-		if (commit == 1) {
+		if ((commit == 1) || commit_flag) {
 			struct aac_commit_config * dinfo;
 			aac_fib_init(fibptr);
 			dinfo = (struct aac_commit_config *) fib_data(fibptr);
@@ -784,8 +784,9 @@
 		dev->maximum_num_channels = le32_to_cpu(bus_info->BusCount);
 	}
 
-	tmp = le32_to_cpu(dev->adapter_info.kernelrev);
-	printk(KERN_INFO "%s%d: kernel %d.%d-%d[%d] %.*s\n", 
+	if (!dev->in_reset) {
+		tmp = le32_to_cpu(dev->adapter_info.kernelrev);
+		printk(KERN_INFO "%s%d: kernel %d.%d-%d[%d] %.*s\n",
 			dev->name, 
 			dev->id,
 			tmp>>24,
@@ -794,20 +795,21 @@
 			le32_to_cpu(dev->adapter_info.kernelbuild),
 			(int)sizeof(dev->supplement_adapter_info.BuildDate),
 			dev->supplement_adapter_info.BuildDate);
-	tmp = le32_to_cpu(dev->adapter_info.monitorrev);
-	printk(KERN_INFO "%s%d: monitor %d.%d-%d[%d]\n", 
+		tmp = le32_to_cpu(dev->adapter_info.monitorrev);
+		printk(KERN_INFO "%s%d: monitor %d.%d-%d[%d]\n",
 			dev->name, dev->id,
 			tmp>>24,(tmp>>16)&0xff,tmp&0xff,
 			le32_to_cpu(dev->adapter_info.monitorbuild));
-	tmp = le32_to_cpu(dev->adapter_info.biosrev);
-	printk(KERN_INFO "%s%d: bios %d.%d-%d[%d]\n", 
+		tmp = le32_to_cpu(dev->adapter_info.biosrev);
+		printk(KERN_INFO "%s%d: bios %d.%d-%d[%d]\n",
 			dev->name, dev->id,
 			tmp>>24,(tmp>>16)&0xff,tmp&0xff,
 			le32_to_cpu(dev->adapter_info.biosbuild));
-	if (le32_to_cpu(dev->adapter_info.serial[0]) != 0xBAD0)
-		printk(KERN_INFO "%s%d: serial %x\n",
-			dev->name, dev->id,
-			le32_to_cpu(dev->adapter_info.serial[0]));
+		if (le32_to_cpu(dev->adapter_info.serial[0]) != 0xBAD0)
+			printk(KERN_INFO "%s%d: serial %x\n",
+				dev->name, dev->id,
+				le32_to_cpu(dev->adapter_info.serial[0]));
+	}
 
 	dev->nondasd_support = 0;
 	dev->raid_scsi_mode = 0;
@@ -1417,6 +1419,9 @@
 		return SCSI_MLQUEUE_DEVICE_BUSY;
 
 	aac = (struct aac_dev *)scsicmd->device->host->hostdata;
+	if (aac->in_reset)
+		return SCSI_MLQUEUE_HOST_BUSY;
+
 	/*
 	 *	Allocate and initialize a Fib
 	 */
@@ -1504,6 +1509,8 @@
 				case INQUIRY:
 				case READ_CAPACITY:
 				case TEST_UNIT_READY:
+					if (dev->in_reset)
+						return -1;
 					spin_unlock_irq(host->host_lock);
 					aac_probe_container(dev, cid);
 					if ((fsa_dev_ptr[cid].valid & 1) == 0)
@@ -1529,6 +1536,8 @@
 			}
 		} else {  /* check for physical non-dasd devices */
 			if(dev->nondasd_support == 1){
+				if (dev->in_reset)
+					return -1;
 				return aac_send_srb_fib(scsicmd);
 			} else {
 				scsicmd->result = DID_NO_CONNECT << 16;
@@ -1584,6 +1593,8 @@
 			scsicmd->scsi_done(scsicmd);
 			return 0;
 		}
+		if (dev->in_reset)
+			return -1;
 		setinqstr(dev, (void *) (inq_data.inqd_vid), fsa_dev_ptr[cid].type);
 		inq_data.inqd_pdt = INQD_PDT_DA;	/* Direct/random access device */
 		aac_internal_transfer(scsicmd, &inq_data, 0, sizeof(inq_data));
@@ -1739,6 +1750,8 @@
 		case READ_10:
 		case READ_12:
 		case READ_16:
+			if (dev->in_reset)
+				return -1;
 			/*
 			 *	Hack to keep track of ordinal number of the device that
 			 *	corresponds to a container. Needed to convert
@@ -1757,6 +1770,8 @@
 		case WRITE_10:
 		case WRITE_12:
 		case WRITE_16:
+			if (dev->in_reset)
+				return -1;
 			return aac_write(scsicmd, cid);
 
 		case SYNCHRONIZE_CACHE:
diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index 05f8098..8924c18 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -1029,6 +1029,7 @@
 	  init->InitStructRevision==cpu_to_le32(ADAPTER_INIT_STRUCT_REVISION_4)
 	u8			raw_io_64;
 	u8			printf_enabled;
+	u8			in_reset;
 };
 
 #define aac_adapter_interrupt(dev) \
@@ -1789,7 +1790,7 @@
 int aac_fib_complete(struct fib * context);
 #define fib_data(fibctx) ((void *)(fibctx)->hw_fib->data)
 struct aac_dev *aac_init_adapter(struct aac_dev *dev);
-int aac_get_config_status(struct aac_dev *dev);
+int aac_get_config_status(struct aac_dev *dev, int commit_flag);
 int aac_get_containers(struct aac_dev *dev);
 int aac_scsi_cmd(struct scsi_cmnd *cmd);
 int aac_dev_ioctl(struct aac_dev *dev, int cmd, void __user *arg);
@@ -1800,6 +1801,7 @@
 unsigned int aac_response_normal(struct aac_queue * q);
 unsigned int aac_command_normal(struct aac_queue * q);
 unsigned int aac_intr_normal(struct aac_dev * dev, u32 Index);
+int aac_check_health(struct aac_dev * dev);
 int aac_command_thread(void *data);
 int aac_close_fib_context(struct aac_dev * dev, struct aac_fib_context *fibctx);
 int aac_fib_adapter_complete(struct fib * fibptr, unsigned short size);
diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c
index 14d7aa9..da1d3a9 100644
--- a/drivers/scsi/aacraid/commctrl.c
+++ b/drivers/scsi/aacraid/commctrl.c
@@ -298,7 +298,7 @@
 		spin_unlock_irqrestore(&dev->fib_lock, flags);
 		/* If someone killed the AIF aacraid thread, restart it */
 		status = !dev->aif_thread;
-		if (status && dev->queues && dev->fsa_dev) {
+		if (status && !dev->in_reset && dev->queues && dev->fsa_dev) {
 			/* Be paranoid, be very paranoid! */
 			kthread_stop(dev->thread);
 			ssleep(1);
diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c
index c67da132..53add53 100644
--- a/drivers/scsi/aacraid/commsup.c
+++ b/drivers/scsi/aacraid/commsup.c
@@ -40,8 +40,10 @@
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <scsi/scsi.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
+#include <scsi/scsi_cmnd.h>
 #include <asm/semaphore.h>
 
 #include "aacraid.h"
@@ -1054,6 +1056,262 @@
 
 }
 
+static int _aac_reset_adapter(struct aac_dev *aac)
+{
+	int index, quirks;
+	u32 ret;
+	int retval;
+	struct Scsi_Host *host;
+	struct scsi_device *dev;
+	struct scsi_cmnd *command;
+	struct scsi_cmnd *command_list;
+
+	/*
+	 * Assumptions:
+	 *	- host is locked.
+	 *	- in_reset is asserted, so no new i/o is getting to the
+	 *	  card.
+	 *	- The card is dead.
+	 */
+	host = aac->scsi_host_ptr;
+	scsi_block_requests(host);
+	aac_adapter_disable_int(aac);
+	spin_unlock_irq(host->host_lock);
+	kthread_stop(aac->thread);
+
+	/*
+	 *	If a positive health, means in a known DEAD PANIC
+	 * state and the adapter could be reset to `try again'.
+	 */
+	retval = aac_adapter_check_health(aac);
+	if (retval == 0)
+		retval = aac_adapter_sync_cmd(aac, IOP_RESET_ALWAYS,
+		  0, 0, 0, 0, 0, 0, &ret, NULL, NULL, NULL, NULL);
+	if (retval)
+		retval = aac_adapter_sync_cmd(aac, IOP_RESET,
+		  0, 0, 0, 0, 0, 0, &ret, NULL, NULL, NULL, NULL);
+
+	if (retval)
+		goto out;
+	if (ret != 0x00000001) {
+		retval = -ENODEV;
+		goto out;
+	}
+
+	index = aac->cardtype;
+
+	/*
+	 * Re-initialize the adapter, first free resources, then carefully
+	 * apply the initialization sequence to come back again. Only risk
+	 * is a change in Firmware dropping cache, it is assumed the caller
+	 * will ensure that i/o is queisced and the card is flushed in that
+	 * case.
+	 */
+	aac_fib_map_free(aac);
+	aac->hw_fib_va = NULL;
+	aac->hw_fib_pa = 0;
+	pci_free_consistent(aac->pdev, aac->comm_size, aac->comm_addr, aac->comm_phys);
+	aac->comm_addr = NULL;
+	aac->comm_phys = 0;
+	kfree(aac->queues);
+	aac->queues = NULL;
+	free_irq(aac->pdev->irq, aac);
+	kfree(aac->fsa_dev);
+	aac->fsa_dev = NULL;
+	if (aac_get_driver_ident(index)->quirks & AAC_QUIRK_31BIT) {
+		if (((retval = pci_set_dma_mask(aac->pdev, DMA_32BIT_MASK))) ||
+		  ((retval = pci_set_consistent_dma_mask(aac->pdev, DMA_32BIT_MASK))))
+			goto out;
+	} else {
+		if (((retval = pci_set_dma_mask(aac->pdev, 0x7FFFFFFFULL))) ||
+		  ((retval = pci_set_consistent_dma_mask(aac->pdev, 0x7FFFFFFFULL))))
+			goto out;
+	}
+	if ((retval = (*(aac_get_driver_ident(index)->init))(aac)))
+		goto out;
+	if (aac_get_driver_ident(index)->quirks & AAC_QUIRK_31BIT)
+		if ((retval = pci_set_dma_mask(aac->pdev, DMA_32BIT_MASK)))
+			goto out;
+	aac->thread = kthread_run(aac_command_thread, aac, aac->name);
+	if (IS_ERR(aac->thread)) {
+		retval = PTR_ERR(aac->thread);
+		goto out;
+	}
+	(void)aac_get_adapter_info(aac);
+	quirks = aac_get_driver_ident(index)->quirks;
+	if ((quirks & AAC_QUIRK_34SG) && (host->sg_tablesize > 34)) {
+ 		host->sg_tablesize = 34;
+ 		host->max_sectors = (host->sg_tablesize * 8) + 112;
+ 	}
+ 	if ((quirks & AAC_QUIRK_17SG) && (host->sg_tablesize > 17)) {
+ 		host->sg_tablesize = 17;
+ 		host->max_sectors = (host->sg_tablesize * 8) + 112;
+ 	}
+	aac_get_config_status(aac, 1);
+	aac_get_containers(aac);
+	/*
+	 * This is where the assumption that the Adapter is quiesced
+	 * is important.
+	 */
+	command_list = NULL;
+	__shost_for_each_device(dev, host) {
+		unsigned long flags;
+		spin_lock_irqsave(&dev->list_lock, flags);
+		list_for_each_entry(command, &dev->cmd_list, list)
+			if (command->SCp.phase == AAC_OWNER_FIRMWARE) {
+				command->SCp.buffer = (struct scatterlist *)command_list;
+				command_list = command;
+			}
+		spin_unlock_irqrestore(&dev->list_lock, flags);
+	}
+	while ((command = command_list)) {
+		command_list = (struct scsi_cmnd *)command->SCp.buffer;
+		command->SCp.buffer = NULL;
+		command->result = DID_OK << 16
+		  | COMMAND_COMPLETE << 8
+		  | SAM_STAT_TASK_SET_FULL;
+		command->SCp.phase = AAC_OWNER_ERROR_HANDLER;
+		command->scsi_done(command);
+	}
+	retval = 0;
+
+out:
+	aac->in_reset = 0;
+	scsi_unblock_requests(host);
+	spin_lock_irq(host->host_lock);
+	return retval;
+}
+
+int aac_check_health(struct aac_dev * aac)
+{
+	int BlinkLED;
+	unsigned long time_now, flagv = 0;
+	struct list_head * entry;
+	struct Scsi_Host * host;
+
+	/* Extending the scope of fib_lock slightly to protect aac->in_reset */
+	if (spin_trylock_irqsave(&aac->fib_lock, flagv) == 0)
+		return 0;
+
+	if (aac->in_reset || !(BlinkLED = aac_adapter_check_health(aac))) {
+		spin_unlock_irqrestore(&aac->fib_lock, flagv);
+		return 0; /* OK */
+	}
+
+	aac->in_reset = 1;
+
+	/* Fake up an AIF:
+	 *	aac_aifcmd.command = AifCmdEventNotify = 1
+	 *	aac_aifcmd.seqnum = 0xFFFFFFFF
+	 *	aac_aifcmd.data[0] = AifEnExpEvent = 23
+	 *	aac_aifcmd.data[1] = AifExeFirmwarePanic = 3
+	 *	aac.aifcmd.data[2] = AifHighPriority = 3
+	 *	aac.aifcmd.data[3] = BlinkLED
+	 */
+
+	time_now = jiffies/HZ;
+	entry = aac->fib_list.next;
+
+	/*
+	 * For each Context that is on the
+	 * fibctxList, make a copy of the
+	 * fib, and then set the event to wake up the
+	 * thread that is waiting for it.
+	 */
+	while (entry != &aac->fib_list) {
+		/*
+		 * Extract the fibctx
+		 */
+		struct aac_fib_context *fibctx = list_entry(entry, struct aac_fib_context, next);
+		struct hw_fib * hw_fib;
+		struct fib * fib;
+		/*
+		 * Check if the queue is getting
+		 * backlogged
+		 */
+		if (fibctx->count > 20) {
+			/*
+			 * It's *not* jiffies folks,
+			 * but jiffies / HZ, so do not
+			 * panic ...
+			 */
+			u32 time_last = fibctx->jiffies;
+			/*
+			 * Has it been > 2 minutes
+			 * since the last read off
+			 * the queue?
+			 */
+			if ((time_now - time_last) > aif_timeout) {
+				entry = entry->next;
+				aac_close_fib_context(aac, fibctx);
+				continue;
+			}
+		}
+		/*
+		 * Warning: no sleep allowed while
+		 * holding spinlock
+		 */
+		hw_fib = kmalloc(sizeof(struct hw_fib), GFP_ATOMIC);
+		fib = kmalloc(sizeof(struct fib), GFP_ATOMIC);
+		if (fib && hw_fib) {
+			struct aac_aifcmd * aif;
+
+			memset(hw_fib, 0, sizeof(struct hw_fib));
+			memset(fib, 0, sizeof(struct fib));
+			fib->hw_fib = hw_fib;
+			fib->dev = aac;
+			aac_fib_init(fib);
+			fib->type = FSAFS_NTC_FIB_CONTEXT;
+			fib->size = sizeof (struct fib);
+			fib->data = hw_fib->data;
+			aif = (struct aac_aifcmd *)hw_fib->data;
+			aif->command = cpu_to_le32(AifCmdEventNotify);
+		 	aif->seqnum = cpu_to_le32(0xFFFFFFFF);
+		 	aif->data[0] = cpu_to_le32(AifEnExpEvent);
+			aif->data[1] = cpu_to_le32(AifExeFirmwarePanic);
+		 	aif->data[2] = cpu_to_le32(AifHighPriority);
+			aif->data[3] = cpu_to_le32(BlinkLED);
+
+			/*
+			 * Put the FIB onto the
+			 * fibctx's fibs
+			 */
+			list_add_tail(&fib->fiblink, &fibctx->fib_list);
+			fibctx->count++;
+			/*
+			 * Set the event to wake up the
+			 * thread that will waiting.
+			 */
+			up(&fibctx->wait_sem);
+		} else {
+			printk(KERN_WARNING "aifd: didn't allocate NewFib.\n");
+			kfree(fib);
+			kfree(hw_fib);
+		}
+		entry = entry->next;
+	}
+
+	spin_unlock_irqrestore(&aac->fib_lock, flagv);
+
+	if (BlinkLED < 0) {
+		printk(KERN_ERR "%s: Host adapter dead %d\n", aac->name, BlinkLED);
+		goto out;
+	}
+
+	printk(KERN_ERR "%s: Host adapter BLINK LED 0x%x\n", aac->name, BlinkLED);
+
+	host = aac->scsi_host_ptr;
+	spin_lock_irqsave(host->host_lock, flagv);
+	BlinkLED = _aac_reset_adapter(aac);
+	spin_unlock_irqrestore(host->host_lock, flagv);
+	return BlinkLED;
+
+out:
+	aac->in_reset = 0;
+	return BlinkLED;
+}
+
+
 /**
  *	aac_command_thread	-	command processing thread
  *	@dev: Adapter to monitor
diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c
index 9d8b550..d67058f 100644
--- a/drivers/scsi/aacraid/linit.c
+++ b/drivers/scsi/aacraid/linit.c
@@ -454,17 +454,17 @@
 	printk(KERN_ERR "%s: Host adapter reset request. SCSI hang ?\n", 
 					AAC_DRIVERNAME);
 	aac = (struct aac_dev *)host->hostdata;
-	if (aac_adapter_check_health(aac)) {
-		printk(KERN_ERR "%s: Host adapter appears dead\n", 
-				AAC_DRIVERNAME);
-		return -ENODEV;
-	}
+
+	if ((count = aac_check_health(aac)))
+		return count;
 	/*
 	 * Wait for all commands to complete to this specific
 	 * target (block maximum 60 seconds).
 	 */
 	for (count = 60; count; --count) {
-		int active = 0;
+		int active = aac->in_reset;
+
+		if (active == 0)
 		__shost_for_each_device(dev, host) {
 			spin_lock_irqsave(&dev->list_lock, flags);
 			list_for_each_entry(command, &dev->cmd_list, list) {
@@ -933,7 +933,7 @@
 	else
 		shost->max_channel = 0;
 
-	aac_get_config_status(aac);
+	aac_get_config_status(aac, 0);
 	aac_get_containers(aac);
 	list_add(&aac->entry, insert);