File: /usr/src/linux/drivers/scsi/scsi_error.c

1     /*
2      *  scsi_error.c Copyright (C) 1997 Eric Youngdale
3      *
4      *  SCSI error/timeout handling
5      *      Initial versions: Eric Youngdale.  Based upon conversations with
6      *                        Leonard Zubkoff and David Miller at Linux Expo, 
7      *                        ideas originating from all over the place.
8      *
9      */
10     
11     #define __NO_VERSION__
12     #include <linux/module.h>
13     
14     #include <linux/sched.h>
15     #include <linux/timer.h>
16     #include <linux/string.h>
17     #include <linux/slab.h>
18     #include <linux/ioport.h>
19     #include <linux/kernel.h>
20     #include <linux/stat.h>
21     #include <linux/blk.h>
22     #include <linux/interrupt.h>
23     #include <linux/delay.h>
24     #include <linux/smp_lock.h>
25     
26     #define __KERNEL_SYSCALLS__
27     
28     #include <linux/unistd.h>
29     
30     #include <asm/system.h>
31     #include <asm/irq.h>
32     #include <asm/dma.h>
33     
34     #include "scsi.h"
35     #include "hosts.h"
36     #include "constants.h"
37     
38     /*
39      * We must always allow SHUTDOWN_SIGS.  Even if we are not a module,
40      * the host drivers that we are using may be loaded as modules, and
41      * when we unload these,  we need to ensure that the error handler thread
42      * can be shut down.
43      *
44      * Note - when we unload a module, we send a SIGHUP.  We mustn't
45      * enable SIGTERM, as this is how the init shuts things down when you
46      * go to single-user mode.  For that matter, init also sends SIGKILL,
47      * so we mustn't enable that one either.  We use SIGHUP instead.  Other
48      * options would be SIGPWR, I suppose.
49      */
50     #define SHUTDOWN_SIGS	(sigmask(SIGHUP))	/* only SIGHUP is allowed, see above */
51     
52     #ifdef DEBUG	/* debug builds: every recovery timeout is SCSI_TIMEOUT */
53     #define SENSE_TIMEOUT SCSI_TIMEOUT
54     #define ABORT_TIMEOUT SCSI_TIMEOUT
55     #define RESET_TIMEOUT SCSI_TIMEOUT
56     #else		/* otherwise: fixed timeouts expressed in jiffies (multiples of HZ) */
57     #define SENSE_TIMEOUT (10*HZ)	/* used below for REQUEST SENSE and TEST UNIT READY */
58     #define RESET_TIMEOUT (2*HZ)
59     #define ABORT_TIMEOUT (15*HZ)
60     #endif
61     
62     #define STATIC	/* expands to nothing, so STATIC functions are not file-local */
63     
64     /*
65      * Settle delays, in jiffies, slept after a reset.  These should *probably*
66      * be handled by the host itself; since it is allowed to sleep, it should.
67      */
68     #define BUS_RESET_SETTLE_TIME   (5*HZ)
69     #define HOST_RESET_SETTLE_TIME  (10*HZ)
70     
71     
72     static const char RCSid[] = "$Header: /mnt/ide/home/eric/CVSROOT/linux/drivers/scsi/scsi_error.c,v 1.10 1997/12/08 04:50:35 eric Exp $";
73     /* Forward declarations of the error-recovery helpers used below. */
74     STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt);
75     STATIC int scsi_request_sense(Scsi_Cmnd *);
76     STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout);
77     STATIC int scsi_try_to_abort_command(Scsi_Cmnd *, int);
78     STATIC int scsi_test_unit_ready(Scsi_Cmnd *);
79     STATIC int scsi_try_bus_device_reset(Scsi_Cmnd *, int timeout);
80     STATIC int scsi_try_bus_reset(Scsi_Cmnd *);
81     STATIC int scsi_try_host_reset(Scsi_Cmnd *);
82     STATIC int scsi_unit_is_ready(Scsi_Cmnd *);
83     STATIC void scsi_eh_action_done(Scsi_Cmnd *, int);
84     STATIC int scsi_eh_retry_command(Scsi_Cmnd *);
85     STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt);
86     STATIC void scsi_restart_operations(struct Scsi_Host *);
87     STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt);
88     
89     
90     /*
91      * Function:    scsi_add_timer()
92      *
93      * Purpose:     Start timeout timer for a single scsi command.
94      *
95      * Arguments:   SCset   - command that is about to start running.
96      *              timeout - amount of time, in jiffies, to allow this
97      *                      command to run before 'complete' is called.
98      *              complete - timeout function to call if timer isn't canceled.
99      *
100      * Returns:     Nothing
101      *
102      * Notes:       This should be turned into an inline function.
103      *
104      * More Notes:  Each scsi command has its own timer, and as it is added to
105      *              the queue, we set up the timer.  When the command completes,
106      *              we cancel the timer.  Pretty simple, really, especially
107      *              compared to the old way of handling this crap.
108      */
109     void scsi_add_timer(Scsi_Cmnd * SCset,
110     		    int timeout,
111     		    void (*complete) (Scsi_Cmnd *))
112     {
113     
114     	/*
115     	 * If the clock was already running for this command, then
116     	 * first delete the timer.  The timer handling code gets rather
117     	 * confused if we don't do this.
118     	 */
119     	if (SCset->eh_timeout.function != NULL) {	/* non-NULL means "armed", see scsi_delete_timer() */
120     		del_timer(&SCset->eh_timeout);
121     	}
122     	SCset->eh_timeout.data = (unsigned long) SCset;	/* the command itself is the timer argument */
123     	SCset->eh_timeout.expires = jiffies + timeout;
124     	SCset->eh_timeout.function = (void (*)(unsigned long)) complete;
125     	/* scsi_eh_done() sets done_late when it loses the race against the timer. */
126     	SCset->done_late = 0;
127     
128     	SCSI_LOG_ERROR_RECOVERY(5, printk("Adding timer for command %p at %d (%p)\n", SCset, timeout, complete));
129     
130     	add_timer(&SCset->eh_timeout);
131     
132     }
133     
134     /*
135      * Function:    scsi_delete_timer()
136      *
137      * Purpose:     Cancel the timeout timer attached to a command.
138      *
139      * Arguments:   SCset   - command whose eh_timeout timer is being canceled.
140      *
141      * Returns:     Whatever del_timer() reports: nonzero if the timer was
142      *              still pending and has been detached, 0 if it was not
143      *              pending (e.g. its handler has already started to run).
144      *
145      * Notes:       The timer's function and data are cleared in either case.
146      */
147     int scsi_delete_timer(Scsi_Cmnd * SCset)
148     {
149     	const int detached = del_timer(&SCset->eh_timeout);
150     
151     	SCSI_LOG_ERROR_RECOVERY(5,
152     		printk("Clearing timer for command %p %d\n", SCset, detached));
153     
154     	/*
155     	 * Mark the timer idle: scsi_add_timer() tests 'function' against NULL.
156     	 */
157     	SCset->eh_timeout.data = (unsigned long) NULL;
158     	SCset->eh_timeout.function = NULL;
159     	return detached; }
160     
161     /*
162      * Function:    scsi_times_out()
163      *
164      * Purpose:     Timeout function for normal scsi commands.
165      *
166      * Arguments:   SCpnt   - command that is timing out.
167      *
168      * Returns:     Nothing.
169      *
170      * Notes:       We do not need to lock this.  There is the potential for
171      *              a race only in that the normal completion handling might
172      *              run, but if the normal completion function determines
173      *              that the timer has already fired, then it mustn't do
174      *              anything.
175      */
176     void scsi_times_out(Scsi_Cmnd * SCpnt)
177     {
178     	/* 
179     	 * Notify the low-level code that this operation failed and we are
180     	 * repossessing the command.  
181     	 */
182     #ifdef ERIC_neverdef
183     	/*
184     	 * FIXME(eric)
185     	 * Allow the host adapter to push a queue ordering tag
186     	 * out to the bus to force the command in question to complete.
187     	 * If the host wants to do this, then we just restart the timer
188     	 * for the command.  Before we really do this, some real thought
189     	 * as to the optimum way to handle this should be done.  We *do*
190     	 * need to force ordering every so often to ensure that all requests
191     	 * do eventually complete, but I am not sure if this is the best way
192     	 * to actually go about it.
193     	 *
194     	 * Better yet, force a sync here, but don't block since we are in an
195     	 * interrupt.
196     	 */
197     	if (SCpnt->host->hostt->eh_ordered_queue_tag) {
198     		if ((*SCpnt->host->hostt->eh_ordered_queue_tag) (SCpnt)) {
199     			scsi_add_timer(SCpnt, SCpnt->internal_timeout,
200     				       scsi_times_out);
201     			return;
202     		}
203     	}
204     	/*
205     	 * FIXME(eric) - add a second special interface to handle this
206     	 * case.  Ideally that interface can also be used to request
207     	 * a queue... (NOTE(review): the original comment is cut off here)
208     	 */
209     	if (SCpnt->host->can_queue) {
210     		SCpnt->host->hostt->queuecommand(SCpnt, NULL);
211     	}
212     #endif
213     
214     	/* Set the serial_number_at_timeout to the current serial_number */
215     	SCpnt->serial_number_at_timeout = SCpnt->serial_number;
216     	/* From here on the command belongs to the error handler. */
217     	SCpnt->eh_state = FAILED;
218     	SCpnt->state = SCSI_STATE_TIMEOUT;
219     	SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
220     	/* Flag the host as recovering and count this command as failed. */
221     	SCpnt->host->in_recovery = 1;
222     	SCpnt->host->host_failed++;
223     
224     	SCSI_LOG_TIMEOUT(3, printk("Command timed out active=%d busy=%d failed=%d\n",
225     				   atomic_read(&SCpnt->host->host_active),
226     				   SCpnt->host->host_busy,
227     				   SCpnt->host->host_failed));
228     
229     	/*
230     	 * If the host is having troubles, then look to see if this was the last
231     	 * command that might have failed.  If so, wake up the error handler.
232     	 */
233     	if( SCpnt->host->eh_wait == NULL ) {	/* nobody to wake: fatal */
234     		panic("Error handler thread not present at %p %p %s %d", 
235     		      SCpnt, SCpnt->host, __FILE__, __LINE__);
236     	}
237     	if (SCpnt->host->host_busy == SCpnt->host->host_failed) {	/* every busy command has failed */
238     		up(SCpnt->host->eh_wait);
239     	}
240     }
241     
242     /*
243      * Function     scsi_block_when_processing_errors
244      *
245      * Purpose:     Hold the caller back while the device's host is busy with
246      *              error recovery, so no new commands get queued meanwhile.
247      *
248      * Arguments:   SDpnt - device the caller wants to use.
249      *
250      * Returns:     SDpnt->online, read after the wait: FALSE if the device
251      *              is offline, TRUE if it is OK to proceed.
252      *
253      * Notes:       Waits via SCSI_SLEEP() on the host's host_wait for as long
254      *              as the host's in_recovery flag is set.
255      */
256     int scsi_block_when_processing_errors(Scsi_Device * SDpnt)
257     {
258     	struct Scsi_Host *shost = SDpnt->host;	/* host that owns this device */
259     	SCSI_SLEEP(&shost->host_wait, shost->in_recovery);
260     
261     	/* The wait is over (or was never needed): report the device state. */
262     	SCSI_LOG_ERROR_RECOVERY(5, printk("Open returning %d\n", SDpnt->online));
263     	return SDpnt->online;
264     }
265     
266     /*
267      * Function:    scsi_eh_times_out()
268      *
269      * Purpose:     Timer callback for commands issued by the error handler.
270      *
271      * Arguments:   SCpnt   - recovery command whose timer has expired.
272      *
273      * Returns:     Nothing.
274      *
275      * Notes:       The error handler thread sleeps on the host's eh_action
276      *              semaphore while a recovery command is outstanding.  All we
277      *              do is mark the command as timed out and up that semaphore;
278      *              if no semaphore is installed we can only complain.
279      */
280     STATIC
281     void scsi_eh_times_out(Scsi_Cmnd * SCpnt)
282     {
283     	SCpnt->eh_state = SCSI_STATE_TIMEOUT;
284     	SCSI_LOG_ERROR_RECOVERY(5, printk("In scsi_eh_times_out %p\n", SCpnt));
285     	if (SCpnt->host->eh_action == NULL) {
286     		printk("Missing scsi error handler thread\n");
287     		return;
288     	}
289     	up(SCpnt->host->eh_action);
290     }
291     
292     
293     /*
294      * Function:    scsi_eh_done()
295      *
296      * Purpose:     Completion function for error handling.
297      *
298      * Arguments:   SCpnt   - command that has just completed.
299      *
300      * Returns:     Nothing.
301      *
302      * Notes:       During error handling, the kernel thread will be sleeping
303      *              waiting for some action to complete on the device.  Our only
304      *              job is to record that the action completed, and to wake up the
305      *              thread.
306      */
307     STATIC
308     void scsi_eh_done(Scsi_Cmnd * SCpnt)
309     {
310     	int     rtn;
311     
312     	/*
313     	 * If the timeout handler is already running, then just set the
314     	 * flag which says we finished late, and return.  We have no
315     	 * way of stopping the timeout handler from running, so we must
316     	 * always defer to it.
317     	 */
318     	rtn = del_timer(&SCpnt->eh_timeout);	/* 0: the timer was no longer pending */
319     	if (!rtn) {
320     		SCpnt->done_late = 1;
321     		return;
322     	}
323     	/* We beat the timer: record a normal completion. */
324     	SCpnt->request.rq_status = RQ_SCSI_DONE;
325     
326     	SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
327     	SCpnt->eh_state = SUCCESS;
328     
329     	SCSI_LOG_ERROR_RECOVERY(5, printk("In eh_done %p result:%x\n", SCpnt,
330     					  SCpnt->result));
331     	/* Wake the thread sleeping in scsi_send_eh_cmnd(), if one is waiting. */
332     	if (SCpnt->host->eh_action != NULL)
333     		up(SCpnt->host->eh_action);
334     }
335     
336     /*
337      * Function:    scsi_eh_action_done()
338      *
339      * Purpose:     Record the outcome of a recovery action and wake the waiter.
340      *
341      * Arguments:   SCpnt   - command the recovery action was performed for.
342      *              answer  - nonzero if the action succeeded, zero if not.
343      *
344      * Returns:     Nothing.
345      *
346      * Notes:       This callback is only used for abort and reset operations.
347      */
348     STATIC
349     void scsi_eh_action_done(Scsi_Cmnd * SCpnt, int answer)
350     {
351     	const int outcome = answer ? SUCCESS : FAILED;
352     
353     	SCpnt->request.rq_status = RQ_SCSI_DONE;
354     	SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
355     	SCpnt->eh_state = outcome;
356     	if (SCpnt->host->eh_action)
357     		up(SCpnt->host->eh_action);
358     }
359     
360     /*
361      * Function:  scsi_sense_valid()
362      *
363      * Purpose:     Tell whether SCpnt->sense_buffer already holds sense data,
364      *              e.g. because the host fetched it automatically.  Returns
365      *              TRUE when bits 4-6 of byte 0 are all set, FALSE otherwise.
366      */
367     int scsi_sense_valid(Scsi_Cmnd * SCpnt)
368     {
369     	/* Same test as ((byte0 & 0x70) >> 4) == 7, without the shift. */
370     	if ((SCpnt->sense_buffer[0] & 0x70) == 0x70)
371     		return TRUE;
372     	return FALSE;
373     }
374     
375     /*
376      * Function:  scsi_eh_retry_command()
377      *
378      * Purpose:     Retry the original command
379      *
380      * Returns:     SUCCESS - the retried command completed normally.
381      *              FAILED  - the retry timed out or did not complete normally.
382      * 
383      * Notes:       This function will *NOT* return until the command either
384      *              times out, or it completes.
385      */
386     STATIC int scsi_eh_retry_command(Scsi_Cmnd * SCpnt)
387     {
388     	/* Rebuild the request from the saved copy of the original command. */
389     	SCpnt->cmd_len = SCpnt->old_cmd_len;
390     	memcpy(SCpnt->cmnd, SCpnt->data_cmnd, sizeof(SCpnt->data_cmnd));
391     	SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
392     	SCpnt->use_sg = SCpnt->old_use_sg;
393     	SCpnt->request_buffer = SCpnt->buffer;
394     	SCpnt->request_bufflen = SCpnt->bufflen;
395     	SCpnt->underflow = SCpnt->old_underflow;
396     
397     	/* Reissue it with the command's own per-command timeout. */
398     	scsi_send_eh_cmnd(SCpnt, SCpnt->timeout_per_command);
399     	/*
400     	 * scsi_send_eh_cmnd() has recorded the outcome in eh_state.
401     	 */
402     	return SCpnt->eh_state;
403     }
404     
405     /*
406      * Function:  scsi_request_sense()
407      *
408      * Purpose:     Issue REQUEST SENSE to SCpnt's target and make sure that
409      *              SCpnt->sense_buffer holds the answer.
410      *
411      * Returns:     SCpnt->eh_state as left by scsi_send_eh_cmnd(), or FAILED
412      *              if the ISA DMA bounce buffer cannot be allocated; in that
413      *              case SCpnt is left completely untouched.
414      *
415      * Notes:       Blocks until the command completes or times out.  The
416      *              original command, result and transfer setup are restored
417      *              before returning.
418      */
419     STATIC int scsi_request_sense(Scsi_Cmnd * SCpnt)
420     {
421     	static unsigned char generic_sense[6] =
422     	{REQUEST_SENSE, 0, 0, 0, 255, 0};
423     	unsigned char scsi_result0[256], *scsi_result = NULL;
424     	int saved_result;
425     
426     	ASSERT_LOCK(&io_request_lock, 0);
427     
428     	/*
429     	 * Get the data buffer first: failing here must leave SCpnt untouched,
430     	 * otherwise the original CDB would stay overwritten by REQUEST SENSE.
431     	 */
432     	scsi_result = (!SCpnt->host->hostt->unchecked_isa_dma)
433     	    ? &scsi_result0[0] : kmalloc(512, GFP_ATOMIC | GFP_DMA);
434     	if (scsi_result == NULL) {
435     		printk("cannot allocate scsi_result in scsi_request_sense.\n");
436     		return FAILED;
437     	}
438     	memcpy((void *) SCpnt->cmnd, (void *) generic_sense,
439     	       sizeof(generic_sense));
440     	if (SCpnt->device->scsi_level <= SCSI_2)
441     		SCpnt->cmnd[1] = SCpnt->lun << 5;
442     	/*
443     	 * Zero both buffers (0 is not a valid sense code).  Some host adapters
444     	 * always request sense automatically, so request_buffer must not point
445     	 * at SCpnt->sense_buffer (DB).
446     	 */
447     	memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
448     	memset((void *) scsi_result, 0, 256);
449     
450     	saved_result = SCpnt->result;	/* the sense command must not clobber it */
451     	SCpnt->request_buffer = scsi_result;
452     	SCpnt->request_bufflen = 256;
453     	SCpnt->use_sg = 0;
454     	SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
455     	SCpnt->sc_data_direction = SCSI_DATA_READ;
456     	SCpnt->underflow = 0;
457     
458     	scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
459     
460     	/* Last chance to have valid sense data */
461     	if (!scsi_sense_valid(SCpnt))
462     		memcpy((void *) SCpnt->sense_buffer,
463     		       SCpnt->request_buffer,
464     		       sizeof(SCpnt->sense_buffer));
465     
466     	if (scsi_result != &scsi_result0[0])	/* only the kmalloc'ed buffer is freed */
467     		kfree(scsi_result);
468     
469     	/*
470     	 * When we eventually call scsi_finish, we really wish to complete
471     	 * the original request, so let's restore the original data. (DB)
472     	 */
473     	memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
474     	       sizeof(SCpnt->data_cmnd));
475     	SCpnt->result = saved_result;
476     	SCpnt->request_buffer = SCpnt->buffer;
477     	SCpnt->request_bufflen = SCpnt->bufflen;
478     	SCpnt->use_sg = SCpnt->old_use_sg;
479     	SCpnt->cmd_len = SCpnt->old_cmd_len;
480     	SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
481     	SCpnt->underflow = SCpnt->old_underflow;
482     
483     	/*
484     	 * Hey, we are done.  Let's look to see what happened.
485     	 */
486     	return SCpnt->eh_state;
487     }
488     
489     /*
490      * Function:  scsi_test_unit_ready()
491      *
492      * Purpose:     Send TEST UNIT READY to see whether the device is talking to
493      *              us; returns SCpnt->eh_state (SUCCESS or FAILED) afterwards.
494      */
495     STATIC int scsi_test_unit_ready(Scsi_Cmnd * SCpnt)
496     {
497     	static unsigned char tur_cdb[6] = {TEST_UNIT_READY, 0, 0, 0, 0, 0};
498     
499     	/* Build the CDB; for SCSI-2 and older the LUN goes into byte 1. */
500     	memcpy(SCpnt->cmnd, tur_cdb, sizeof(tur_cdb));
501     	if (SCpnt->device->scsi_level <= SCSI_2)
502     		SCpnt->cmnd[1] = SCpnt->lun << 5;
503     	SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
504     
505     	/* The command is set up without any data transfer. */
506     	SCpnt->sc_data_direction = SCSI_DATA_NONE;
507     	SCpnt->request_buffer = NULL;
508     	SCpnt->request_bufflen = 0;
509     	SCpnt->use_sg = 0;
510     	SCpnt->underflow = 0;
511     
512     	/*
513     	 * Zero the sense buffer.  The SCSI spec mandates that any
514     	 * untransferred sense data should be interpreted as being zero.
515     	 */
516     	memset(SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
517     
518     	/* Send it, with SENSE_TIMEOUT as the timeout, and wait. */
519     	scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
520     
521     	/*
522     	 * When we eventually call scsi_finish, we really wish to complete
523     	 * the original request, so let's restore the original data. (DB)
524     	 */
525     	memcpy(SCpnt->cmnd, SCpnt->data_cmnd,
526     	       sizeof(SCpnt->data_cmnd));
527     	SCpnt->cmd_len = SCpnt->old_cmd_len;
528     	SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
529     	SCpnt->request_buffer = SCpnt->buffer;
530     	SCpnt->request_bufflen = SCpnt->bufflen;
531     	SCpnt->use_sg = SCpnt->old_use_sg;
532     	SCpnt->underflow = SCpnt->old_underflow;
533     
534     	/*
535     	 * eh_state holds the verdict left behind by scsi_send_eh_cmnd().
536     	 */
537     	SCSI_LOG_ERROR_RECOVERY(3,
538     		printk("scsi_test_unit_ready: SCpnt %p eh_state %x\n",
539     		SCpnt, SCpnt->eh_state));
540     	return SCpnt->eh_state;
541     }
542     
543     /*
544      * Timer callback for scsi_sleep(): ups the semaphore handed over as the
545      * timer argument.  It touches nothing that would need the IO request
546      * lock, so the lock is not taken here.
547      */
548     STATIC
549     void scsi_sleep_done(struct semaphore *sem)
550     {
551     	if (!sem)
552     		return;
553     	up(sem);
554     }
555     
556     void scsi_sleep(int timeout)
557     {
558     	struct timer_list wakeup;
559     	DECLARE_MUTEX_LOCKED(expired);
560     
561     	/* One-shot timer: after 'timeout' jiffies it ups the semaphore below. */
562     	init_timer(&wakeup);
563     	wakeup.function = (void (*)(unsigned long)) scsi_sleep_done;
564     	wakeup.data = (unsigned long) &expired;
565     	wakeup.expires = jiffies + timeout;
566     	SCSI_LOG_ERROR_RECOVERY(5, printk("Sleeping for timer tics %d\n", timeout));
567     	add_timer(&wakeup);
568     
569     	/* Block until scsi_sleep_done() runs, then make sure the timer is gone. */
570     	down(&expired);
571     	del_timer(&wakeup);
572     }
573     
574     /*
575      * Function:  scsi_send_eh_cmnd
576      *
577      * Purpose:     Send a command out to a device as part of error recovery.
578      *
579      * Notes:       For queueing hosts, sleeps until the command completes or
580      *              'timeout' jiffies pass.  On return SCpnt->eh_state is
581      *              either SUCCESS or FAILED.
582      */
583     STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout)
584     {
585     	unsigned long flags;
586     	struct Scsi_Host *host;
587     
588     	ASSERT_LOCK(&io_request_lock, 0);
589     
590     	host = SCpnt->host;
591     
592           retry:
593     	/*
594     	 * We will use a queued command if possible, otherwise we will emulate the
595     	 * queuing and calling of completion function ourselves.
596     	 */
597     	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
598     
599     	if (host->can_queue) {
600     		DECLARE_MUTEX_LOCKED(sem);
601     
602     		SCpnt->eh_state = SCSI_STATE_QUEUED;
603     		/* scsi_eh_times_out() ups the semaphore if the command takes too long. */
604     		scsi_add_timer(SCpnt, timeout, scsi_eh_times_out);
605     
606     		/*
607     		 * Set up the semaphore so we wait for the command to complete.
608     		 */
609     		SCpnt->host->eh_action = &sem;
610     		SCpnt->request.rq_status = RQ_SCSI_BUSY;
611     
612     		spin_lock_irqsave(&io_request_lock, flags);
613     		host->hostt->queuecommand(SCpnt, scsi_eh_done);
614     		spin_unlock_irqrestore(&io_request_lock, flags);
615     		/* Sleep until scsi_eh_done() or scsi_eh_times_out() wakes us. */
616     		down(&sem);
617     
618     		SCpnt->host->eh_action = NULL;
619     
620     		/*
621     		 * See if timeout.  If so, tell the host to forget about it.
622     		 * In other words, we don't want a callback any more.
623     		 */
624     		if (SCpnt->eh_state == SCSI_STATE_TIMEOUT) {
625                             SCpnt->owner = SCSI_OWNER_LOWLEVEL;
626     
627     			/*
628     			 * As far as the low level driver is
629     			 * concerned, this command is still active, so
630     			 * we must give the low level driver a chance
631     			 * to abort it. (DB) 
632     			 *
633     			 * FIXME(eric) - we are not tracking whether we could
634     			 * abort a timed out command or not.  Not sure how
635     			 * we should treat them differently anyways.
636     			 */
637     			spin_lock_irqsave(&io_request_lock, flags);
638     			if (SCpnt->host->hostt->eh_abort_handler)
639     				SCpnt->host->hostt->eh_abort_handler(SCpnt);
640     			spin_unlock_irqrestore(&io_request_lock, flags);
641     			
642     			SCpnt->request.rq_status = RQ_SCSI_DONE;
643     			SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
644     			
645     			SCpnt->eh_state = FAILED;
646     		}
647     		SCSI_LOG_ERROR_RECOVERY(5, printk("send_eh_cmnd: %p eh_state:%x\n",
648     						SCpnt, SCpnt->eh_state));
649     	} else {
650     		int temp;
651     
652     		/*
653     		 * We damn well had better never use this code.  There is no timeout
654     		 * protection here, since we would end up waiting in the actual low
655     		 * level driver, we don't know how to wake it up.
656     		 */
657     		spin_lock_irqsave(&io_request_lock, flags);
658     		temp = host->hostt->command(SCpnt);
659     		spin_unlock_irqrestore(&io_request_lock, flags);
660     
661     		SCpnt->result = temp;
662     		/* Fall through to code below to examine status. */
663     		SCpnt->eh_state = SUCCESS;
664     	}
665     
666     	/*
667     	 * Now examine the actual status codes to see whether the command actually
668     	 * did complete normally.
669     	 */
670     	if (SCpnt->eh_state == SUCCESS) {
671     		int ret = scsi_eh_completed_normally(SCpnt);
672     		SCSI_LOG_ERROR_RECOVERY(3,
673     			printk("scsi_send_eh_cmnd: scsi_eh_completed_normally %x\n", ret));
674     		switch (ret) {
675     		case SUCCESS:
676     			SCpnt->eh_state = SUCCESS;
677     			break;
678     		case NEEDS_RETRY:
679     			goto retry;	/* NOTE(review): nothing visible here bounds the number of retries */
680     		case FAILED:
681     		default:
682     			SCpnt->eh_state = FAILED;
683     			break;
684     		}
685     	} else {
686     		SCpnt->eh_state = FAILED;	/* any state other than SUCCESS is reported as FAILED */
687     	}
688     }
689     
690     /*
691      * Function:  scsi_unit_is_ready()
692      *
693      * Purpose:     Called after TEST_UNIT_READY is run.  Returns 0 only when
694      *              sense data reports a key other than the three listed below.
695      */
696     STATIC int scsi_unit_is_ready(Scsi_Cmnd * SCpnt)
697     {
698     	if (!SCpnt->result)
699     		return 1;
700     	if (!(driver_byte(SCpnt->result) & DRIVER_SENSE) &&
701     	    !(status_byte(SCpnt->result) & CHECK_CONDITION))
702     		return 1;
703     	if ((SCpnt->sense_buffer[0] & 0x70) != 0x70)
704     		return 1;
705     	switch (SCpnt->sense_buffer[2] & 0xf) {
706     	case NOT_READY: case UNIT_ATTENTION: case ILLEGAL_REQUEST:
707     		return 1;
708     	}
709     	return 0;
710     }
711     
712     /*
713      * Function:    scsi_eh_finish_command
714      *
715      * Purpose:     Handle a command that we are finished with WRT error handling.
716      *
717      * Arguments:   SClist - head of the list that collects completed commands.
718      *              SCpnt  - command that is completing
719      *
720      * Notes:       We don't want to use the normal command completion while we
721      *              are still handling errors - it may cause other commands to be
722      *              queued, and that would disturb what we are doing.  So the
723      *              command is only put on a list here; completion for real
724      *              happens once we are ready to leave error handling.
725      */
726     STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt)
727     {
728     	Scsi_Cmnd *prev_head = *SClist;
729     
730     	/* Put these back so that the upper level can correctly free things. */
731     	SCpnt->underflow = SCpnt->old_underflow;
732     	SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
733     	SCpnt->use_sg = SCpnt->old_use_sg;
734     	/* Push the command onto the front of the list. */
735     	SCpnt->state = SCSI_STATE_BHQUEUE;
736     	SCpnt->bh_next = prev_head;
737     	*SClist = SCpnt;
738     }
739     
740     /*
741      * Function:  scsi_try_to_abort_command
742      *
743      * Purpose:     Ask host adapter to abort a running command.
744      *
745      * Arguments:   SCpnt   - command to abort.
746      *              timeout - currently unused.
747      *
748      * Returns:     FAILED          The host has no eh_abort_handler.
749      *              SUCCESS         The command's serial number is already 0,
750      *                              i.e. it completed before we got here.
751      *              otherwise       Whatever the host's eh_abort_handler
752      *                              returned.
753      *
754      * Notes:       No timer guards the handler call.  If the author of a
755      *              low-level driver wishes the abort to be timed, they must
756      *              provide this themselves.
757      *
758      * Notes:       SCpnt->eh_state is set to FAILED and never updated here.
759      */
760     STATIC int scsi_try_to_abort_command(Scsi_Cmnd * SCpnt, int timeout)
761     {
762     	unsigned long irq_flags;
763     	int verdict;
764     
765     	/* Until we come up with something better */
766     	SCpnt->eh_state = FAILED;
767     	if (!SCpnt->host->hostt->eh_abort_handler)
768     		return FAILED;
769     
770     	/* 
771     	 * A zero serial number means scsi_done was called just after the
772     	 * command timed out and before we had a chance to process it. (DB)
773     	 */
774     	if (!SCpnt->serial_number)
775     		return SUCCESS;
776     
777     	/* The driver's abort handler is called with io_request_lock held. */
778     	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
779     	spin_lock_irqsave(&io_request_lock, irq_flags);
780     	verdict = (*SCpnt->host->hostt->eh_abort_handler) (SCpnt);
781     	spin_unlock_irqrestore(&io_request_lock, irq_flags);
782     	return verdict;
783     }
784     
785     /*
786      * Function:  scsi_try_bus_device_reset
787      *
788      * Purpose:     Ask host adapter to perform a bus device reset for a given
789      *              device.
790      *
791      * Returns:     FAILED          No eh_device_reset_handler, or the handler
792      *                              returned something other than SUCCESS.
793      *              SUCCESS         The handler reported SUCCESS.
794      *
795      * Notes:       The 'timeout' argument is not used: nothing here times the
796      *              handler call.  If the operation is unreliable for a given
797      *              host, the host itself needs to put a timer on it.
798      */
799     STATIC int scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt, int timeout)
800     {
801     	int outcome;
802     	unsigned long irq_flags;
803     
804     	/* Assume failure until the driver's handler says otherwise. */
805     	SCpnt->eh_state = FAILED;
806     	if (!SCpnt->host->hostt->eh_device_reset_handler)
807     		return FAILED;
808     
809     	/* Hand the command to the low-level driver; call it under the lock. */
810     	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
811     	spin_lock_irqsave(&io_request_lock, irq_flags);
812     	outcome = (*SCpnt->host->hostt->eh_device_reset_handler) (SCpnt);
813     	spin_unlock_irqrestore(&io_request_lock, irq_flags);
814     
815     	/* Anything other than SUCCESS leaves eh_state at FAILED. */
816     	if (outcome == SUCCESS)
817     		SCpnt->eh_state = SUCCESS;
818     	return SCpnt->eh_state;
819     }
820     
821     /*
822      * Function:  scsi_try_bus_reset
823      *
824      * Purpose:     Ask host adapter to perform a bus reset for a host.
825      *
826      * Returns:     FAILED          Operation failed or not supported.
827      *              SUCCESS         Succeeded.
828      *
829      * Notes:       Sleeps BUS_RESET_SETTLE_TIME once the handler has run.
830      */
831     STATIC int scsi_try_bus_reset(Scsi_Cmnd * SCpnt)
832     {
833     	Scsi_Device *sdev;
834     	unsigned long irq_flags;
835     	int outcome;
836     
837     	/* This bookkeeping is done even if the host cannot reset the bus. */
838     	SCpnt->eh_state = FAILED;
839     	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
840     	SCpnt->serial_number_at_timeout = SCpnt->serial_number;
841     	if (!SCpnt->host->hostt->eh_bus_reset_handler)
842     		return FAILED;
843     
844     	spin_lock_irqsave(&io_request_lock, irq_flags);
845     	outcome = (*SCpnt->host->hostt->eh_bus_reset_handler) (SCpnt);
846     	spin_unlock_irqrestore(&io_request_lock, irq_flags);
847     	if (outcome == SUCCESS)
848     		SCpnt->eh_state = SUCCESS;
849     
850     	/* The settle delay is slept whether or not the reset succeeded. */
851     	scsi_sleep(BUS_RESET_SETTLE_TIME);
852     	if (SCpnt->eh_state != SUCCESS)
853     		return SCpnt->eh_state;
854     
855     	/*
856     	 * Successful reset: mark every device on the command's channel to
857     	 * expect a condition code of unit attention.
858     	 */
859     	for (sdev = SCpnt->host->host_queue; sdev != NULL; sdev = sdev->next) {
860     		if (sdev->channel != SCpnt->channel)
861     			continue;
862     		sdev->was_reset = 1;
863     		sdev->expecting_cc_ua = 1;
864     	}
865     	return SCpnt->eh_state;
866     }
867     
868     /*
869      * Function:  scsi_try_host_reset
870      *
871      * Purpose:     Ask host adapter to reset itself, and the bus.
872      *
873      * Returns:     FAILED          Operation failed or not supported.
874      *              SUCCESS         Succeeded.
875      *
876      * Notes:       The command is marked as owned by the low-level driver and
         *              its serial number is copied to serial_number_at_timeout
         *              before the handler is looked up, so both happen even when
         *              the host template has no eh_host_reset_handler.  When a
         *              handler exists, we always sleep for HOST_RESET_SETTLE_TIME
         *              after calling it, whether or not it reported SUCCESS.
         *              Unlike the bus reset, a successful host reset flags every
         *              device on the host, irrespective of channel.
877      */
878     STATIC int scsi_try_host_reset(Scsi_Cmnd * SCpnt)
879     {
880     	unsigned long flags;
881     	int rtn;
882     
883     	SCpnt->eh_state = FAILED;	/* Until we come up with something better */
884     	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
885     	SCpnt->serial_number_at_timeout = SCpnt->serial_number;
886     
887     	if (SCpnt->host->hostt->eh_host_reset_handler == NULL) {
888     		return FAILED;	/* not supported by this host; no settle sleep */
889     	}
        	/* The handler is called with io_request_lock held. */
890     	spin_lock_irqsave(&io_request_lock, flags);
891     	rtn = SCpnt->host->hostt->eh_host_reset_handler(SCpnt);
892     	spin_unlock_irqrestore(&io_request_lock, flags);
893     
894     	if (rtn == SUCCESS)
895     		SCpnt->eh_state = SUCCESS;
896     
897     	/*
898     	 * Sleep for the host settle time first (this happens regardless of the
899     	 * handler's result).  Then, if the reset succeeded, flag every device
        	 * on this host as having been reset and as expecting a check
        	 * condition / unit attention.
900     	 */
901     	scsi_sleep(HOST_RESET_SETTLE_TIME);
902     	if (SCpnt->eh_state == SUCCESS) {
903     		Scsi_Device *SDloop;
904     		for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next) {
905     			SDloop->was_reset = 1;
906     			SDloop->expecting_cc_ua = 1;
907     		}
908     	}
909     	return SCpnt->eh_state;
910     }
911     
912     /*
913      * Function:  scsi_decide_disposition
914      *
915      * Purpose:     Examine a command block that has come back from the low-level
916      *              and figure out what to do next.
917      *
918      * Returns:     SUCCESS         - pass on to upper level.
919      *              FAILED          - pass on to error handler thread.
920      *              NEEDS_RETRY     - command should be retried.
921      *              ADD_TO_MLQUEUE  - the status byte was QUEUE_FULL (too many
922      *                                commands sent to a tagged queueing device).
          *              A DID_RESET that we did not initiate returns whatever
          *              scsi_check_sense() returns.
923      *
924      * Notes:       This is *ONLY* called when we are examining the status
925      *              after sending out the actual data command.  Any commands
926      *              that are queued for error recovery (i.e. TEST_UNIT_READY)
927      *              do *NOT* come through here.
928      *
929      *              NOTE - When this routine returns FAILED, it means the error
930      *              handler thread is woken.  In cases where the error code
931      *              indicates an error that doesn't require the error handler
932      *              thread (i.e. we don't need to abort/reset), then this function
933      *              should return SUCCESS.
          *
          *              The checks run in a fixed order: device offline, host
          *              byte, message byte, status byte.  Paths that jump to
          *              maybe_retry increment SCpnt->retries and return SUCCESS
          *              rather than NEEDS_RETRY once the count reaches
          *              SCpnt->allowed.
934      */
935     int scsi_decide_disposition(Scsi_Cmnd * SCpnt)
936     {
937     	int rtn;
938     
939     	/*
940     	 * If the device is offline, then we clearly just pass the result back
941     	 * up to the top level.
942     	 */
943     	if (SCpnt->device->online == FALSE) {
944     		SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: device offline - report as SUCCESS\n"));
945     		return SUCCESS;
946     	}
947     	/*
948     	 * First check the host byte, to see if there is anything in there
949     	 * that would indicate what we need to do.
950     	 */
951     
952     	switch (host_byte(SCpnt->result)) {
953     	case DID_PASSTHROUGH:
954     		/*
955     		 * No matter what, pass this through to the upper layer.
956     		 * Nuke this special code so that it looks like we are saying
957     		 * DID_OK.
958     		 */
959     		SCpnt->result &= 0xff00ffff;	/* clears bits 16-23 (presumably the host byte) */
960     		return SUCCESS;
961     	case DID_OK:
962     		/*
963     		 * Looks good.  Drop through, and check the next byte.
964     		 */
965     		break;
966     	case DID_NO_CONNECT:
967     	case DID_BAD_TARGET:
968     	case DID_ABORT:
969     		/*
970     		 * Note - this means that we just report the status back to the
971     		 * top level driver, not that we actually think that it indicates
972     		 * success.
973     		 */
974     		return SUCCESS;
975     		/*
976     		 * When the low level driver returns DID_SOFT_ERROR,
977     		 * it is responsible for keeping an internal retry counter 
978     		 * in order to avoid endless loops (DB)
979     		 *
980     		 * Actually this is a bug in this function here.  We should
981     		 * be mindful of the maximum number of retries specified
982     		 * and not get stuck in a loop.
983     		 */
984     	case DID_SOFT_ERROR:
985     		goto maybe_retry;
986     
987     	case DID_BUS_BUSY:
988     	case DID_PARITY:
989     	case DID_ERROR:
990     		goto maybe_retry;
991     	case DID_TIME_OUT:
992     		/*
993     		   * When we scan the bus, we get timeout messages for
994     		   * these commands if there is no device available.
995     		   * Other hosts report DID_NO_CONNECT for the same thing.
996     		 */
997     		if ((SCpnt->cmnd[0] == TEST_UNIT_READY ||
998     		     SCpnt->cmnd[0] == INQUIRY)) {
999     			return SUCCESS;
1000     		} else {
1001     			return FAILED;
1002     		}
1003     	case DID_RESET:
1004     		/*
1005     		 * If we initiated the reset ourselves (IS_RESETTING is set), clear
1006     		 * the flag and treat the command as a candidate for retry.
1007     		 */
1008     		if (SCpnt->flags & IS_RESETTING) {
1009     			SCpnt->flags &= ~IS_RESETTING;
1010     			goto maybe_retry;
1011     		}
1012     		/*
1013     		 * Examine the sense data to figure out how to proceed from here.
1014     		 * If there is no sense data, we will be forced into the error
1015     		 * handler thread, where we get to examine the thing in a lot more
1016     		 * detail.
          		 *
          		 * NOTE(review): the result of scsi_check_sense() is returned as-is,
          		 * so a NEEDS_RETRY from it does not go through the retry counting
          		 * at maybe_retry, unlike the CHECK_CONDITION case further down.
          		 * Confirm whether that is intended.
1017     		 */
1018     		return scsi_check_sense(SCpnt);
1019     	default:
1020     		return FAILED;
1021     	}
1022     
1023     	/*
1024     	 * Next, check the message byte.
          	 * (Only reached when the host byte was DID_OK.)
1025     	 */
1026     	if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
1027     		return FAILED;
1028     	}
1029     	/*
1030     	 * Now, check the status byte to see if this indicates anything special.
1031     	 */
1032     	switch (status_byte(SCpnt->result)) {
1033     	case QUEUE_FULL:
1034     		/*
1035     		 * The case of trying to send too many commands to a tagged queueing
1036     		 * device.
1037     		 */
1038     		return ADD_TO_MLQUEUE;
1039     	case GOOD:
1040     	case COMMAND_TERMINATED:
1041     		return SUCCESS;
1042     	case CHECK_CONDITION:
          		/* Let the sense data decide; a retry request is subject to the retry limit. */
1043     		rtn = scsi_check_sense(SCpnt);
1044     		if (rtn == NEEDS_RETRY) {
1045     			goto maybe_retry;
1046     		}
1047     		return rtn;
1048     	case CONDITION_GOOD:
1049     	case INTERMEDIATE_GOOD:
1050     	case INTERMEDIATE_C_GOOD:
1051     		/*
1052     		 * Who knows?  FIXME(eric)
1053     		 */
1054     		return SUCCESS;
1055     	case BUSY:
1056     	case RESERVATION_CONFLICT:
1057     		goto maybe_retry;
1058     	default:
1059     		return FAILED;
1060     	}
1061     	return FAILED;	/* not reached: every case above returns or jumps */
1062     
1063           maybe_retry:
1064     
          	/*
          	 * Retry accounting: bump the retry count, and ask for a retry only
          	 * while the incremented count is still below SCpnt->allowed.
          	 */
1065     	if ((++SCpnt->retries) < SCpnt->allowed) {
1066     		return NEEDS_RETRY;
1067     	} else {
1068                     /*
1069                      * No more retries - report this one back to upper level.
1070                      */
1071     		return SUCCESS;
1072     	}
1073     }
1074     
1075     /*
1076      * Function:  scsi_eh_completed_normally
1077      *
1078      * Purpose:     Examine a command block that has come back from the low-level
1079      *              and figure out what to do next.
1080      *
1081      * Returns:     SUCCESS         - pass on to upper level.
1082      *              FAILED          - pass on to error handler thread.
1083      *              NEEDS_RETRY     - command should be retried.
1084      *              A CHECK_CONDITION status, or a DID_RESET that we did not
1085      *              initiate, returns whatever scsi_check_sense() returns.
1086      *
1087      * Notes:       This is *ONLY* called when we are examining the status
1088      *              of commands queued during error recovery.  The main
1089      *              difference here is that no retry counting is done
1090      *              (SCpnt->retries is not touched), and we are a lot more
1091      *              restrictive about what we consider acceptable: any host
          *              byte other than DID_OK or DID_RESET, and the BUSY,
          *              QUEUE_FULL and RESERVATION_CONFLICT statuses, are all
          *              reported as FAILED.
1092      */
1093     STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt)
1094     {
1095     	/*
1096     	 * First check the host byte, to see if there is anything in there
1097     	 * that would indicate what we need to do.
1098     	 */
1099     	if (host_byte(SCpnt->result) == DID_RESET) {
1100     		if (SCpnt->flags & IS_RESETTING) {
1101     			/*
1102     			 * OK, this is normal.  We don't know whether in fact the
1103     			 * command in question really needs to be rerun or not - 
1104     			 * if this was the original data command then the answer is yes,
1105     			 * otherwise we just flag it as success.
1106     			 */
1107     			SCpnt->flags &= ~IS_RESETTING;
1108     			return NEEDS_RETRY;
1109     		}
1110     		/*
1111     		 * Rats.  We are already in the error handler, so we now get to try
1112     		 * and figure out what to do next.  If the sense is valid, we have
1113     		 * a pretty good idea of what to do.  If not, we mark it as failed.
1114     		 */
1115     		return scsi_check_sense(SCpnt);
1116     	}
          	/* Any other non-OK host byte is treated as a failure. */
1117     	if (host_byte(SCpnt->result) != DID_OK) {
1118     		return FAILED;
1119     	}
1120     	/*
1121     	 * Next, check the message byte.
1122     	 */
1123     	if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
1124     		return FAILED;
1125     	}
1126     	/*
1127     	 * Now, check the status byte to see if this indicates anything special.
1128     	 */
1129     	switch (status_byte(SCpnt->result)) {
1130     	case GOOD:
1131     	case COMMAND_TERMINATED:
1132     		return SUCCESS;
1133     	case CHECK_CONDITION:
1134     		return scsi_check_sense(SCpnt);
1135     	case CONDITION_GOOD:
1136     	case INTERMEDIATE_GOOD:
1137     	case INTERMEDIATE_C_GOOD:
1138     		/*
1139     		 * Who knows?  FIXME(eric)
1140     		 */
1141     		return SUCCESS;
          	/* These three are listed explicitly but share the default outcome. */
1142     	case BUSY:
1143     	case QUEUE_FULL:
1144     	case RESERVATION_CONFLICT:
1145     	default:
1146     		return FAILED;
1147     	}
1148     	return FAILED;	/* not reached: every case above returns */
1149     }
1150     
1151     /*
1152      * Function:  scsi_check_sense
1153      *
1154      * Purpose:     Examine sense information - give suggestion as to what
1155      *              we should do with it.
          *
          * Returns:     FAILED          - scsi_sense_valid() rejected the sense data.
          *              NEEDS_RETRY     - the sense data suggests retrying.
          *              SUCCESS         - everything else; the result is simply
          *                                handed back to the caller.
          *
          * Notes:       May clear SCpnt->device->expecting_cc_ua as a side effect.
          *              No retry counting is done here; that is left to callers.
1156      */
1157     STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt)
1158     {
1159     	if (!scsi_sense_valid(SCpnt)) {
1160     		return FAILED;
1161     	}
          	/*
          	 * Any of the top three bits of sense byte 2 being set is reported as
          	 * SUCCESS without looking further.  Presumably these are the
          	 * filemark / end-of-medium / incorrect-length indicators of
          	 * fixed-format sense data - TODO confirm against the SCSI spec.
          	 */
1162     	if (SCpnt->sense_buffer[2] & 0xe0)
1163     		return SUCCESS;
1164     
          	/* The low nibble of byte 2 is matched against the sense key constants. */
1165     	switch (SCpnt->sense_buffer[2] & 0xf) {
1166     	case NO_SENSE:
1167     		return SUCCESS;
1168     	case RECOVERED_ERROR:
1169     		return /* SOFT_ERROR */ SUCCESS;
1170     
1171     	case ABORTED_COMMAND:
1172     		return NEEDS_RETRY;
1173     	case NOT_READY:
1174     	case UNIT_ATTENTION:
1175     		/*
1176     		 * If we are expecting a CC/UA because of a bus reset that we
1177     		 * performed, treat this just as a retry.  Otherwise this is
1178     		 * information that we should pass up to the upper-level driver
1179     		 * so that we can deal with it there.
          		 *
          		 * NOTE(review): NOT_READY shares this path, so a NOT_READY sense
          		 * also consumes the expecting_cc_ua flag - confirm this is wanted.
1180     		 */
1181     		if (SCpnt->device->expecting_cc_ua) {
1182     			SCpnt->device->expecting_cc_ua = 0;
1183     			return NEEDS_RETRY;
1184     		}
1185     		/*
1186     		 * If the device is in the process of becoming ready, we 
1187     		 * should retry.
          		 * (Sense bytes 12 and 13 equal to 0x04 and 0x01; presumably the
          		 * additional sense code and qualifier - TODO confirm.)
1188     		 */
1189     		if ((SCpnt->sense_buffer[12] == 0x04) &&
1190     			(SCpnt->sense_buffer[13] == 0x01)) {
1191     			return NEEDS_RETRY;
1192     		}
1193     		return SUCCESS;
1194     
1195     		/* these three are not supported */
1196     	case COPY_ABORTED:
1197     	case VOLUME_OVERFLOW:
1198     	case MISCOMPARE:
1199     		return SUCCESS;
1200     
1201     	case MEDIUM_ERROR:
1202     		return NEEDS_RETRY;
1203     
          	/* Listed explicitly for documentation; same outcome as the default. */
1204     	case ILLEGAL_REQUEST:
1205     	case BLANK_CHECK:
1206     	case DATA_PROTECT:
1207     	case HARDWARE_ERROR:
1208     	default:
1209     		return SUCCESS;
1210     	}
1211     }
1212     
1213     
1214     /*
1215      * Function:  scsi_restart_operations
1216      *
1217      * Purpose:     Restart IO operations to the specified host.
1218      *
1219      * Arguments:   host  - host that we are restarting
1220      *
1221      * Lock status: Assumed that locks are not held upon entry.
          *              io_request_lock is taken internally around the walk
          *              over the device request queues.
1222      *
1223      * Returns:     Nothing
1224      *
1225      * Notes:       When we entered the error handler, we blocked all further
1226      *              I/O to this device.  We need to 'reverse' this process.
          *
          *              First wakes anything sleeping on host->host_wait, then
          *              calls request_fn for each device queue on the host, in
          *              host_queue order.  The walk stops at the first device
          *              for which the host is full or blocked, or for which the
          *              device itself is blocked.
1227      */
1228     STATIC void scsi_restart_operations(struct Scsi_Host *host)
1229     {
1230     	Scsi_Device *SDpnt;
1231     	unsigned long flags;
1232     
1233     	ASSERT_LOCK(&io_request_lock, 0);
1234     
1235     	/*
1236     	 * Next free up anything directly waiting upon the host.  This will be
1237     	 * requests for character device operations, and also for ioctls to queued
1238     	 * block devices.
1239     	 */
1240     	SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: Waking up host to restart\n"));
1241     
1242     	wake_up(&host->host_wait);
1243     
1244     	/*
1245     	 * Finally we need to re-initiate requests that may be pending.  We will
1246     	 * have had everything blocked while error handling is taking place, and
1247     	 * now that error recovery is done, we will need to ensure that these
1248     	 * requests are started.
1249     	 */
1250     	spin_lock_irqsave(&io_request_lock, flags);
1251     	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1252     		request_queue_t *q;
          		/*
          		 * The first three tests are host-wide, so stopping the walk makes
          		 * sense for them.
          		 *
          		 * NOTE(review): device_blocked is per-device, yet it also ends the
          		 * walk, so devices later on host_queue are not restarted in this
          		 * pass.  Confirm whether skipping just this device was intended.
          		 */
1253     		if ((host->can_queue > 0 && (host->host_busy >= host->can_queue))
1254     		    || (host->host_blocked)
1255     		    || (host->host_self_blocked)
1256     		    || (SDpnt->device_blocked)) {
1257     			break;
1258     		}
          		/* Kick this device's request queue; io_request_lock is held here. */
1259     		q = &SDpnt->request_queue;
1260     		q->request_fn(q);
1261     	}
1262     	spin_unlock_irqrestore(&io_request_lock, flags);
1263     }
1264     
1265     /*
1266      * Function:  scsi_unjam_host
1267      *
1268      * Purpose:     Attempt to fix a host which has a command that failed for
1269      *              some reason.
1270      *
1271      * Arguments:   host    - host that needs unjamming.
1272      * 
1273      * Returns:     Nothing
1274      *
1275      * Notes:       When we come in here, we *know* that all commands on the
1276      *              bus have either completed, failed or timed out.  We also
1277      *              know that no further commands are being sent to the host,
1278      *              so things are relatively quiet and we have freedom to
1279      *              fiddle with things as we wish.
1280      *
1281      * Additional note:  This is only the *default* implementation.  It is possible
1282      *              for individual drivers to supply their own version of this
1283      *              function, and if the maintainer wishes to do this, it is
1284      *              strongly suggested that this function be taken as a template
1285      *              and modified.  This function was designed to correctly handle
1286      *              problems for about 95% of the different cases out there, and
1287      *              it should always provide at least a reasonable amount of error
1288      *              recovery.
1289      *
1290      * Note3:       Any command marked 'FAILED' or 'TIMEOUT' must eventually
1291      *              have scsi_finish_command() called for it.  We do all of
1292      *              the retry stuff here, so when we restart the host after we
1293      *              return it should have an empty queue.
1294      */
1295     STATIC int scsi_unjam_host(struct Scsi_Host *host)
1296     {
1297     	int devices_failed;
1298     	int numfailed;
1299     	int ourrtn;
1300     	int rtn = FALSE;
1301     	int result;
1302     	Scsi_Cmnd *SCloop;
1303     	Scsi_Cmnd *SCpnt;
1304     	Scsi_Device *SDpnt;
1305     	Scsi_Device *SDloop;
1306     	Scsi_Cmnd *SCdone;
1307     	int timed_out;
1308     
1309     	ASSERT_LOCK(&io_request_lock, 0);
1310     
1311     	SCdone = NULL;
1312     
1313     	/*
1314     	 * First, protect against any sort of race condition.  If any of the outstanding
1315     	 * commands are in states that indicate that we are not yet blocked (i.e. we are
1316     	 * not in a quiet state) then we got woken up in error.  If we ever end up here,
1317     	 * we need to re-examine some of the assumptions.
1318     	 */
1319     	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1320     		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1321     			if (SCpnt->state == SCSI_STATE_FAILED
1322     			    || SCpnt->state == SCSI_STATE_TIMEOUT
1323     			    || SCpnt->state == SCSI_STATE_INITIALIZING
1324     			    || SCpnt->state == SCSI_STATE_UNUSED) {
1325     				continue;
1326     			}
1327     			/*
1328     			 * Rats.  Something is still floating around out there.  This could
1329     			 * be the result of the fact that the upper level drivers are still frobbing
1330     			 * commands that might have succeeded.  There are two outcomes.  One is that
1331     			 * the command block will eventually be freed, and the other one is that
1332     			 * the command will be queued and will be finished along the way.
1333     			 */
1334     			SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler prematurely woken - commands still active (%p %x %d)\n", SCpnt, SCpnt->state, SCpnt->target));
1335     
1336     /*
1337      *        panic("SCSI Error handler woken too early\n");
1338      *
1339      * This is no longer a problem, since now the code cares only about
1340      * SCSI_STATE_TIMEOUT and SCSI_STATE_FAILED.
1341      * Other states are useful only to release active commands when devices are
1342      * set offline. If (host->host_active == host->host_busy) we can safely assume
1343      * that there are no commands in state other than TIMEOUT or FAILED. (DB)
1344      *
1345      * FIXME:
1346      * It is not easy to release correctly commands according to their state when 
1347      * devices are set offline, when the state is neither TIMEOUT nor FAILED.
1348      * When a device is set offline, we can have some command with
1349      * rq_status=RQ_SCSY_BUSY, owner=SCSI_STATE_HIGHLEVEL, 
1350      * state=SCSI_STATE_INITIALIZING and the driver module cannot be released.
1351      * (DB, 17 May 1998)
1352      */
1353     		}
1354     	}
1355     
1356     	/*
1357     	 * Next, see if we need to request sense information.  if so,
1358     	 * then get it now, so we have a better idea of what to do.
1359     	 * FIXME(eric) this has the unfortunate side effect that if a host
1360     	 * adapter does not automatically request sense information, that we end
1361     	 * up shutting it down before we request it.  All hosts should be doing this
1362     	 * anyways, so for now all I have to say is tough noogies if you end up in here.
1363     	 * On second thought, this is probably a good idea.  We *really* want to give
1364     	 * authors an incentive to automatically request this.
1365     	 */
1366     	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we need to request sense\n"));
1367     
1368     	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1369     		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1370     			if (SCpnt->state != SCSI_STATE_FAILED || scsi_sense_valid(SCpnt)) {
1371     				continue;
1372     			}
1373     			SCSI_LOG_ERROR_RECOVERY(2, printk("scsi_unjam_host: Requesting sense for %d\n",
1374     							  SCpnt->target));
1375     			rtn = scsi_request_sense(SCpnt);
1376     			if (rtn != SUCCESS) {
1377     				continue;
1378     			}
1379     			SCSI_LOG_ERROR_RECOVERY(3, printk("Sense requested for %p - result %x\n",
1380     						  SCpnt, SCpnt->result));
1381     			SCSI_LOG_ERROR_RECOVERY(3, print_sense("bh", SCpnt));
1382     
1383     			result = scsi_decide_disposition(SCpnt);
1384     
1385     			/*
1386     			 * If the result was normal, then just pass it along to the
1387     			 * upper level.
1388     			 */
1389     			if (result == SUCCESS) {
1390     				SCpnt->host->host_failed--;
1391     				scsi_eh_finish_command(&SCdone, SCpnt);
1392     			}
1393     			if (result != NEEDS_RETRY) {
1394     				continue;
1395     			}
1396     			/* 
1397     			 * We only come in here if we want to retry a
1398     			 * command.  The test to see whether the command
1399     			 * should be retried should be keeping track of the
1400     			 * number of tries, so we don't end up looping, of
1401     			 * course.  
1402     			 */
1403     			SCpnt->state = NEEDS_RETRY;
1404     			rtn = scsi_eh_retry_command(SCpnt);
1405     			if (rtn != SUCCESS) {
1406     				continue;
1407     			}
1408     			/*
1409     			 * We eventually hand this one back to the top level.
1410     			 */
1411     			SCpnt->host->host_failed--;
1412     			scsi_eh_finish_command(&SCdone, SCpnt);
1413     		}
1414     	}
1415     
1416     	/*
1417     	 * Go through the list of commands and figure out where we stand and how bad things
1418     	 * really are.
1419     	 */
1420     	numfailed = 0;
1421     	timed_out = 0;
1422     	devices_failed = 0;
1423     	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1424     		unsigned int device_error = 0;
1425     
1426     		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1427     			if (SCpnt->state == SCSI_STATE_FAILED) {
1428     				SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d failed\n",
1429     							 SCpnt->target));
1430     				numfailed++;
1431     				device_error++;
1432     			}
1433     			if (SCpnt->state == SCSI_STATE_TIMEOUT) {
1434     				SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d timedout\n",
1435     							 SCpnt->target));
1436     				timed_out++;
1437     				device_error++;
1438     			}
1439     		}
1440     		if (device_error > 0) {
1441     			devices_failed++;
1442     		}
1443     	}
1444     
1445     	SCSI_LOG_ERROR_RECOVERY(2, printk("Total of %d+%d commands on %d devices require eh work\n",
1446     				  numfailed, timed_out, devices_failed));
1447     
1448     	if (host->host_failed == 0) {
1449     		ourrtn = TRUE;
1450     		goto leave;
1451     	}
1452     	/*
1453     	 * Next, try and see whether or not it makes sense to try and abort
1454     	 * the running command.  This only works out to be the case if we have
1455     	 * one command that has timed out.  If the command simply failed, it
1456     	 * makes no sense to try and abort the command, since as far as the
1457     	 * host adapter is concerned, it isn't running.
1458     	 */
1459     
1460     	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try abort\n"));
1461     
1462     	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1463     		for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1464     			if (SCloop->state != SCSI_STATE_TIMEOUT) {
1465     				continue;
1466     			}
1467     			rtn = scsi_try_to_abort_command(SCloop, ABORT_TIMEOUT);
1468     			if (rtn == SUCCESS) {
1469     				rtn = scsi_test_unit_ready(SCloop);
1470     
1471     				if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1472     					rtn = scsi_eh_retry_command(SCloop);
1473     
1474     					if (rtn == SUCCESS) {
1475     						SCloop->host->host_failed--;
1476     						scsi_eh_finish_command(&SCdone, SCloop);
1477     					}
1478     				}
1479     			}
1480     		}
1481     	}
1482     
1483     	/*
1484     	 * If we have corrected all of the problems, then we are done.
1485     	 */
1486     	if (host->host_failed == 0) {
1487     		ourrtn = TRUE;
1488     		goto leave;
1489     	}
1490     	/*
1491     	 * Either the abort wasn't appropriate, or it didn't succeed.
1492     	 * Now try a bus device reset.  Still, look to see whether we have
1493     	 * multiple devices that are jammed or not - if we have multiple devices,
1494     	 * it makes no sense to try BUS_DEVICE_RESET - we really would need
1495     	 * to try a BUS_RESET instead.
1496     	 *
1497     	 * Does this make sense - should we try BDR on each device individually?
1498     	 * Yes, definitely.
1499     	 */
1500     	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));
1501     
1502     	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1503     		for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1504     			if (SCloop->state == SCSI_STATE_FAILED
1505     			    || SCloop->state == SCSI_STATE_TIMEOUT) {
1506     				break;
1507     			}
1508     		}
1509     
1510     		if (SCloop == NULL) {
1511     			continue;
1512     		}
1513     		/*
1514     		 * OK, we have a device that is having problems.  Try and send
1515     		 * a bus device reset to it.
1516     		 *
1517     		 * FIXME(eric) - make sure we handle the case where multiple
1518     		 * commands to the same device have failed. They all must
1519     		 * get properly restarted.
1520     		 */
1521     		rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);
1522     
1523     		if (rtn == SUCCESS) {
1524     			rtn = scsi_test_unit_ready(SCloop);
1525     
1526     			if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1527     				rtn = scsi_eh_retry_command(SCloop);
1528     
1529     				if (rtn == SUCCESS) {
1530     					SCloop->host->host_failed--;
1531     					scsi_eh_finish_command(&SCdone, SCloop);
1532     				}
1533     			}
1534     		}
1535     	}
1536     
1537     	if (host->host_failed == 0) {
1538     		ourrtn = TRUE;
1539     		goto leave;
1540     	}
1541     	/*
1542     	 * If we ended up here, we have serious problems.  The only thing left
1543     	 * to try is a full bus reset.  If someone has grabbed the bus and isn't
1544     	 * letting go, then perhaps this will help.
1545     	 */
1546     	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n"));
1547     
1548     	/* 
1549     	 * We really want to loop over the various channels, and do this on
1550     	 * a channel by channel basis.  We should also check to see if any
1551     	 * of the failed commands are on soft_reset devices, and if so, skip
1552     	 * the reset.  
1553     	 */
1554     	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1555     	      next_device:
1556     		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1557     			if (SCpnt->state != SCSI_STATE_FAILED
1558     			    && SCpnt->state != SCSI_STATE_TIMEOUT) {
1559     				continue;
1560     			}
1561     			/*
1562     			 * We have a failed command.  Make sure there are no other failed
1563     			 * commands on the same channel that are timed out and implement a
1564     			 * soft reset.
1565     			 */
1566     			for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1567     				for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1568     					if (SCloop->channel != SCpnt->channel) {
1569     						continue;
1570     					}
1571     					if (SCloop->state != SCSI_STATE_FAILED
1572     					    && SCloop->state != SCSI_STATE_TIMEOUT) {
1573     						continue;
1574     					}
1575     					if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) {
1576     						/* 
1577     						 * If this device uses the soft reset option, and this
1578     						 * is one of the devices acting up, then our only
1579     						 * option is to wait a bit, since the command is
1580     						 * supposedly still running.  
1581     						 *
1582     						 * FIXME(eric) - right now we will just end up falling
1583     						 * through to the 'take device offline' case.
1584     						 *
1585     						 * FIXME(eric) - It is possible that the command completed
1586     						 * *after* the error recovery procedure started, and if this
1587     						 * is the case, we are worrying about nothing here.
1588     						 */
1589     
1590     						scsi_sleep(1 * HZ);
1591     						goto next_device;
1592     					}
1593     				}
1594     			}
1595     
1596     			/*
1597     			 * We now know that we are able to perform a reset for the
1598     			 * bus that SCpnt points to.  There are no soft-reset devices
1599     			 * with outstanding timed out commands.
1600     			 */
1601     			rtn = scsi_try_bus_reset(SCpnt);
1602     			if (rtn == SUCCESS) {
1603     				for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1604     					for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1605     						if (SCloop->channel != SCpnt->channel) {
1606     							continue;
1607     						}
1608     						if (SCloop->state != SCSI_STATE_FAILED
1609     						    && SCloop->state != SCSI_STATE_TIMEOUT) {
1610     							continue;
1611     						}
1612     						rtn = scsi_test_unit_ready(SCloop);
1613     
1614     						if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1615     							rtn = scsi_eh_retry_command(SCloop);
1616     
1617     							if (rtn == SUCCESS) {
1618     								SCpnt->host->host_failed--;
1619     								scsi_eh_finish_command(&SCdone, SCloop);
1620     							}
1621     						}
1622     						/*
1623     						 * If the bus reset worked, but we are still unable to
1624     						 * talk to the device, take it offline.
1625     						 * FIXME(eric) - is this really the correct thing to do?
1626     						 */
1627     						if (rtn != SUCCESS) {
1628     							printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after bus reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1629     
1630     							SDloop->online = FALSE;
1631     							SDloop->host->host_failed--;
1632     							scsi_eh_finish_command(&SCdone, SCloop);
1633     						}
1634     					}
1635     				}
1636     			}
1637     		}
1638     	}
1639     
1640     	if (host->host_failed == 0) {
1641     		ourrtn = TRUE;
1642     		goto leave;
1643     	}
1644     	/*
1645     	 * If we ended up here, we have serious problems.  The only thing left
1646     	 * to try is a full host reset - perhaps the firmware on the device
1647     	 * crashed, or something like that.
1648     	 *
1649     	 * It is assumed that a successful host reset will cause *all* information
1650     	 * about the command to be flushed from both the host adapter *and* the
1651     	 * device.
1652     	 *
1653     	 * FIXME(eric) - it isn't clear that devices that implement the soft reset
1654     	 * option can ever be cleared except via cycling the power.  The problem is
1655     	 * that sending the host reset command will cause the host to forget
1656     	 * about the pending command, but the device won't forget.  For now, we
1657     	 * skip the host reset option if any of the failed devices are configured
1658     	 * to use the soft reset option.
1659     	 */
1660     	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1661     	      next_device2:
1662     		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1663     			if (SCpnt->state != SCSI_STATE_FAILED
1664     			    && SCpnt->state != SCSI_STATE_TIMEOUT) {
1665     				continue;
1666     			}
1667     			if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) {
1668     				/* 
1669     				 * If this device uses the soft reset option, and this
1670     				 * is one of the devices acting up, then our only
1671     				 * option is to wait a bit, since the command is
1672     				 * supposedly still running.  
1673     				 *
1674     				 * FIXME(eric) - right now we will just end up falling
1675     				 * through to the 'take device offline' case.
1676     				 */
1677     				SCSI_LOG_ERROR_RECOVERY(3,
1678     							printk("scsi_unjam_host: Unable to try hard host reset\n"));
1679     
1680     				/*
1681     				 * Due to the spinlock, we will never get out of this
1682     				 * loop without a proper wait. (DB)
1683     				 */
1684     				scsi_sleep(1 * HZ);
1685     
1686     				goto next_device2;
1687     			}
1688     			SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));
1689     
1690     			/*
1691     			 * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
1692     			 */
1693     			rtn = scsi_try_host_reset(SCpnt);
1694     			if (rtn == SUCCESS) {
1695     				/*
1696     				 * FIXME(eric) we assume that all commands are flushed from the
1697     				 * controller.  We should get a DID_RESET for all of the commands
1698     				 * that were pending.  We should ignore these so that we can
1699     				 * guarantee that we are in a consistent state.
1700     				 *
1701     				 * I believe this to be the case right now, but this needs to be
1702     				 * tested.
1703     				 */
1704     				for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1705     					for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1706     						if (SCloop->state != SCSI_STATE_FAILED
1707     						    && SCloop->state != SCSI_STATE_TIMEOUT) {
1708     							continue;
1709     						}
1710     						rtn = scsi_test_unit_ready(SCloop);
1711     
1712     						if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1713     							rtn = scsi_eh_retry_command(SCloop);
1714     
1715     							if (rtn == SUCCESS) {
1716     								SCpnt->host->host_failed--;
1717     								scsi_eh_finish_command(&SCdone, SCloop);
1718     							}
1719     						}
1720     						if (rtn != SUCCESS) {
1721     							printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after host reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1722     							SDloop->online = FALSE;
1723     							SDloop->host->host_failed--;
1724     							scsi_eh_finish_command(&SCdone, SCloop);
1725     						}
1726     					}
1727     				}
1728     			}
1729     		}
1730     	}
1731     
1732     	/*
1733     	 * If we solved all of the problems, then let's rev up the engines again.
1734     	 */
1735     	if (host->host_failed == 0) {
1736     		ourrtn = TRUE;
1737     		goto leave;
1738     	}
1739     	/*
1740     	 * If the HOST RESET failed, then for now we assume that the entire host
1741     	 * adapter is too hosed to be of any use.  For our purposes, however, it is
1742     	 * easier to simply take the devices offline that correspond to commands
1743     	 * that failed.
1744     	 */
1745     	SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offline\n"));
1746     
1747     	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1748     		for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1749     			if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) {
1750     				SDloop = SCloop->device;
1751     				if (SDloop->online == TRUE) {
1752     					printk(KERN_INFO "scsi: device set offline - command error recover failed: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1753     					SDloop->online = FALSE;
1754     				}
1755     
1756     				/*
1757     				 * This should pass the failure up to the top level driver, and
1758     				 * it will have to try and do something intelligent with it.
1759     				 */
1760     				SCloop->host->host_failed--;
1761     
1762     				if (SCloop->state == SCSI_STATE_TIMEOUT) {
1763     					SCloop->result |= (DRIVER_TIMEOUT << 24);
1764     				}
1765     				SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %x\n",
1766     				    SDloop->id, SCloop->result));
1767     
1768     				scsi_eh_finish_command(&SCdone, SCloop);
1769     			}
1770     		}
1771     	}
1772     
1773     	if (host->host_failed != 0) {
1774     		panic("scsi_unjam_host: Miscount of number of failed commands.\n");
1775     	}
1776     	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returning\n"));
1777     
1778     	ourrtn = FALSE;
1779     
1780           leave:
1781     
1782     	/*
1783     	 * We should have a list of commands that we 'finished' during the course of
1784     	 * error recovery.  This should be the same as the list of commands that timed out
1785     	 * or failed.  We are currently holding these things in a linked list - we didn't
1786     	 * put them in the bottom half queue because we wanted to keep things quiet while
1787     	 * we were working on recovery, and passing them up to the top level could easily
1788     	 * cause the top level to try and queue something else again.
1789     	 *
1790     	 * Start by marking that the host is no longer in error recovery.
1791     	 */
1792     	host->in_recovery = 0;
1793     
1794     	/*
1795     	 * Take the list of commands, and stick them in the bottom half queue.
1796     	 * The current implementation of scsi_done will do this for us - if need
1797     	 * be we can create a special version of this function to do the
1798     	 * same job for us.
1799     	 */
1800     	for (SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) {
1801     		SCdone = SCpnt->bh_next;
1802     		SCpnt->bh_next = NULL;
1803                     /*
1804                      * Oh, this is a vile hack.  scsi_done() expects a timer
1805                      * to be running on the command.  If there isn't, it assumes
1806                      * that the command has actually timed out, and a timer
1807                      * handler is running.  That may well be how we got into
1808                      * this fix, but right now things are stable.  We add
1809                      * a timer back again so that we can report completion.
1810                      * scsi_done() will immediately remove said timer from
1811                      * the command, and then process it.
1812                      */
1813     		scsi_add_timer(SCpnt, 100, scsi_eh_times_out);
1814     		scsi_done(SCpnt);
1815     	}
1816     
1817     	return (ourrtn);
1818     }
1819     
1820     
1821     /*
1822      * Function:  scsi_error_handler
1823      *
1824      * Purpose:     Handle errors/timeouts of scsi commands, try and clean up
1825      *              and unjam the bus, and restart things.
1826      *
1827      * Arguments:   data    - the struct Scsi_Host for which we are running,
          *                        passed as an opaque pointer.
1828      *
1829      * Returns:     Nothing.  The main loop only ends when the host was loaded
          *              as a module and a signal is pending after the sleep; the
          *              function then clears its per-host state and returns.
1830      *
1831      * Notes:       This is always run in the context of a kernel thread.  The
1832      *              idea is that we start this thing up when the kernel starts
1833      *              up (one per host that we detect), and it immediately goes to
1834      *              sleep and waits for some event (i.e. failure).  When this
1835      *              takes place, we have the job of trying to unjam the bus
1836      *              and restarting things.
          *
          *              Each pass through the loop sleeps on a local semaphore
          *              (published through host->eh_wait), runs either the host
          *              template's eh_strategy_handler or scsi_unjam_host(), and
          *              then calls scsi_restart_operations().
1837      *
1838      */
1839     void scsi_error_handler(void *data)
1840     {
1841     	struct Scsi_Host *host = (struct Scsi_Host *) data;
         	/*
         	 * NOTE(review): rtn receives the result of the recovery call in
         	 * the loop below but is never read in this function.
         	 */
1842     	int rtn;
         	/*
         	 * Semaphore this thread sleeps on; its address is stored in
         	 * host->eh_wait below.
         	 */
1843     	DECLARE_MUTEX_LOCKED(sem);
1844     
1845             /*
1846              * We only listen to signals if the HA was loaded as a module.
1847              * If the HA was compiled into the kernel, then we don't listen
1848              * to any signals.
1849              */
1850             if( host->loaded_as_module ) {
1851     	siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
1852     	} else {
1853     	siginitsetinv(&current->blocked, 0);
1854             }
1855     
1856     	lock_kernel();
1857     
1858     	/*
1859     	 *    Flush resources
1860     	 */
1861     
1862     	daemonize();
1863     
1864     	/*
1865     	 * Set the name of this process.
         	 *
         	 * NOTE(review): this is an unbounded sprintf into current->comm;
         	 * confirm the buffer is large enough for "scsi_eh_" plus the
         	 * largest possible host_no.
1866     	 */
1867     
1868     	sprintf(current->comm, "scsi_eh_%d", host->host_no);
1869     
         	/*
         	 * Publish the sleep semaphore and this task in the host structure.
         	 */
1870     	host->eh_wait = &sem;
1871     	host->ehandler = current;
1872     
1873     	unlock_kernel();
1874     
1875     	/*
1876     	 * Wake up the thread that created us.
         	 *
         	 * NOTE(review): eh_notify is dereferenced here without the NULL
         	 * check that the exit path at the bottom of this function uses;
         	 * presumably the creator always sets it before starting this
         	 * thread - confirm.
1877     	 */
1878     	SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %d\n", host->eh_notify->count.counter));
1879     
1880     	up(host->eh_notify);
1881     
1882     	while (1) {
1883     		/*
1884     		 * If we get a signal, it means we are supposed to go
1885     		 * away and die.  This typically happens if the user is
1886     		 * trying to unload a module.
1887     		 */
1888     		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n"));
1889     
1890     		/*
1891     		 * Note - we always use down_interruptible with the semaphore
1892     		 * even if the module was loaded as part of the kernel.  The
1893     		 * reason is that down() will cause this thread to be counted
1894     		 * in the load average as a running process, and down
1895     		 * interruptible doesn't.  Given that we need to allow this
1896     		 * thread to die if the driver was loaded as a module, using
1897     		 * semaphores isn't unreasonable.
         		 *
         		 * The return value of down_interruptible() is ignored; a
         		 * pending signal is tested for explicitly below, and only
         		 * for hosts that were loaded as modules.
1898     		 */
1899     		down_interruptible(&sem);
1900     		if( host->loaded_as_module ) {
1901     			if (signal_pending(current))
1902     				break;
1903                     }
1904     
1905     		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n"));
1906     
         		/*
         		 * eh_active is set only for the duration of the recovery
         		 * call below.
         		 */
1907     		host->eh_active = 1;
1908     
1909     		/*
1910     		 * We have a host that is failing for some reason.  Figure out
1911     		 * what we need to do to get it up and online again (if we can).
1912     		 * If we fail, we end up taking the thing offline.
         		 *
         		 * A strategy handler supplied by the host template takes
         		 * precedence; otherwise the generic scsi_unjam_host() is used.
1913     		 */
1914     		if (host->hostt->eh_strategy_handler != NULL) {
1915     			rtn = host->hostt->eh_strategy_handler(host);
1916     		} else {
1917     			rtn = scsi_unjam_host(host);
1918     		}
1919     
1920     		host->eh_active = 0;
1921     
1922     		/*
1923     		 * Note - if the above fails completely, the action is to take
1924     		 * individual devices offline and flush the queue of any
1925     		 * outstanding requests that may have been pending.  When we
1926     		 * restart, we restart any I/O to any other devices on the bus
1927     		 * which are still online.
1928     		 */
1929     		scsi_restart_operations(host);
1930     
1931     	}
1932     
1933     	SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n"));
1934     
1935     	/*
1936     	 * Make sure that nobody tries to wake us up again.
         	 * (sem lives on this function's stack, so the pointer must not
         	 * outlive the function.)
1937     	 */
1938     	host->eh_wait = NULL;
1939     
1940     	/*
1941     	 * Knock this down too.  From this point on, the host is flying
1942     	 * without a pilot.  If this is because the module is being unloaded,
1943     	 * that's fine.  If the user sent a signal to this thing, we are
1944     	 * potentially in real danger.
1945     	 */
1946     	host->in_recovery = 0;
1947     	host->eh_active = 0;
1948     	host->ehandler = NULL;
1949     
1950     	/*
1951     	 * If anyone is waiting for us to exit (i.e. someone trying to unload
1952     	 * a driver), then wake up that process to let them know we are on
1953     	 * the way out the door.  This may be overkill - I *think* that we
1954     	 * could probably just unload the driver and send the signal, and when
1955     	 * the error handling thread wakes up that it would just exit without
1956     	 * needing to touch any memory associated with the driver itself.
1957     	 */
1958     	if (host->eh_notify != NULL)
1959     		up(host->eh_notify);
1960     }
1961     
1962     /*
1963      * Overrides for Emacs so that we follow Linus's tabbing style.
1964      * Emacs will notice this stuff at the end of the file and automatically
1965      * adjust the settings for this buffer only.  This must remain at the end
1966      * of the file.
1967      * ---------------------------------------------------------------------------
1968      * Local variables:
1969      * c-indent-level: 4
1970      * c-brace-imaginary-offset: 0
1971      * c-brace-offset: -4
1972      * c-argdecl-indent: 4
1973      * c-label-offset: -4
1974      * c-continued-statement-offset: 4
1975      * c-continued-brace-offset: 0
1976      * indent-tabs-mode: nil
1977      * tab-width: 8
1978      * End:
1979      */
1980