File: /usr/src/linux/drivers/scsi/scsi_error.c
1 /*
2 * scsi_error.c Copyright (C) 1997 Eric Youngdale
3 *
4 * SCSI error/timeout handling
5 * Initial versions: Eric Youngdale. Based upon conversations with
6 * Leonard Zubkoff and David Miller at Linux Expo,
7 * ideas originating from all over the place.
8 *
9 */
10
11 #define __NO_VERSION__
12 #include <linux/module.h>
13
14 #include <linux/sched.h>
15 #include <linux/timer.h>
16 #include <linux/string.h>
17 #include <linux/slab.h>
18 #include <linux/ioport.h>
19 #include <linux/kernel.h>
20 #include <linux/stat.h>
21 #include <linux/blk.h>
22 #include <linux/interrupt.h>
23 #include <linux/delay.h>
24 #include <linux/smp_lock.h>
25
26 #define __KERNEL_SYSCALLS__
27
28 #include <linux/unistd.h>
29
30 #include <asm/system.h>
31 #include <asm/irq.h>
32 #include <asm/dma.h>
33
34 #include "scsi.h"
35 #include "hosts.h"
36 #include "constants.h"
37
/*
 * We must always allow SHUTDOWN_SIGS.  Even if we are not a module,
 * the host drivers that we are using may be loaded as modules, and
 * when we unload these, we need to ensure that the error handler thread
 * can be shut down.
 *
 * Note - when we unload a module, we send a SIGHUP.  We mustn't
 * enable SIGTERM, as this is how init shuts things down when you
 * go to single-user mode.  For that matter, init also sends SIGKILL,
 * so we mustn't enable that one either.  We use SIGHUP instead.  Other
 * options would be SIGPWR, I suppose.
 */
#define SHUTDOWN_SIGS	(sigmask(SIGHUP))

/*
 * Timeouts, in jiffies, for the commands issued during error recovery.
 * A DEBUG build uses SCSI_TIMEOUT for all three.
 */
#ifdef DEBUG
#define SENSE_TIMEOUT SCSI_TIMEOUT
#define ABORT_TIMEOUT SCSI_TIMEOUT
#define RESET_TIMEOUT SCSI_TIMEOUT
#else
#define SENSE_TIMEOUT (10*HZ)
#define RESET_TIMEOUT (2*HZ)
#define ABORT_TIMEOUT (15*HZ)
#endif

/*
 * Placeholder put in front of the helpers that are private to this file.
 * It currently expands to nothing.
 */
#define STATIC

/*
 * Time to sleep after a bus reset / host reset before carrying on.
 *
 * These should *probably* be handled by the host itself.
 * Since it is allowed to sleep, it probably should.
 *
 * The expansions are parenthesized so that the macros keep their value
 * when used inside a larger expression (e.g. "x / BUS_RESET_SETTLE_TIME").
 */
#define BUS_RESET_SETTLE_TIME   (5*HZ)
#define HOST_RESET_SETTLE_TIME  (10*HZ)
70
71
72 static const char RCSid[] = "$Header: /mnt/ide/home/eric/CVSROOT/linux/drivers/scsi/scsi_error.c,v 1.10 1997/12/08 04:50:35 eric Exp $";
73
74 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt);
75 STATIC int scsi_request_sense(Scsi_Cmnd *);
76 STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout);
77 STATIC int scsi_try_to_abort_command(Scsi_Cmnd *, int);
78 STATIC int scsi_test_unit_ready(Scsi_Cmnd *);
79 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd *, int timeout);
80 STATIC int scsi_try_bus_reset(Scsi_Cmnd *);
81 STATIC int scsi_try_host_reset(Scsi_Cmnd *);
82 STATIC int scsi_unit_is_ready(Scsi_Cmnd *);
83 STATIC void scsi_eh_action_done(Scsi_Cmnd *, int);
84 STATIC int scsi_eh_retry_command(Scsi_Cmnd *);
85 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt);
86 STATIC void scsi_restart_operations(struct Scsi_Host *);
87 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt);
88
89
90 /*
91 * Function: scsi_add_timer()
92 *
93 * Purpose: Start timeout timer for a single scsi command.
94 *
95 * Arguments: SCset - command that is about to start running.
96 * timeout - amount of time to allow this command to run.
97 * complete - timeout function to call if timer isn't
98 * canceled.
99 *
100 * Returns: Nothing
101 *
102 * Notes: This should be turned into an inline function.
103 *
104 * More Notes: Each scsi command has it's own timer, and as it is added to
105 * the queue, we set up the timer. When the command completes,
106 * we cancel the timer. Pretty simple, really, especially
107 * compared to the old way of handling this crap.
108 */
109 void scsi_add_timer(Scsi_Cmnd * SCset,
110 int timeout,
111 void (*complete) (Scsi_Cmnd *))
112 {
113
114 /*
115 * If the clock was already running for this command, then
116 * first delete the timer. The timer handling code gets rather
117 * confused if we don't do this.
118 */
119 if (SCset->eh_timeout.function != NULL) {
120 del_timer(&SCset->eh_timeout);
121 }
122 SCset->eh_timeout.data = (unsigned long) SCset;
123 SCset->eh_timeout.expires = jiffies + timeout;
124 SCset->eh_timeout.function = (void (*)(unsigned long)) complete;
125
126 SCset->done_late = 0;
127
128 SCSI_LOG_ERROR_RECOVERY(5, printk("Adding timer for command %p at %d (%p)\n", SCset, timeout, complete));
129
130 add_timer(&SCset->eh_timeout);
131
132 }
133
134 /*
135 * Function: scsi_delete_timer()
136 *
137 * Purpose: Delete/cancel timer for a given function.
138 *
139 * Arguments: SCset - command that we are canceling timer for.
140 *
141 * Returns: 1 if we were able to detach the timer. 0 if we
142 * blew it, and the timer function has already started
143 * to run.
144 *
145 * Notes: This should be turned into an inline function.
146 */
147 int scsi_delete_timer(Scsi_Cmnd * SCset)
148 {
149 int rtn;
150
151 rtn = del_timer(&SCset->eh_timeout);
152
153 SCSI_LOG_ERROR_RECOVERY(5, printk("Clearing timer for command %p %d\n", SCset, rtn));
154
155 SCset->eh_timeout.data = (unsigned long) NULL;
156 SCset->eh_timeout.function = NULL;
157
158 return rtn;
159 }
160
161 /*
162 * Function: scsi_times_out()
163 *
164 * Purpose: Timeout function for normal scsi commands..
165 *
166 * Arguments: SCpnt - command that is timing out.
167 *
168 * Returns: Nothing.
169 *
170 * Notes: We do not need to lock this. There is the potential for
171 * a race only in that the normal completion handling might
172 * run, but if the normal completion function determines
173 * that the timer has already fired, then it mustn't do
174 * anything.
175 */
176 void scsi_times_out(Scsi_Cmnd * SCpnt)
177 {
178 /*
179 * Notify the low-level code that this operation failed and we are
180 * reposessing the command.
181 */
182 #ifdef ERIC_neverdef
183 /*
184 * FIXME(eric)
185 * Allow the host adapter to push a queue ordering tag
186 * out to the bus to force the command in question to complete.
187 * If the host wants to do this, then we just restart the timer
188 * for the command. Before we really do this, some real thought
189 * as to the optimum way to handle this should be done. We *do*
190 * need to force ordering every so often to ensure that all requests
191 * do eventually complete, but I am not sure if this is the best way
192 * to actually go about it.
193 *
194 * Better yet, force a sync here, but don't block since we are in an
195 * interrupt.
196 */
197 if (SCpnt->host->hostt->eh_ordered_queue_tag) {
198 if ((*SCpnt->host->hostt->eh_ordered_queue_tag) (SCpnt)) {
199 scsi_add_timer(SCpnt, SCpnt->internal_timeout,
200 scsi_times_out);
201 return;
202 }
203 }
204 /*
205 * FIXME(eric) - add a second special interface to handle this
206 * case. Ideally that interface can also be used to request
207 * a queu
208 */
209 if (SCpnt->host->can_queue) {
210 SCpnt->host->hostt->queuecommand(SCpnt, NULL);
211 }
212 #endif
213
214 /* Set the serial_number_at_timeout to the current serial_number */
215 SCpnt->serial_number_at_timeout = SCpnt->serial_number;
216
217 SCpnt->eh_state = FAILED;
218 SCpnt->state = SCSI_STATE_TIMEOUT;
219 SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
220
221 SCpnt->host->in_recovery = 1;
222 SCpnt->host->host_failed++;
223
224 SCSI_LOG_TIMEOUT(3, printk("Command timed out active=%d busy=%d failed=%d\n",
225 atomic_read(&SCpnt->host->host_active),
226 SCpnt->host->host_busy,
227 SCpnt->host->host_failed));
228
229 /*
230 * If the host is having troubles, then look to see if this was the last
231 * command that might have failed. If so, wake up the error handler.
232 */
233 if( SCpnt->host->eh_wait == NULL ) {
234 panic("Error handler thread not present at %p %p %s %d",
235 SCpnt, SCpnt->host, __FILE__, __LINE__);
236 }
237 if (SCpnt->host->host_busy == SCpnt->host->host_failed) {
238 up(SCpnt->host->eh_wait);
239 }
240 }
241
242 /*
243 * Function scsi_block_when_processing_errors
244 *
245 * Purpose: Prevent more commands from being queued while error recovery
246 * is taking place.
247 *
248 * Arguments: SDpnt - device on which we are performing recovery.
249 *
250 * Returns: FALSE The device was taken offline by error recovery.
251 * TRUE OK to proceed.
252 *
253 * Notes: We block until the host is out of error recovery, and then
254 * check to see whether the host or the device is offline.
255 */
256 int scsi_block_when_processing_errors(Scsi_Device * SDpnt)
257 {
258
259 SCSI_SLEEP(&SDpnt->host->host_wait, SDpnt->host->in_recovery);
260
261 SCSI_LOG_ERROR_RECOVERY(5, printk("Open returning %d\n", SDpnt->online));
262
263 return SDpnt->online;
264 }
265
266 /*
267 * Function: scsi_eh_times_out()
268 *
269 * Purpose: Timeout function for error handling.
270 *
271 * Arguments: SCpnt - command that is timing out.
272 *
273 * Returns: Nothing.
274 *
275 * Notes: During error handling, the kernel thread will be sleeping
276 * waiting for some action to complete on the device. Our only
277 * job is to record that it timed out, and to wake up the
278 * thread.
279 */
280 STATIC
281 void scsi_eh_times_out(Scsi_Cmnd * SCpnt)
282 {
283 SCpnt->eh_state = SCSI_STATE_TIMEOUT;
284 SCSI_LOG_ERROR_RECOVERY(5, printk("In scsi_eh_times_out %p\n", SCpnt));
285
286 if (SCpnt->host->eh_action != NULL)
287 up(SCpnt->host->eh_action);
288 else
289 printk("Missing scsi error handler thread\n");
290 }
291
292
293 /*
294 * Function: scsi_eh_done()
295 *
296 * Purpose: Completion function for error handling.
297 *
298 * Arguments: SCpnt - command that is timing out.
299 *
300 * Returns: Nothing.
301 *
302 * Notes: During error handling, the kernel thread will be sleeping
303 * waiting for some action to complete on the device. Our only
304 * job is to record that the action completed, and to wake up the
305 * thread.
306 */
307 STATIC
308 void scsi_eh_done(Scsi_Cmnd * SCpnt)
309 {
310 int rtn;
311
312 /*
313 * If the timeout handler is already running, then just set the
314 * flag which says we finished late, and return. We have no
315 * way of stopping the timeout handler from running, so we must
316 * always defer to it.
317 */
318 rtn = del_timer(&SCpnt->eh_timeout);
319 if (!rtn) {
320 SCpnt->done_late = 1;
321 return;
322 }
323
324 SCpnt->request.rq_status = RQ_SCSI_DONE;
325
326 SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
327 SCpnt->eh_state = SUCCESS;
328
329 SCSI_LOG_ERROR_RECOVERY(5, printk("In eh_done %p result:%x\n", SCpnt,
330 SCpnt->result));
331
332 if (SCpnt->host->eh_action != NULL)
333 up(SCpnt->host->eh_action);
334 }
335
336 /*
337 * Function: scsi_eh_action_done()
338 *
339 * Purpose: Completion function for error handling.
340 *
341 * Arguments: SCpnt - command that is timing out.
342 * answer - boolean that indicates whether operation succeeded.
343 *
344 * Returns: Nothing.
345 *
346 * Notes: This callback is only used for abort and reset operations.
347 */
348 STATIC
349 void scsi_eh_action_done(Scsi_Cmnd * SCpnt, int answer)
350 {
351 SCpnt->request.rq_status = RQ_SCSI_DONE;
352
353 SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
354 SCpnt->eh_state = (answer ? SUCCESS : FAILED);
355
356 if (SCpnt->host->eh_action != NULL)
357 up(SCpnt->host->eh_action);
358 }
359
360 /*
361 * Function: scsi_sense_valid()
362 *
363 * Purpose: Determine whether a host has automatically obtained sense
364 * information or not. If we have it, then give a recommendation
365 * as to what we should do next.
366 */
367 int scsi_sense_valid(Scsi_Cmnd * SCpnt)
368 {
369 if (((SCpnt->sense_buffer[0] & 0x70) >> 4) != 7) {
370 return FALSE;
371 }
372 return TRUE;
373 }
374
375 /*
376 * Function: scsi_eh_retry_command()
377 *
378 * Purpose: Retry the original command
379 *
380 * Returns: SUCCESS - we were able to get the sense data.
381 * FAILED - we were not able to get the sense data.
382 *
383 * Notes: This function will *NOT* return until the command either
384 * times out, or it completes.
385 */
386 STATIC int scsi_eh_retry_command(Scsi_Cmnd * SCpnt)
387 {
388 memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
389 sizeof(SCpnt->data_cmnd));
390 SCpnt->request_buffer = SCpnt->buffer;
391 SCpnt->request_bufflen = SCpnt->bufflen;
392 SCpnt->use_sg = SCpnt->old_use_sg;
393 SCpnt->cmd_len = SCpnt->old_cmd_len;
394 SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
395 SCpnt->underflow = SCpnt->old_underflow;
396
397 scsi_send_eh_cmnd(SCpnt, SCpnt->timeout_per_command);
398
399 /*
400 * Hey, we are done. Let's look to see what happened.
401 */
402 return SCpnt->eh_state;
403 }
404
405 /*
406 * Function: scsi_request_sense()
407 *
408 * Purpose: Request sense data from a particular target.
409 *
410 * Returns: SUCCESS - we were able to get the sense data.
411 * FAILED - we were not able to get the sense data.
412 *
413 * Notes: Some hosts automatically obtain this information, others
414 * require that we obtain it on our own.
415 *
416 * This function will *NOT* return until the command either
417 * times out, or it completes.
418 */
419 STATIC int scsi_request_sense(Scsi_Cmnd * SCpnt)
420 {
421 static unsigned char generic_sense[6] =
422 {REQUEST_SENSE, 0, 0, 0, 255, 0};
423 unsigned char scsi_result0[256], *scsi_result = NULL;
424 int saved_result;
425
426 ASSERT_LOCK(&io_request_lock, 0);
427
428 memcpy((void *) SCpnt->cmnd, (void *) generic_sense,
429 sizeof(generic_sense));
430
431 if (SCpnt->device->scsi_level <= SCSI_2)
432 SCpnt->cmnd[1] = SCpnt->lun << 5;
433
434 scsi_result = (!SCpnt->host->hostt->unchecked_isa_dma)
435 ? &scsi_result0[0] : kmalloc(512, GFP_ATOMIC | GFP_DMA);
436
437 if (scsi_result == NULL) {
438 printk("cannot allocate scsi_result in scsi_request_sense.\n");
439 return FAILED;
440 }
441 /*
442 * Zero the sense buffer. Some host adapters automatically always request
443 * sense, so it is not a good idea that SCpnt->request_buffer and
444 * SCpnt->sense_buffer point to the same address (DB).
445 * 0 is not a valid sense code.
446 */
447 memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
448 memset((void *) scsi_result, 0, 256);
449
450 saved_result = SCpnt->result;
451 SCpnt->request_buffer = scsi_result;
452 SCpnt->request_bufflen = 256;
453 SCpnt->use_sg = 0;
454 SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
455 SCpnt->sc_data_direction = SCSI_DATA_READ;
456 SCpnt->underflow = 0;
457
458 scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
459
460 /* Last chance to have valid sense data */
461 if (!scsi_sense_valid(SCpnt))
462 memcpy((void *) SCpnt->sense_buffer,
463 SCpnt->request_buffer,
464 sizeof(SCpnt->sense_buffer));
465
466 if (scsi_result != &scsi_result0[0] && scsi_result != NULL)
467 kfree(scsi_result);
468
469 /*
470 * When we eventually call scsi_finish, we really wish to complete
471 * the original request, so let's restore the original data. (DB)
472 */
473 memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
474 sizeof(SCpnt->data_cmnd));
475 SCpnt->result = saved_result;
476 SCpnt->request_buffer = SCpnt->buffer;
477 SCpnt->request_bufflen = SCpnt->bufflen;
478 SCpnt->use_sg = SCpnt->old_use_sg;
479 SCpnt->cmd_len = SCpnt->old_cmd_len;
480 SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
481 SCpnt->underflow = SCpnt->old_underflow;
482
483 /*
484 * Hey, we are done. Let's look to see what happened.
485 */
486 return SCpnt->eh_state;
487 }
488
489 /*
490 * Function: scsi_test_unit_ready()
491 *
492 * Purpose: Run test unit ready command to see if the device is talking to us or not.
493 *
494 */
495 STATIC int scsi_test_unit_ready(Scsi_Cmnd * SCpnt)
496 {
497 static unsigned char tur_command[6] =
498 {TEST_UNIT_READY, 0, 0, 0, 0, 0};
499
500 memcpy((void *) SCpnt->cmnd, (void *) tur_command,
501 sizeof(tur_command));
502
503 if (SCpnt->device->scsi_level <= SCSI_2)
504 SCpnt->cmnd[1] = SCpnt->lun << 5;
505
506 /*
507 * Zero the sense buffer. The SCSI spec mandates that any
508 * untransferred sense data should be interpreted as being zero.
509 */
510 memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
511
512 SCpnt->request_buffer = NULL;
513 SCpnt->request_bufflen = 0;
514 SCpnt->use_sg = 0;
515 SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
516 SCpnt->underflow = 0;
517 SCpnt->sc_data_direction = SCSI_DATA_NONE;
518
519 scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
520
521 /*
522 * When we eventually call scsi_finish, we really wish to complete
523 * the original request, so let's restore the original data. (DB)
524 */
525 memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
526 sizeof(SCpnt->data_cmnd));
527 SCpnt->request_buffer = SCpnt->buffer;
528 SCpnt->request_bufflen = SCpnt->bufflen;
529 SCpnt->use_sg = SCpnt->old_use_sg;
530 SCpnt->cmd_len = SCpnt->old_cmd_len;
531 SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
532 SCpnt->underflow = SCpnt->old_underflow;
533
534 /*
535 * Hey, we are done. Let's look to see what happened.
536 */
537 SCSI_LOG_ERROR_RECOVERY(3,
538 printk("scsi_test_unit_ready: SCpnt %p eh_state %x\n",
539 SCpnt, SCpnt->eh_state));
540 return SCpnt->eh_state;
541 }
542
543 /*
544 * This would normally need to get the IO request lock,
545 * but as it doesn't actually touch anything that needs
546 * to be locked we can avoid the lock here..
547 */
548 STATIC
549 void scsi_sleep_done(struct semaphore *sem)
550 {
551 if (sem != NULL) {
552 up(sem);
553 }
554 }
555
556 void scsi_sleep(int timeout)
557 {
558 DECLARE_MUTEX_LOCKED(sem);
559 struct timer_list timer;
560
561 init_timer(&timer);
562 timer.data = (unsigned long) &sem;
563 timer.expires = jiffies + timeout;
564 timer.function = (void (*)(unsigned long)) scsi_sleep_done;
565
566 SCSI_LOG_ERROR_RECOVERY(5, printk("Sleeping for timer tics %d\n", timeout));
567
568 add_timer(&timer);
569
570 down(&sem);
571 del_timer(&timer);
572 }
573
574 /*
575 * Function: scsi_send_eh_cmnd
576 *
577 * Purpose: Send a command out to a device as part of error recovery.
578 *
579 * Notes: The initialization of the structures is quite a bit different
580 * in this case, and furthermore, there is a different completion
581 * handler.
582 */
583 STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout)
584 {
585 unsigned long flags;
586 struct Scsi_Host *host;
587
588 ASSERT_LOCK(&io_request_lock, 0);
589
590 host = SCpnt->host;
591
592 retry:
593 /*
594 * We will use a queued command if possible, otherwise we will emulate the
595 * queuing and calling of completion function ourselves.
596 */
597 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
598
599 if (host->can_queue) {
600 DECLARE_MUTEX_LOCKED(sem);
601
602 SCpnt->eh_state = SCSI_STATE_QUEUED;
603
604 scsi_add_timer(SCpnt, timeout, scsi_eh_times_out);
605
606 /*
607 * Set up the semaphore so we wait for the command to complete.
608 */
609 SCpnt->host->eh_action = &sem;
610 SCpnt->request.rq_status = RQ_SCSI_BUSY;
611
612 spin_lock_irqsave(&io_request_lock, flags);
613 host->hostt->queuecommand(SCpnt, scsi_eh_done);
614 spin_unlock_irqrestore(&io_request_lock, flags);
615
616 down(&sem);
617
618 SCpnt->host->eh_action = NULL;
619
620 /*
621 * See if timeout. If so, tell the host to forget about it.
622 * In other words, we don't want a callback any more.
623 */
624 if (SCpnt->eh_state == SCSI_STATE_TIMEOUT) {
625 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
626
627 /*
628 * As far as the low level driver is
629 * concerned, this command is still active, so
630 * we must give the low level driver a chance
631 * to abort it. (DB)
632 *
633 * FIXME(eric) - we are not tracking whether we could
634 * abort a timed out command or not. Not sure how
635 * we should treat them differently anyways.
636 */
637 spin_lock_irqsave(&io_request_lock, flags);
638 if (SCpnt->host->hostt->eh_abort_handler)
639 SCpnt->host->hostt->eh_abort_handler(SCpnt);
640 spin_unlock_irqrestore(&io_request_lock, flags);
641
642 SCpnt->request.rq_status = RQ_SCSI_DONE;
643 SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
644
645 SCpnt->eh_state = FAILED;
646 }
647 SCSI_LOG_ERROR_RECOVERY(5, printk("send_eh_cmnd: %p eh_state:%x\n",
648 SCpnt, SCpnt->eh_state));
649 } else {
650 int temp;
651
652 /*
653 * We damn well had better never use this code. There is no timeout
654 * protection here, since we would end up waiting in the actual low
655 * level driver, we don't know how to wake it up.
656 */
657 spin_lock_irqsave(&io_request_lock, flags);
658 temp = host->hostt->command(SCpnt);
659 spin_unlock_irqrestore(&io_request_lock, flags);
660
661 SCpnt->result = temp;
662 /* Fall through to code below to examine status. */
663 SCpnt->eh_state = SUCCESS;
664 }
665
666 /*
667 * Now examine the actual status codes to see whether the command actually
668 * did complete normally.
669 */
670 if (SCpnt->eh_state == SUCCESS) {
671 int ret = scsi_eh_completed_normally(SCpnt);
672 SCSI_LOG_ERROR_RECOVERY(3,
673 printk("scsi_send_eh_cmnd: scsi_eh_completed_normally %x\n", ret));
674 switch (ret) {
675 case SUCCESS:
676 SCpnt->eh_state = SUCCESS;
677 break;
678 case NEEDS_RETRY:
679 goto retry;
680 case FAILED:
681 default:
682 SCpnt->eh_state = FAILED;
683 break;
684 }
685 } else {
686 SCpnt->eh_state = FAILED;
687 }
688 }
689
690 /*
691 * Function: scsi_unit_is_ready()
692 *
693 * Purpose: Called after TEST_UNIT_READY is run, to test to see if
694 * the unit responded in a way that indicates it is ready.
695 */
696 STATIC int scsi_unit_is_ready(Scsi_Cmnd * SCpnt)
697 {
698 if (SCpnt->result) {
699 if (((driver_byte(SCpnt->result) & DRIVER_SENSE) ||
700 (status_byte(SCpnt->result) & CHECK_CONDITION)) &&
701 ((SCpnt->sense_buffer[0] & 0x70) >> 4) == 7) {
702 if (((SCpnt->sense_buffer[2] & 0xf) != NOT_READY) &&
703 ((SCpnt->sense_buffer[2] & 0xf) != UNIT_ATTENTION) &&
704 ((SCpnt->sense_buffer[2] & 0xf) != ILLEGAL_REQUEST)) {
705 return 0;
706 }
707 }
708 }
709 return 1;
710 }
711
712 /*
713 * Function: scsi_eh_finish_command
714 *
715 * Purpose: Handle a command that we are finished with WRT error handling.
716 *
717 * Arguments: SClist - pointer to list into which we are putting completed commands.
718 * SCpnt - command that is completing
719 *
720 * Notes: We don't want to use the normal command completion while we are
721 * are still handling errors - it may cause other commands to be queued,
722 * and that would disturb what we are doing. Thus we really want to keep
723 * a list of pending commands for final completion, and once we
724 * are ready to leave error handling we handle completion for real.
725 */
726 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt)
727 {
728 SCpnt->state = SCSI_STATE_BHQUEUE;
729 SCpnt->bh_next = *SClist;
730 /*
731 * Set this back so that the upper level can correctly free up
732 * things.
733 */
734 SCpnt->use_sg = SCpnt->old_use_sg;
735 SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
736 SCpnt->underflow = SCpnt->old_underflow;
737 *SClist = SCpnt;
738 }
739
740 /*
741 * Function: scsi_try_to_abort_command
742 *
743 * Purpose: Ask host adapter to abort a running command.
744 *
745 * Returns: FAILED Operation failed or not supported.
746 * SUCCESS Succeeded.
747 *
748 * Notes: This function will not return until the user's completion
749 * function has been called. There is no timeout on this
750 * operation. If the author of the low-level driver wishes
751 * this operation to be timed, they can provide this facility
752 * themselves. Helper functions in scsi_error.c can be supplied
753 * to make this easier to do.
754 *
755 * Notes: It may be possible to combine this with all of the reset
756 * handling to eliminate a lot of code duplication. I don't
757 * know what makes more sense at the moment - this is just a
758 * prototype.
759 */
760 STATIC int scsi_try_to_abort_command(Scsi_Cmnd * SCpnt, int timeout)
761 {
762 int rtn;
763 unsigned long flags;
764
765 SCpnt->eh_state = FAILED; /* Until we come up with something better */
766
767 if (SCpnt->host->hostt->eh_abort_handler == NULL) {
768 return FAILED;
769 }
770 /*
771 * scsi_done was called just after the command timed out and before
772 * we had a chance to process it. (DB)
773 */
774 if (SCpnt->serial_number == 0)
775 return SUCCESS;
776
777 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
778
779 spin_lock_irqsave(&io_request_lock, flags);
780 rtn = SCpnt->host->hostt->eh_abort_handler(SCpnt);
781 spin_unlock_irqrestore(&io_request_lock, flags);
782 return rtn;
783 }
784
785 /*
786 * Function: scsi_try_bus_device_reset
787 *
788 * Purpose: Ask host adapter to perform a bus device reset for a given
789 * device.
790 *
791 * Returns: FAILED Operation failed or not supported.
792 * SUCCESS Succeeded.
793 *
794 * Notes: There is no timeout for this operation. If this operation is
795 * unreliable for a given host, then the host itself needs to put a
796 * timer on it, and set the host back to a consistent state prior
797 * to returning.
798 */
799 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt, int timeout)
800 {
801 unsigned long flags;
802 int rtn;
803
804 SCpnt->eh_state = FAILED; /* Until we come up with something better */
805
806 if (SCpnt->host->hostt->eh_device_reset_handler == NULL) {
807 return FAILED;
808 }
809 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
810
811 spin_lock_irqsave(&io_request_lock, flags);
812 rtn = SCpnt->host->hostt->eh_device_reset_handler(SCpnt);
813 spin_unlock_irqrestore(&io_request_lock, flags);
814
815 if (rtn == SUCCESS)
816 SCpnt->eh_state = SUCCESS;
817
818 return SCpnt->eh_state;
819 }
820
821 /*
822 * Function: scsi_try_bus_reset
823 *
824 * Purpose: Ask host adapter to perform a bus reset for a host.
825 *
826 * Returns: FAILED Operation failed or not supported.
827 * SUCCESS Succeeded.
828 *
829 * Notes:
830 */
831 STATIC int scsi_try_bus_reset(Scsi_Cmnd * SCpnt)
832 {
833 unsigned long flags;
834 int rtn;
835
836 SCpnt->eh_state = FAILED; /* Until we come up with something better */
837 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
838 SCpnt->serial_number_at_timeout = SCpnt->serial_number;
839
840 if (SCpnt->host->hostt->eh_bus_reset_handler == NULL) {
841 return FAILED;
842 }
843
844 spin_lock_irqsave(&io_request_lock, flags);
845 rtn = SCpnt->host->hostt->eh_bus_reset_handler(SCpnt);
846 spin_unlock_irqrestore(&io_request_lock, flags);
847
848 if (rtn == SUCCESS)
849 SCpnt->eh_state = SUCCESS;
850
851 /*
852 * If we had a successful bus reset, mark the command blocks to expect
853 * a condition code of unit attention.
854 */
855 scsi_sleep(BUS_RESET_SETTLE_TIME);
856 if (SCpnt->eh_state == SUCCESS) {
857 Scsi_Device *SDloop;
858 for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next) {
859 if (SCpnt->channel == SDloop->channel) {
860 SDloop->was_reset = 1;
861 SDloop->expecting_cc_ua = 1;
862 }
863 }
864 }
865 return SCpnt->eh_state;
866 }
867
868 /*
869 * Function: scsi_try_host_reset
870 *
871 * Purpose: Ask host adapter to reset itself, and the bus.
872 *
873 * Returns: FAILED Operation failed or not supported.
874 * SUCCESS Succeeded.
875 *
876 * Notes:
877 */
878 STATIC int scsi_try_host_reset(Scsi_Cmnd * SCpnt)
879 {
880 unsigned long flags;
881 int rtn;
882
883 SCpnt->eh_state = FAILED; /* Until we come up with something better */
884 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
885 SCpnt->serial_number_at_timeout = SCpnt->serial_number;
886
887 if (SCpnt->host->hostt->eh_host_reset_handler == NULL) {
888 return FAILED;
889 }
890 spin_lock_irqsave(&io_request_lock, flags);
891 rtn = SCpnt->host->hostt->eh_host_reset_handler(SCpnt);
892 spin_unlock_irqrestore(&io_request_lock, flags);
893
894 if (rtn == SUCCESS)
895 SCpnt->eh_state = SUCCESS;
896
897 /*
898 * If we had a successful host reset, mark the command blocks to expect
899 * a condition code of unit attention.
900 */
901 scsi_sleep(HOST_RESET_SETTLE_TIME);
902 if (SCpnt->eh_state == SUCCESS) {
903 Scsi_Device *SDloop;
904 for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next) {
905 SDloop->was_reset = 1;
906 SDloop->expecting_cc_ua = 1;
907 }
908 }
909 return SCpnt->eh_state;
910 }
911
912 /*
913 * Function: scsi_decide_disposition
914 *
915 * Purpose: Examine a command block that has come back from the low-level
916 * and figure out what to do next.
917 *
918 * Returns: SUCCESS - pass on to upper level.
919 * FAILED - pass on to error handler thread.
920 * RETRY - command should be retried.
921 * SOFTERR - command succeeded, but we need to log
922 * a soft error.
923 *
924 * Notes: This is *ONLY* called when we are examining the status
925 * after sending out the actual data command. Any commands
926 * that are queued for error recovery (i.e. TEST_UNIT_READY)
927 * do *NOT* come through here.
928 *
929 * NOTE - When this routine returns FAILED, it means the error
930 * handler thread is woken. In cases where the error code
931 * indicates an error that doesn't require the error handler
932 * thread (i.e. we don't need to abort/reset), then this function
933 * should return SUCCESS.
934 */
935 int scsi_decide_disposition(Scsi_Cmnd * SCpnt)
936 {
937 int rtn;
938
939 /*
940 * If the device is offline, then we clearly just pass the result back
941 * up to the top level.
942 */
943 if (SCpnt->device->online == FALSE) {
944 SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: device offline - report as SUCCESS\n"));
945 return SUCCESS;
946 }
947 /*
948 * First check the host byte, to see if there is anything in there
949 * that would indicate what we need to do.
950 */
951
952 switch (host_byte(SCpnt->result)) {
953 case DID_PASSTHROUGH:
954 /*
955 * No matter what, pass this through to the upper layer.
956 * Nuke this special code so that it looks like we are saying
957 * DID_OK.
958 */
959 SCpnt->result &= 0xff00ffff;
960 return SUCCESS;
961 case DID_OK:
962 /*
963 * Looks good. Drop through, and check the next byte.
964 */
965 break;
966 case DID_NO_CONNECT:
967 case DID_BAD_TARGET:
968 case DID_ABORT:
969 /*
970 * Note - this means that we just report the status back to the
971 * top level driver, not that we actually think that it indicates
972 * success.
973 */
974 return SUCCESS;
975 /*
976 * When the low level driver returns DID_SOFT_ERROR,
977 * it is responsible for keeping an internal retry counter
978 * in order to avoid endless loops (DB)
979 *
980 * Actually this is a bug in this function here. We should
981 * be mindful of the maximum number of retries specified
982 * and not get stuck in a loop.
983 */
984 case DID_SOFT_ERROR:
985 goto maybe_retry;
986
987 case DID_BUS_BUSY:
988 case DID_PARITY:
989 case DID_ERROR:
990 goto maybe_retry;
991 case DID_TIME_OUT:
992 /*
993 * When we scan the bus, we get timeout messages for
994 * these commands if there is no device available.
995 * Other hosts report DID_NO_CONNECT for the same thing.
996 */
997 if ((SCpnt->cmnd[0] == TEST_UNIT_READY ||
998 SCpnt->cmnd[0] == INQUIRY)) {
999 return SUCCESS;
1000 } else {
1001 return FAILED;
1002 }
1003 case DID_RESET:
1004 /*
1005 * In the normal case where we haven't initiated a reset, this is
1006 * a failure.
1007 */
1008 if (SCpnt->flags & IS_RESETTING) {
1009 SCpnt->flags &= ~IS_RESETTING;
1010 goto maybe_retry;
1011 }
1012 /*
1013 * Examine the sense data to figure out how to proceed from here.
1014 * If there is no sense data, we will be forced into the error
1015 * handler thread, where we get to examine the thing in a lot more
1016 * detail.
1017 */
1018 return scsi_check_sense(SCpnt);
1019 default:
1020 return FAILED;
1021 }
1022
1023 /*
1024 * Next, check the message byte.
1025 */
1026 if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
1027 return FAILED;
1028 }
1029 /*
1030 * Now, check the status byte to see if this indicates anything special.
1031 */
1032 switch (status_byte(SCpnt->result)) {
1033 case QUEUE_FULL:
1034 /*
1035 * The case of trying to send too many commands to a tagged queueing
1036 * device.
1037 */
1038 return ADD_TO_MLQUEUE;
1039 case GOOD:
1040 case COMMAND_TERMINATED:
1041 return SUCCESS;
1042 case CHECK_CONDITION:
1043 rtn = scsi_check_sense(SCpnt);
1044 if (rtn == NEEDS_RETRY) {
1045 goto maybe_retry;
1046 }
1047 return rtn;
1048 case CONDITION_GOOD:
1049 case INTERMEDIATE_GOOD:
1050 case INTERMEDIATE_C_GOOD:
1051 /*
1052 * Who knows? FIXME(eric)
1053 */
1054 return SUCCESS;
1055 case BUSY:
1056 case RESERVATION_CONFLICT:
1057 goto maybe_retry;
1058 default:
1059 return FAILED;
1060 }
1061 return FAILED;
1062
1063 maybe_retry:
1064
1065 if ((++SCpnt->retries) < SCpnt->allowed) {
1066 return NEEDS_RETRY;
1067 } else {
1068 /*
1069 * No more retries - report this one back to upper level.
1070 */
1071 return SUCCESS;
1072 }
1073 }
1074
1075 /*
1076 * Function: scsi_eh_completed_normally
1077 *
1078 * Purpose: Examine a command block that has come back from the low-level
1079 * and figure out what to do next.
1080 *
1081 * Returns: SUCCESS - pass on to upper level.
1082 * FAILED - pass on to error handler thread.
1083 * RETRY - command should be retried.
1084 * SOFTERR - command succeeded, but we need to log
1085 * a soft error.
1086 *
1087 * Notes: This is *ONLY* called when we are examining the status
1088 * of commands queued during error recovery. The main
1089 * difference here is that we don't allow for the possibility
1090 * of retries here, and we are a lot more restrictive about what
1091 * we consider acceptable.
1092 */
1093 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt)
1094 {
1095 /*
1096 * First check the host byte, to see if there is anything in there
1097 * that would indicate what we need to do.
1098 */
1099 if (host_byte(SCpnt->result) == DID_RESET) {
1100 if (SCpnt->flags & IS_RESETTING) {
1101 /*
1102 * OK, this is normal. We don't know whether in fact the
1103 * command in question really needs to be rerun or not -
1104 * if this was the original data command then the answer is yes,
1105 * otherwise we just flag it as success.
1106 */
1107 SCpnt->flags &= ~IS_RESETTING;
1108 return NEEDS_RETRY;
1109 }
1110 /*
1111 * Rats. We are already in the error handler, so we now get to try
1112 * and figure out what to do next. If the sense is valid, we have
1113 * a pretty good idea of what to do. If not, we mark it as failed.
1114 */
1115 return scsi_check_sense(SCpnt);
1116 }
1117 if (host_byte(SCpnt->result) != DID_OK) {
1118 return FAILED;
1119 }
1120 /*
1121 * Next, check the message byte.
1122 */
1123 if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
1124 return FAILED;
1125 }
1126 /*
1127 * Now, check the status byte to see if this indicates anything special.
1128 */
1129 switch (status_byte(SCpnt->result)) {
1130 case GOOD:
1131 case COMMAND_TERMINATED:
1132 return SUCCESS;
1133 case CHECK_CONDITION:
1134 return scsi_check_sense(SCpnt);
1135 case CONDITION_GOOD:
1136 case INTERMEDIATE_GOOD:
1137 case INTERMEDIATE_C_GOOD:
1138 /*
1139 * Who knows? FIXME(eric)
1140 */
1141 return SUCCESS;
1142 case BUSY:
1143 case QUEUE_FULL:
1144 case RESERVATION_CONFLICT:
1145 default:
1146 return FAILED;
1147 }
1148 return FAILED;
1149 }
1150
1151 /*
1152 * Function: scsi_check_sense
1153 *
1154 * Purpose: Examine sense information - give suggestion as to what
1155 * we should do with it.
1156 */
1157 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt)
1158 {
1159 if (!scsi_sense_valid(SCpnt)) {
1160 return FAILED;
1161 }
1162 if (SCpnt->sense_buffer[2] & 0xe0)
1163 return SUCCESS;
1164
1165 switch (SCpnt->sense_buffer[2] & 0xf) {
1166 case NO_SENSE:
1167 return SUCCESS;
1168 case RECOVERED_ERROR:
1169 return /* SOFT_ERROR */ SUCCESS;
1170
1171 case ABORTED_COMMAND:
1172 return NEEDS_RETRY;
1173 case NOT_READY:
1174 case UNIT_ATTENTION:
1175 /*
1176 * If we are expecting a CC/UA because of a bus reset that we
1177 * performed, treat this just as a retry. Otherwise this is
1178 * information that we should pass up to the upper-level driver
1179 * so that we can deal with it there.
1180 */
1181 if (SCpnt->device->expecting_cc_ua) {
1182 SCpnt->device->expecting_cc_ua = 0;
1183 return NEEDS_RETRY;
1184 }
1185 /*
1186 * If the device is in the process of becoming ready, we
1187 * should retry.
1188 */
1189 if ((SCpnt->sense_buffer[12] == 0x04) &&
1190 (SCpnt->sense_buffer[13] == 0x01)) {
1191 return NEEDS_RETRY;
1192 }
1193 return SUCCESS;
1194
1195 /* these three are not supported */
1196 case COPY_ABORTED:
1197 case VOLUME_OVERFLOW:
1198 case MISCOMPARE:
1199 return SUCCESS;
1200
1201 case MEDIUM_ERROR:
1202 return NEEDS_RETRY;
1203
1204 case ILLEGAL_REQUEST:
1205 case BLANK_CHECK:
1206 case DATA_PROTECT:
1207 case HARDWARE_ERROR:
1208 default:
1209 return SUCCESS;
1210 }
1211 }
1212
1213
1214 /*
1215 * Function: scsi_restart_operations
1216 *
1217 * Purpose: Restart IO operations to the specified host.
1218 *
1219 * Arguments: host - host that we are restarting
1220 *
1221 * Lock status: Assumed that locks are not held upon entry.
1222 *
1223 * Returns: Nothing
1224 *
1225 * Notes: When we entered the error handler, we blocked all further
1226 * I/O to this device. We need to 'reverse' this process.
1227 */
1228 STATIC void scsi_restart_operations(struct Scsi_Host *host)
1229 {
1230 Scsi_Device *SDpnt;
1231 unsigned long flags;
1232
1233 ASSERT_LOCK(&io_request_lock, 0);
1234
1235 /*
1236 * Next free up anything directly waiting upon the host. This will be
1237 * requests for character device operations, and also for ioctls to queued
1238 * block devices.
1239 */
1240 SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: Waking up host to restart\n"));
1241
1242 wake_up(&host->host_wait);
1243
1244 /*
1245 * Finally we need to re-initiate requests that may be pending. We will
1246 * have had everything blocked while error handling is taking place, and
1247 * now that error recovery is done, we will need to ensure that these
1248 * requests are started.
1249 */
1250 spin_lock_irqsave(&io_request_lock, flags);
1251 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1252 request_queue_t *q;
1253 if ((host->can_queue > 0 && (host->host_busy >= host->can_queue))
1254 || (host->host_blocked)
1255 || (host->host_self_blocked)
1256 || (SDpnt->device_blocked)) {
1257 break;
1258 }
1259 q = &SDpnt->request_queue;
1260 q->request_fn(q);
1261 }
1262 spin_unlock_irqrestore(&io_request_lock, flags);
1263 }
1264
1265 /*
1266 * Function: scsi_unjam_host
1267 *
1268 * Purpose: Attempt to fix a host which has a command that failed for
1269 * some reason.
1270 *
1271 * Arguments: host - host that needs unjamming.
1272 *
1273 * Returns: Nothing
1274 *
1275 * Notes: When we come in here, we *know* that all commands on the
1276 * bus have either completed, failed or timed out. We also
1277 * know that no further commands are being sent to the host,
1278 * so things are relatively quiet and we have freedom to
1279 * fiddle with things as we wish.
1280 *
1281 * Additional note: This is only the *default* implementation. It is possible
1282 * for individual drivers to supply their own version of this
1283 * function, and if the maintainer wishes to do this, it is
1284 * strongly suggested that this function be taken as a template
1285 * and modified. This function was designed to correctly handle
1286 * problems for about 95% of the different cases out there, and
1287 * it should always provide at least a reasonable amount of error
1288 * recovery.
1289 *
1290 * Note3: Any command marked 'FAILED' or 'TIMEOUT' must eventually
1291 * have scsi_finish_command() called for it. We do all of
1292 * the retry stuff here, so when we restart the host after we
1293 * return it should have an empty queue.
1294 */
1295 STATIC int scsi_unjam_host(struct Scsi_Host *host)
1296 {
1297 int devices_failed;
1298 int numfailed;
1299 int ourrtn;
1300 int rtn = FALSE;
1301 int result;
1302 Scsi_Cmnd *SCloop;
1303 Scsi_Cmnd *SCpnt;
1304 Scsi_Device *SDpnt;
1305 Scsi_Device *SDloop;
1306 Scsi_Cmnd *SCdone;
1307 int timed_out;
1308
1309 ASSERT_LOCK(&io_request_lock, 0);
1310
1311 SCdone = NULL;
1312
1313 /*
1314 * First, protect against any sort of race condition. If any of the outstanding
1315 * commands are in states that indicate that we are not yet blocked (i.e. we are
1316 * not in a quiet state) then we got woken up in error. If we ever end up here,
1317 * we need to re-examine some of the assumptions.
1318 */
1319 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1320 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1321 if (SCpnt->state == SCSI_STATE_FAILED
1322 || SCpnt->state == SCSI_STATE_TIMEOUT
1323 || SCpnt->state == SCSI_STATE_INITIALIZING
1324 || SCpnt->state == SCSI_STATE_UNUSED) {
1325 continue;
1326 }
1327 /*
1328 * Rats. Something is still floating around out there. This could
1329 * be the result of the fact that the upper level drivers are still frobbing
1330 * commands that might have succeeded. There are two outcomes. One is that
1331 * the command block will eventually be freed, and the other one is that
1332 * the command will be queued and will be finished along the way.
1333 */
1334 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler prematurely woken - commands still active (%p %x %d)\n", SCpnt, SCpnt->state, SCpnt->target));
1335
1336 /*
1337 * panic("SCSI Error handler woken too early\n");
1338 *
1339 * This is no longer a problem, since now the code cares only about
1340 * SCSI_STATE_TIMEOUT and SCSI_STATE_FAILED.
1341 * Other states are useful only to release active commands when devices are
1342 * set offline. If (host->host_active == host->host_busy) we can safely assume
1343 * that there are no commands in state other then TIMEOUT od FAILED. (DB)
1344 *
1345 * FIXME:
1346 * It is not easy to release correctly commands according to their state when
1347 * devices are set offline, when the state is neither TIMEOUT nor FAILED.
1348 * When a device is set offline, we can have some command with
1349 * rq_status=RQ_SCSY_BUSY, owner=SCSI_STATE_HIGHLEVEL,
1350 * state=SCSI_STATE_INITIALIZING and the driver module cannot be released.
1351 * (DB, 17 May 1998)
1352 */
1353 }
1354 }
1355
1356 /*
1357 * Next, see if we need to request sense information. if so,
1358 * then get it now, so we have a better idea of what to do.
1359 * FIXME(eric) this has the unfortunate side effect that if a host
1360 * adapter does not automatically request sense information, that we end
1361 * up shutting it down before we request it. All hosts should be doing this
1362 * anyways, so for now all I have to say is tough noogies if you end up in here.
1363 * On second thought, this is probably a good idea. We *really* want to give
1364 * authors an incentive to automatically request this.
1365 */
1366 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we need to request sense\n"));
1367
1368 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1369 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1370 if (SCpnt->state != SCSI_STATE_FAILED || scsi_sense_valid(SCpnt)) {
1371 continue;
1372 }
1373 SCSI_LOG_ERROR_RECOVERY(2, printk("scsi_unjam_host: Requesting sense for %d\n",
1374 SCpnt->target));
1375 rtn = scsi_request_sense(SCpnt);
1376 if (rtn != SUCCESS) {
1377 continue;
1378 }
1379 SCSI_LOG_ERROR_RECOVERY(3, printk("Sense requested for %p - result %x\n",
1380 SCpnt, SCpnt->result));
1381 SCSI_LOG_ERROR_RECOVERY(3, print_sense("bh", SCpnt));
1382
1383 result = scsi_decide_disposition(SCpnt);
1384
1385 /*
1386 * If the result was normal, then just pass it along to the
1387 * upper level.
1388 */
1389 if (result == SUCCESS) {
1390 SCpnt->host->host_failed--;
1391 scsi_eh_finish_command(&SCdone, SCpnt);
1392 }
1393 if (result != NEEDS_RETRY) {
1394 continue;
1395 }
1396 /*
1397 * We only come in here if we want to retry a
1398 * command. The test to see whether the command
1399 * should be retried should be keeping track of the
1400 * number of tries, so we don't end up looping, of
1401 * course.
1402 */
1403 SCpnt->state = NEEDS_RETRY;
1404 rtn = scsi_eh_retry_command(SCpnt);
1405 if (rtn != SUCCESS) {
1406 continue;
1407 }
1408 /*
1409 * We eventually hand this one back to the top level.
1410 */
1411 SCpnt->host->host_failed--;
1412 scsi_eh_finish_command(&SCdone, SCpnt);
1413 }
1414 }
1415
1416 /*
1417 * Go through the list of commands and figure out where we stand and how bad things
1418 * really are.
1419 */
1420 numfailed = 0;
1421 timed_out = 0;
1422 devices_failed = 0;
1423 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1424 unsigned int device_error = 0;
1425
1426 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1427 if (SCpnt->state == SCSI_STATE_FAILED) {
1428 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d failed\n",
1429 SCpnt->target));
1430 numfailed++;
1431 device_error++;
1432 }
1433 if (SCpnt->state == SCSI_STATE_TIMEOUT) {
1434 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d timedout\n",
1435 SCpnt->target));
1436 timed_out++;
1437 device_error++;
1438 }
1439 }
1440 if (device_error > 0) {
1441 devices_failed++;
1442 }
1443 }
1444
1445 SCSI_LOG_ERROR_RECOVERY(2, printk("Total of %d+%d commands on %d devices require eh work\n",
1446 numfailed, timed_out, devices_failed));
1447
1448 if (host->host_failed == 0) {
1449 ourrtn = TRUE;
1450 goto leave;
1451 }
1452 /*
1453 * Next, try and see whether or not it makes sense to try and abort
1454 * the running command. This only works out to be the case if we have
1455 * one command that has timed out. If the command simply failed, it
1456 * makes no sense to try and abort the command, since as far as the
1457 * host adapter is concerned, it isn't running.
1458 */
1459
1460 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try abort\n"));
1461
1462 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1463 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1464 if (SCloop->state != SCSI_STATE_TIMEOUT) {
1465 continue;
1466 }
1467 rtn = scsi_try_to_abort_command(SCloop, ABORT_TIMEOUT);
1468 if (rtn == SUCCESS) {
1469 rtn = scsi_test_unit_ready(SCloop);
1470
1471 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1472 rtn = scsi_eh_retry_command(SCloop);
1473
1474 if (rtn == SUCCESS) {
1475 SCloop->host->host_failed--;
1476 scsi_eh_finish_command(&SCdone, SCloop);
1477 }
1478 }
1479 }
1480 }
1481 }
1482
1483 /*
1484 * If we have corrected all of the problems, then we are done.
1485 */
1486 if (host->host_failed == 0) {
1487 ourrtn = TRUE;
1488 goto leave;
1489 }
1490 /*
1491 * Either the abort wasn't appropriate, or it didn't succeed.
1492 * Now try a bus device reset. Still, look to see whether we have
1493 * multiple devices that are jammed or not - if we have multiple devices,
1494 * it makes no sense to try BUS_DEVICE_RESET - we really would need
1495 * to try a BUS_RESET instead.
1496 *
1497 * Does this make sense - should we try BDR on each device individually?
1498 * Yes, definitely.
1499 */
1500 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));
1501
1502 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1503 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1504 if (SCloop->state == SCSI_STATE_FAILED
1505 || SCloop->state == SCSI_STATE_TIMEOUT) {
1506 break;
1507 }
1508 }
1509
1510 if (SCloop == NULL) {
1511 continue;
1512 }
1513 /*
1514 * OK, we have a device that is having problems. Try and send
1515 * a bus device reset to it.
1516 *
1517 * FIXME(eric) - make sure we handle the case where multiple
1518 * commands to the same device have failed. They all must
1519 * get properly restarted.
1520 */
1521 rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);
1522
1523 if (rtn == SUCCESS) {
1524 rtn = scsi_test_unit_ready(SCloop);
1525
1526 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1527 rtn = scsi_eh_retry_command(SCloop);
1528
1529 if (rtn == SUCCESS) {
1530 SCloop->host->host_failed--;
1531 scsi_eh_finish_command(&SCdone, SCloop);
1532 }
1533 }
1534 }
1535 }
1536
1537 if (host->host_failed == 0) {
1538 ourrtn = TRUE;
1539 goto leave;
1540 }
1541 /*
1542 * If we ended up here, we have serious problems. The only thing left
1543 * to try is a full bus reset. If someone has grabbed the bus and isn't
1544 * letting go, then perhaps this will help.
1545 */
1546 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n"));
1547
1548 /*
1549 * We really want to loop over the various channels, and do this on
1550 * a channel by channel basis. We should also check to see if any
1551 * of the failed commands are on soft_reset devices, and if so, skip
1552 * the reset.
1553 */
1554 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1555 next_device:
1556 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1557 if (SCpnt->state != SCSI_STATE_FAILED
1558 && SCpnt->state != SCSI_STATE_TIMEOUT) {
1559 continue;
1560 }
1561 /*
1562 * We have a failed command. Make sure there are no other failed
1563 * commands on the same channel that are timed out and implement a
1564 * soft reset.
1565 */
1566 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1567 for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1568 if (SCloop->channel != SCpnt->channel) {
1569 continue;
1570 }
1571 if (SCloop->state != SCSI_STATE_FAILED
1572 && SCloop->state != SCSI_STATE_TIMEOUT) {
1573 continue;
1574 }
1575 if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) {
1576 /*
1577 * If this device uses the soft reset option, and this
1578 * is one of the devices acting up, then our only
1579 * option is to wait a bit, since the command is
1580 * supposedly still running.
1581 *
1582 * FIXME(eric) - right now we will just end up falling
1583 * through to the 'take device offline' case.
1584 *
1585 * FIXME(eric) - It is possible that the command completed
1586 * *after* the error recovery procedure started, and if this
1587 * is the case, we are worrying about nothing here.
1588 */
1589
1590 scsi_sleep(1 * HZ);
1591 goto next_device;
1592 }
1593 }
1594 }
1595
1596 /*
1597 * We now know that we are able to perform a reset for the
1598 * bus that SCpnt points to. There are no soft-reset devices
1599 * with outstanding timed out commands.
1600 */
1601 rtn = scsi_try_bus_reset(SCpnt);
1602 if (rtn == SUCCESS) {
1603 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1604 for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1605 if (SCloop->channel != SCpnt->channel) {
1606 continue;
1607 }
1608 if (SCloop->state != SCSI_STATE_FAILED
1609 && SCloop->state != SCSI_STATE_TIMEOUT) {
1610 continue;
1611 }
1612 rtn = scsi_test_unit_ready(SCloop);
1613
1614 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1615 rtn = scsi_eh_retry_command(SCloop);
1616
1617 if (rtn == SUCCESS) {
1618 SCpnt->host->host_failed--;
1619 scsi_eh_finish_command(&SCdone, SCloop);
1620 }
1621 }
1622 /*
1623 * If the bus reset worked, but we are still unable to
1624 * talk to the device, take it offline.
1625 * FIXME(eric) - is this really the correct thing to do?
1626 */
1627 if (rtn != SUCCESS) {
1628 printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after bus reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1629
1630 SDloop->online = FALSE;
1631 SDloop->host->host_failed--;
1632 scsi_eh_finish_command(&SCdone, SCloop);
1633 }
1634 }
1635 }
1636 }
1637 }
1638 }
1639
1640 if (host->host_failed == 0) {
1641 ourrtn = TRUE;
1642 goto leave;
1643 }
1644 /*
1645 * If we ended up here, we have serious problems. The only thing left
1646 * to try is a full host reset - perhaps the firmware on the device
1647 * crashed, or something like that.
1648 *
1649 * It is assumed that a succesful host reset will cause *all* information
1650 * about the command to be flushed from both the host adapter *and* the
1651 * device.
1652 *
1653 * FIXME(eric) - it isn't clear that devices that implement the soft reset
1654 * option can ever be cleared except via cycling the power. The problem is
1655 * that sending the host reset command will cause the host to forget
1656 * about the pending command, but the device won't forget. For now, we
1657 * skip the host reset option if any of the failed devices are configured
1658 * to use the soft reset option.
1659 */
1660 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1661 next_device2:
1662 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1663 if (SCpnt->state != SCSI_STATE_FAILED
1664 && SCpnt->state != SCSI_STATE_TIMEOUT) {
1665 continue;
1666 }
1667 if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) {
1668 /*
1669 * If this device uses the soft reset option, and this
1670 * is one of the devices acting up, then our only
1671 * option is to wait a bit, since the command is
1672 * supposedly still running.
1673 *
1674 * FIXME(eric) - right now we will just end up falling
1675 * through to the 'take device offline' case.
1676 */
1677 SCSI_LOG_ERROR_RECOVERY(3,
1678 printk("scsi_unjam_host: Unable to try hard host reset\n"));
1679
1680 /*
1681 * Due to the spinlock, we will never get out of this
1682 * loop without a proper wait. (DB)
1683 */
1684 scsi_sleep(1 * HZ);
1685
1686 goto next_device2;
1687 }
1688 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));
1689
1690 /*
1691 * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
1692 */
1693 rtn = scsi_try_host_reset(SCpnt);
1694 if (rtn == SUCCESS) {
1695 /*
1696 * FIXME(eric) we assume that all commands are flushed from the
1697 * controller. We should get a DID_RESET for all of the commands
1698 * that were pending. We should ignore these so that we can
1699 * guarantee that we are in a consistent state.
1700 *
1701 * I believe this to be the case right now, but this needs to be
1702 * tested.
1703 */
1704 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1705 for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1706 if (SCloop->state != SCSI_STATE_FAILED
1707 && SCloop->state != SCSI_STATE_TIMEOUT) {
1708 continue;
1709 }
1710 rtn = scsi_test_unit_ready(SCloop);
1711
1712 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1713 rtn = scsi_eh_retry_command(SCloop);
1714
1715 if (rtn == SUCCESS) {
1716 SCpnt->host->host_failed--;
1717 scsi_eh_finish_command(&SCdone, SCloop);
1718 }
1719 }
1720 if (rtn != SUCCESS) {
1721 printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after host reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1722 SDloop->online = FALSE;
1723 SDloop->host->host_failed--;
1724 scsi_eh_finish_command(&SCdone, SCloop);
1725 }
1726 }
1727 }
1728 }
1729 }
1730 }
1731
1732 /*
1733 * If we solved all of the problems, then let's rev up the engines again.
1734 */
1735 if (host->host_failed == 0) {
1736 ourrtn = TRUE;
1737 goto leave;
1738 }
1739 /*
1740 * If the HOST RESET failed, then for now we assume that the entire host
1741 * adapter is too hosed to be of any use. For our purposes, however, it is
1742 * easier to simply take the devices offline that correspond to commands
1743 * that failed.
1744 */
1745 SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offline\n"));
1746
1747 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1748 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1749 if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) {
1750 SDloop = SCloop->device;
1751 if (SDloop->online == TRUE) {
1752 printk(KERN_INFO "scsi: device set offline - command error recover failed: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1753 SDloop->online = FALSE;
1754 }
1755
1756 /*
1757 * This should pass the failure up to the top level driver, and
1758 * it will have to try and do something intelligent with it.
1759 */
1760 SCloop->host->host_failed--;
1761
1762 if (SCloop->state == SCSI_STATE_TIMEOUT) {
1763 SCloop->result |= (DRIVER_TIMEOUT << 24);
1764 }
1765 SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %x\n",
1766 SDloop->id, SCloop->result));
1767
1768 scsi_eh_finish_command(&SCdone, SCloop);
1769 }
1770 }
1771 }
1772
1773 if (host->host_failed != 0) {
1774 panic("scsi_unjam_host: Miscount of number of failed commands.\n");
1775 }
1776 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returning\n"));
1777
1778 ourrtn = FALSE;
1779
1780 leave:
1781
1782 /*
1783 * We should have a list of commands that we 'finished' during the course of
1784 * error recovery. This should be the same as the list of commands that timed out
1785 * or failed. We are currently holding these things in a linked list - we didn't
1786 * put them in the bottom half queue because we wanted to keep things quiet while
1787 * we were working on recovery, and passing them up to the top level could easily
1788 * cause the top level to try and queue something else again.
1789 *
1790 * Start by marking that the host is no longer in error recovery.
1791 */
1792 host->in_recovery = 0;
1793
1794 /*
1795 * Take the list of commands, and stick them in the bottom half queue.
1796 * The current implementation of scsi_done will do this for us - if need
1797 * be we can create a special version of this function to do the
1798 * same job for us.
1799 */
1800 for (SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) {
1801 SCdone = SCpnt->bh_next;
1802 SCpnt->bh_next = NULL;
1803 /*
1804 * Oh, this is a vile hack. scsi_done() expects a timer
1805 * to be running on the command. If there isn't, it assumes
1806 * that the command has actually timed out, and a timer
1807 * handler is running. That may well be how we got into
1808 * this fix, but right now things are stable. We add
1809 * a timer back again so that we can report completion.
1810 * scsi_done() will immediately remove said timer from
1811 * the command, and then process it.
1812 */
1813 scsi_add_timer(SCpnt, 100, scsi_eh_times_out);
1814 scsi_done(SCpnt);
1815 }
1816
1817 return (ourrtn);
1818 }
1819
1820
1821 /*
1822 * Function: scsi_error_handler
1823 *
1824 * Purpose: Handle errors/timeouts of scsi commands, try and clean up
1825 * and unjam the bus, and restart things.
1826 *
1827 * Arguments: host - host for which we are running.
1828 *
1829 * Returns: Never returns.
1830 *
1831 * Notes: This is always run in the context of a kernel thread. The
1832 * idea is that we start this thing up when the kernel starts
1833 * up (one per host that we detect), and it immediately goes to
1834 * sleep and waits for some event (i.e. failure). When this
1835 * takes place, we have the job of trying to unjam the bus
1836 * and restarting things.
1837 *
1838 */
1839 void scsi_error_handler(void *data)
1840 {
1841 struct Scsi_Host *host = (struct Scsi_Host *) data;
1842 int rtn;
1843 DECLARE_MUTEX_LOCKED(sem);
1844
1845 /*
1846 * We only listen to signals if the HA was loaded as a module.
1847 * If the HA was compiled into the kernel, then we don't listen
1848 * to any signals.
1849 */
1850 if( host->loaded_as_module ) {
1851 siginitsetinv(¤t->blocked, SHUTDOWN_SIGS);
1852 } else {
1853 siginitsetinv(¤t->blocked, 0);
1854 }
1855
1856 lock_kernel();
1857
1858 /*
1859 * Flush resources
1860 */
1861
1862 daemonize();
1863
1864 /*
1865 * Set the name of this process.
1866 */
1867
1868 sprintf(current->comm, "scsi_eh_%d", host->host_no);
1869
1870 host->eh_wait = &sem;
1871 host->ehandler = current;
1872
1873 unlock_kernel();
1874
1875 /*
1876 * Wake up the thread that created us.
1877 */
1878 SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %d\n", host->eh_notify->count.counter));
1879
1880 up(host->eh_notify);
1881
1882 while (1) {
1883 /*
1884 * If we get a signal, it means we are supposed to go
1885 * away and die. This typically happens if the user is
1886 * trying to unload a module.
1887 */
1888 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n"));
1889
1890 /*
1891 * Note - we always use down_interruptible with the semaphore
1892 * even if the module was loaded as part of the kernel. The
1893 * reason is that down() will cause this thread to be counted
1894 * in the load average as a running process, and down
1895 * interruptible doesn't. Given that we need to allow this
1896 * thread to die if the driver was loaded as a module, using
1897 * semaphores isn't unreasonable.
1898 */
1899 down_interruptible(&sem);
1900 if( host->loaded_as_module ) {
1901 if (signal_pending(current))
1902 break;
1903 }
1904
1905 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n"));
1906
1907 host->eh_active = 1;
1908
1909 /*
1910 * We have a host that is failing for some reason. Figure out
1911 * what we need to do to get it up and online again (if we can).
1912 * If we fail, we end up taking the thing offline.
1913 */
1914 if (host->hostt->eh_strategy_handler != NULL) {
1915 rtn = host->hostt->eh_strategy_handler(host);
1916 } else {
1917 rtn = scsi_unjam_host(host);
1918 }
1919
1920 host->eh_active = 0;
1921
1922 /*
1923 * Note - if the above fails completely, the action is to take
1924 * individual devices offline and flush the queue of any
1925 * outstanding requests that may have been pending. When we
1926 * restart, we restart any I/O to any other devices on the bus
1927 * which are still online.
1928 */
1929 scsi_restart_operations(host);
1930
1931 }
1932
1933 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n"));
1934
1935 /*
1936 * Make sure that nobody tries to wake us up again.
1937 */
1938 host->eh_wait = NULL;
1939
1940 /*
1941 * Knock this down too. From this point on, the host is flying
1942 * without a pilot. If this is because the module is being unloaded,
1943 * that's fine. If the user sent a signal to this thing, we are
1944 * potentially in real danger.
1945 */
1946 host->in_recovery = 0;
1947 host->eh_active = 0;
1948 host->ehandler = NULL;
1949
1950 /*
1951 * If anyone is waiting for us to exit (i.e. someone trying to unload
1952 * a driver), then wake up that process to let them know we are on
1953 * the way out the door. This may be overkill - I *think* that we
1954 * could probably just unload the driver and send the signal, and when
1955 * the error handling thread wakes up that it would just exit without
1956 * needing to touch any memory associated with the driver itself.
1957 */
1958 if (host->eh_notify != NULL)
1959 up(host->eh_notify);
1960 }
1961
1962 /*
1963 * Overrides for Emacs so that we follow Linus's tabbing style.
1964 * Emacs will notice this stuff at the end of the file and automatically
1965 * adjust the settings for this buffer only. This must remain at the end
1966 * of the file.
1967 * ---------------------------------------------------------------------------
1968 * Local variables:
1969 * c-indent-level: 4
1970 * c-brace-imaginary-offset: 0
1971 * c-brace-offset: -4
1972 * c-argdecl-indent: 4
1973 * c-label-offset: -4
1974 * c-continued-statement-offset: 4
1975 * c-continued-brace-offset: 0
1976 * indent-tabs-mode: nil
1977 * tab-width: 8
1978 * End:
1979 */
1980