File: /usr/src/linux/drivers/md/raid1.c
1 /*
2 * raid1.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * RAID-1 management functions.
9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 *
12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option)
18 * any later version.
19 *
20 * You should have received a copy of the GNU General Public License
21 * (for example /usr/src/linux/COPYING); if not, write to the Free
22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25 #include <linux/module.h>
26 #include <linux/slab.h>
27 #include <linux/raid/raid1.h>
28 #include <asm/atomic.h>
29
30 #define MAJOR_NR MD_MAJOR
31 #define MD_DRIVER
32 #define MD_PERSONALITY
33
34 #define MAX_WORK_PER_DISK 128
35
36 #define NR_RESERVED_BUFS 32
37
38
39 /*
40 * The following can be used to debug the driver
41 */
42 #define RAID1_DEBUG 0
43
44 #if RAID1_DEBUG
45 #define PRINTK(x...) printk(x)
46 #define inline
47 #define __inline__
48 #else
49 #define PRINTK(x...) do { } while (0)
50 #endif
51
52
/* Personality descriptor for this driver (only declared in this part of the file). */
53 static mdk_personality_t raid1_personality;
/*
 * retry_list_lock is held around every manipulation of the retry queue
 * below (see raid1_reschedule_retry() and raid1d()).
 */
54 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
/*
 * Singly linked queue (through ->next_r1) of requests waiting for raid1d.
 * raid1_retry_tail points at the link to fill in on the next append; it is
 * re-initialised by raid1_reschedule_retry() whenever the queue is empty.
 */
55 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
56
57 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
58 {
59 /* return a linked list of "cnt" struct buffer_heads.
60 * don't take any off the free list unless we know we can
61 * get all we need, otherwise we could deadlock
62 */
63 struct buffer_head *bh=NULL;
64
65 while(cnt) {
66 struct buffer_head *t;
67 md_spin_lock_irq(&conf->device_lock);
68 if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
69 while (cnt) {
70 t = conf->freebh;
71 conf->freebh = t->b_next;
72 t->b_next = bh;
73 bh = t;
74 t->b_state = 0;
75 conf->freebh_cnt--;
76 cnt--;
77 }
78 md_spin_unlock_irq(&conf->device_lock);
79 if (cnt == 0)
80 break;
81 t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
82 if (t) {
83 t->b_next = bh;
84 bh = t;
85 cnt--;
86 } else {
87 PRINTK("raid1: waiting for %d bh\n", cnt);
88 conf->freebh_blocked = 1;
89 wait_disk_event(conf->wait_buffer,
90 !conf->freebh_blocked ||
91 conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
92 conf->freebh_blocked = 0;
93 }
94 }
95 return bh;
96 }
97
98 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
99 {
100 unsigned long flags;
101 spin_lock_irqsave(&conf->device_lock, flags);
102 while (bh) {
103 struct buffer_head *t = bh;
104 bh=bh->b_next;
105 if (t->b_pprev == NULL)
106 kmem_cache_free(bh_cachep, t);
107 else {
108 t->b_next= conf->freebh;
109 conf->freebh = t;
110 conf->freebh_cnt++;
111 }
112 }
113 spin_unlock_irqrestore(&conf->device_lock, flags);
114 wake_up(&conf->wait_buffer);
115 }
116
117 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
118 {
119 /* allocate cnt buffer_heads, possibly less if kmalloc fails */
120 int i = 0;
121
122 while (i < cnt) {
123 struct buffer_head *bh;
124 bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
125 if (!bh) break;
126
127 md_spin_lock_irq(&conf->device_lock);
128 bh->b_pprev = &conf->freebh;
129 bh->b_next = conf->freebh;
130 conf->freebh = bh;
131 conf->freebh_cnt++;
132 md_spin_unlock_irq(&conf->device_lock);
133
134 i++;
135 }
136 return i;
137 }
138
139 static void raid1_shrink_bh(raid1_conf_t *conf)
140 {
141 /* discard all buffer_heads */
142
143 md_spin_lock_irq(&conf->device_lock);
144 while (conf->freebh) {
145 struct buffer_head *bh = conf->freebh;
146 conf->freebh = bh->b_next;
147 kmem_cache_free(bh_cachep, bh);
148 conf->freebh_cnt--;
149 }
150 md_spin_unlock_irq(&conf->device_lock);
151 }
152
153
154 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
155 {
156 struct raid1_bh *r1_bh = NULL;
157
158 do {
159 md_spin_lock_irq(&conf->device_lock);
160 if (!conf->freer1_blocked && conf->freer1) {
161 r1_bh = conf->freer1;
162 conf->freer1 = r1_bh->next_r1;
163 conf->freer1_cnt--;
164 r1_bh->next_r1 = NULL;
165 r1_bh->state = (1 << R1BH_PreAlloc);
166 r1_bh->bh_req.b_state = 0;
167 }
168 md_spin_unlock_irq(&conf->device_lock);
169 if (r1_bh)
170 return r1_bh;
171 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
172 if (r1_bh) {
173 memset(r1_bh, 0, sizeof(*r1_bh));
174 return r1_bh;
175 }
176 conf->freer1_blocked = 1;
177 wait_disk_event(conf->wait_buffer,
178 !conf->freer1_blocked ||
179 conf->freer1_cnt > NR_RESERVED_BUFS/2
180 );
181 conf->freer1_blocked = 0;
182 } while (1);
183 }
184
185 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
186 {
187 struct buffer_head *bh = r1_bh->mirror_bh_list;
188 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
189
190 r1_bh->mirror_bh_list = NULL;
191
192 if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
193 unsigned long flags;
194 spin_lock_irqsave(&conf->device_lock, flags);
195 r1_bh->next_r1 = conf->freer1;
196 conf->freer1 = r1_bh;
197 conf->freer1_cnt++;
198 spin_unlock_irqrestore(&conf->device_lock, flags);
199 /* don't need to wakeup wait_buffer because
200 * raid1_free_bh below will do that
201 */
202 } else {
203 kfree(r1_bh);
204 }
205 raid1_free_bh(conf, bh);
206 }
207
208 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
209 {
210 int i = 0;
211
212 while (i < cnt) {
213 struct raid1_bh *r1_bh;
214 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
215 if (!r1_bh)
216 break;
217 memset(r1_bh, 0, sizeof(*r1_bh));
218 set_bit(R1BH_PreAlloc, &r1_bh->state);
219 r1_bh->mddev = conf->mddev;
220
221 raid1_free_r1bh(r1_bh);
222 i++;
223 }
224 return i;
225 }
226
227 static void raid1_shrink_r1bh(raid1_conf_t *conf)
228 {
229 md_spin_lock_irq(&conf->device_lock);
230 while (conf->freer1) {
231 struct raid1_bh *r1_bh = conf->freer1;
232 conf->freer1 = r1_bh->next_r1;
233 conf->freer1_cnt--;
234 kfree(r1_bh);
235 }
236 md_spin_unlock_irq(&conf->device_lock);
237 }
238
239
240
241 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
242 {
243 unsigned long flags;
244 struct buffer_head *bh = r1_bh->mirror_bh_list;
245 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
246 r1_bh->mirror_bh_list = NULL;
247
248 spin_lock_irqsave(&conf->device_lock, flags);
249 r1_bh->next_r1 = conf->freebuf;
250 conf->freebuf = r1_bh;
251 spin_unlock_irqrestore(&conf->device_lock, flags);
252 raid1_free_bh(conf, bh);
253 }
254
255 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
256 {
257 struct raid1_bh *r1_bh;
258
259 md_spin_lock_irq(&conf->device_lock);
260 wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
261 r1_bh = conf->freebuf;
262 conf->freebuf = r1_bh->next_r1;
263 r1_bh->next_r1= NULL;
264 md_spin_unlock_irq(&conf->device_lock);
265
266 return r1_bh;
267 }
268
269 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
270 {
271 int i = 0;
272
273 md_spin_lock_irq(&conf->device_lock);
274 while (i < cnt) {
275 struct raid1_bh *r1_bh;
276 struct page *page;
277
278 page = alloc_page(GFP_KERNEL);
279 if (!page)
280 break;
281
282 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
283 if (!r1_bh) {
284 __free_page(page);
285 break;
286 }
287 memset(r1_bh, 0, sizeof(*r1_bh));
288 r1_bh->bh_req.b_page = page;
289 r1_bh->bh_req.b_data = page_address(page);
290 r1_bh->next_r1 = conf->freebuf;
291 conf->freebuf = r1_bh;
292 i++;
293 }
294 md_spin_unlock_irq(&conf->device_lock);
295 return i;
296 }
297
298 static void raid1_shrink_buffers (raid1_conf_t *conf)
299 {
300 md_spin_lock_irq(&conf->device_lock);
301 while (conf->freebuf) {
302 struct raid1_bh *r1_bh = conf->freebuf;
303 conf->freebuf = r1_bh->next_r1;
304 __free_page(r1_bh->bh_req.b_page);
305 kfree(r1_bh);
306 }
307 md_spin_unlock_irq(&conf->device_lock);
308 }
309
310 static int raid1_map (mddev_t *mddev, kdev_t *rdev)
311 {
312 raid1_conf_t *conf = mddev_to_conf(mddev);
313 int i, disks = MD_SB_DISKS;
314
315 /*
316 * Later we do read balancing on the read side
317 * now we use the first available disk.
318 */
319
320 for (i = 0; i < disks; i++) {
321 if (conf->mirrors[i].operational) {
322 *rdev = conf->mirrors[i].dev;
323 return (0);
324 }
325 }
326
327 printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
328 return (-1);
329 }
330
331 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
332 {
333 unsigned long flags;
334 mddev_t *mddev = r1_bh->mddev;
335 raid1_conf_t *conf = mddev_to_conf(mddev);
336
337 md_spin_lock_irqsave(&retry_list_lock, flags);
338 if (raid1_retry_list == NULL)
339 raid1_retry_tail = &raid1_retry_list;
340 *raid1_retry_tail = r1_bh;
341 raid1_retry_tail = &r1_bh->next_r1;
342 r1_bh->next_r1 = NULL;
343 md_spin_unlock_irqrestore(&retry_list_lock, flags);
344 md_wakeup_thread(conf->thread);
345 }
346
347
348 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
349 {
350 unsigned long flags;
351 spin_lock_irqsave(&conf->segment_lock, flags);
352 if (sector < conf->start_active)
353 conf->cnt_done--;
354 else if (sector >= conf->start_future && conf->phase == phase)
355 conf->cnt_future--;
356 else if (!--conf->cnt_pending)
357 wake_up(&conf->wait_ready);
358
359 spin_unlock_irqrestore(&conf->segment_lock, flags);
360 }
361
362 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
363 {
364 unsigned long flags;
365 spin_lock_irqsave(&conf->segment_lock, flags);
366 if (sector >= conf->start_ready)
367 --conf->cnt_ready;
368 else if (sector >= conf->start_active) {
369 if (!--conf->cnt_active) {
370 conf->start_active = conf->start_ready;
371 wake_up(&conf->wait_done);
372 }
373 }
374 spin_unlock_irqrestore(&conf->segment_lock, flags);
375 }
376
377 /*
378 * raid1_end_bh_io() is called when we have finished servicing a mirrored
379 * operation and are ready to return a success/failure code to the buffer
380 * cache layer.
381 */
382 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
383 {
384 struct buffer_head *bh = r1_bh->master_bh;
385
386 io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
387 test_bit(R1BH_SyncPhase, &r1_bh->state));
388
389 bh->b_end_io(bh, uptodate);
390 raid1_free_r1bh(r1_bh);
391 }
392 void raid1_end_request (struct buffer_head *bh, int uptodate)
393 {
394 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
395
396 /*
397 * this branch is our 'one mirror IO has finished' event handler:
398 */
399 if (!uptodate)
400 md_error (r1_bh->mddev, bh->b_dev);
401 else
402 /*
403 * Set R1BH_Uptodate in our master buffer_head, so that
404 * we will return a good error code for to the higher
405 * levels even if IO on some other mirrored buffer fails.
406 *
407 * The 'master' represents the complex operation to
408 * user-side. So if something waits for IO, then it will
409 * wait for the 'master' buffer_head.
410 */
411 set_bit (R1BH_Uptodate, &r1_bh->state);
412
413 /*
414 * We split up the read and write side, imho they are
415 * conceptually different.
416 */
417
418 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
419 /*
420 * we have only one buffer_head on the read side
421 */
422
423 if (uptodate) {
424 raid1_end_bh_io(r1_bh, uptodate);
425 return;
426 }
427 /*
428 * oops, read error:
429 */
430 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
431 partition_name(bh->b_dev), bh->b_blocknr);
432 raid1_reschedule_retry(r1_bh);
433 return;
434 }
435
436 /*
437 * WRITE:
438 *
439 * Let's see if all mirrored write operations have finished
440 * already.
441 */
442
443 if (atomic_dec_and_test(&r1_bh->remaining))
444 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
445 }
446
447 /*
448 * This routine returns the disk from which the requested read should
449 * be done. It bookkeeps the last read position for every disk
450 * in array and when new read requests come, the disk which last
451 * position is nearest to the request, is chosen.
452 *
453 * TODO: now if there are 2 mirrors in the same 2 devices, performance
454 * degrades dramatically because position is mirror, not device based.
455 * This should be changed to be device based. Also atomic sequential
456 * reads should be somehow balanced.
457 */
458
459 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
460 {
461 int new_disk = conf->last_used;
462 const int sectors = bh->b_size >> 9;
463 const unsigned long this_sector = bh->b_rsector;
464 int disk = new_disk;
465 unsigned long new_distance;
466 unsigned long current_distance;
467
468 /*
469 * Check if it is sane at all to balance
470 */
471
472 if (conf->resync_mirrors)
473 goto rb_out;
474
475
476 /* make sure that disk is operational */
477 while( !conf->mirrors[new_disk].operational) {
478 if (new_disk <= 0) new_disk = conf->raid_disks;
479 new_disk--;
480 if (new_disk == disk) {
481 /*
482 * This means no working disk was found
483 * Nothing much to do, lets not change anything
484 * and hope for the best...
485 */
486
487 new_disk = conf->last_used;
488
489 goto rb_out;
490 }
491 }
492 disk = new_disk;
493 /* now disk == new_disk == starting point for search */
494
495 /*
496 * Don't touch anything for sequential reads.
497 */
498
499 if (this_sector == conf->mirrors[new_disk].head_position)
500 goto rb_out;
501
502 /*
503 * If reads have been done only on a single disk
504 * for a time, lets give another disk a change.
505 * This is for kicking those idling disks so that
506 * they would find work near some hotspot.
507 */
508
509 if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
510 conf->sect_count = 0;
511
512 do {
513 if (new_disk<=0)
514 new_disk = conf->raid_disks;
515 new_disk--;
516 if (new_disk == disk)
517 break;
518 } while ((conf->mirrors[new_disk].write_only) ||
519 (!conf->mirrors[new_disk].operational));
520
521 goto rb_out;
522 }
523
524 current_distance = abs(this_sector -
525 conf->mirrors[disk].head_position);
526
527 /* Find the disk which is closest */
528
529 do {
530 if (disk <= 0)
531 disk = conf->raid_disks;
532 disk--;
533
534 if ((conf->mirrors[disk].write_only) ||
535 (!conf->mirrors[disk].operational))
536 continue;
537
538 new_distance = abs(this_sector -
539 conf->mirrors[disk].head_position);
540
541 if (new_distance < current_distance) {
542 conf->sect_count = 0;
543 current_distance = new_distance;
544 new_disk = disk;
545 }
546 } while (disk != conf->last_used);
547
548 rb_out:
549 conf->mirrors[new_disk].head_position = this_sector + sectors;
550
551 conf->last_used = new_disk;
552 conf->sect_count += sectors;
553
554 return new_disk;
555 }
556
/*
 * raid1_make_request - issue one read or write against the mirror set.
 *
 * mddev: the array; rw: READ, READA or WRITE; bh: the caller's (locked)
 * buffer_head, which becomes the "master" of the operation.
 *
 * READA is downgraded to READ.  A raid1_bh is allocated, then the request
 * waits on conf->wait_done until its sector is either below start_active
 * or at/after start_future, and is counted in cnt_done or cnt_future
 * accordingly (R1BH_SyncPhase is recorded when conf->phase is set).
 *
 * Reads are redirected to the mirror chosen by raid1_read_balance() using
 * the embedded bh_req.  Writes are cloned once per operational mirror and
 * complete through raid1_end_request().
 *
 * Always returns 0; errors are reported through the master bh's b_end_io.
 */
557 static int raid1_make_request (mddev_t *mddev, int rw,
558 struct buffer_head * bh)
559 {
560 raid1_conf_t *conf = mddev_to_conf(mddev);
561 struct buffer_head *bh_req, *bhl;
562 struct raid1_bh * r1_bh;
563 int disks = MD_SB_DISKS;
564 int i, sum_bhs = 0;
565 struct mirror_info *mirror;
566
/* the caller must hand us a locked buffer */
567 if (!buffer_locked(bh))
568 BUG();
569
570 /*
571 * make_request() can abort the operation when READA is being
572 * used and no empty request is available.
573 *
574 * Currently, just replace the command with READ/WRITE.
575 */
576 if (rw == READA)
577 rw = READ;
578
579 r1_bh = raid1_alloc_r1bh (conf);
580
/*
 * Wait until this sector lies outside [start_active, start_future), then
 * account for the request under segment_lock.
 */
581 spin_lock_irq(&conf->segment_lock);
582 wait_event_lock_irq(conf->wait_done,
583 bh->b_rsector < conf->start_active ||
584 bh->b_rsector >= conf->start_future,
585 conf->segment_lock);
586 if (bh->b_rsector < conf->start_active)
587 conf->cnt_done++;
588 else {
589 conf->cnt_future++;
590 if (conf->phase)
591 set_bit(R1BH_SyncPhase, &r1_bh->state);
592 }
593 spin_unlock_irq(&conf->segment_lock);
594
595 /*
596 * i think the read and write branch should be separated completely,
597 * since we want to do read balancing on the read side for example.
598 * Alternative implementations? :) --mingo
599 */
600
601 r1_bh->master_bh = bh;
602 r1_bh->mddev = mddev;
603 r1_bh->cmd = rw;
604
605 if (rw == READ) {
606 /*
607 * read balancing logic:
608 */
609 mirror = conf->mirrors + raid1_read_balance(conf, bh);
610
/* clone the master bh into the embedded request and retarget it */
611 bh_req = &r1_bh->bh_req;
612 memcpy(bh_req, bh, sizeof(*bh));
613 bh_req->b_blocknr = bh->b_rsector;
614 bh_req->b_dev = mirror->dev;
615 bh_req->b_rdev = mirror->dev;
616 /* bh_req->b_rsector = bh->n_rsector; */
617 bh_req->b_end_io = raid1_end_request;
618 bh_req->b_private = r1_bh;
619 generic_make_request (rw, bh_req);
620 return 0;
621 }
622
623 /*
624 * WRITE:
625 */
626
/*
 * Only conf->raid_disks heads are allocated although the loop below scans
 * all MD_SB_DISKS slots; running out of heads trips MD_BUG().
 */
627 bhl = raid1_alloc_bh(conf, conf->raid_disks);
628 for (i = 0; i < disks; i++) {
629 struct buffer_head *mbh;
630 if (!conf->mirrors[i].operational)
631 continue;
632
633 /*
634 * We should use a private pool (size depending on NR_REQUEST),
635 * to avoid writes filling up the memory with bhs
636 *
637 * Such pools are much faster than kmalloc anyways (so we waste
638 * almost nothing by not using the master bh when writing and
639 * win alot of cleanness) but for now we are cool enough. --mingo
640 *
641 * It's safe to sleep here, buffer heads cannot be used in a shared
642 * manner in the write branch. Look how we lock the buffer at the
643 * beginning of this function to grok the difference ;)
644 */
645 mbh = bhl;
646 if (mbh == NULL) {
647 MD_BUG();
648 break;
649 }
650 bhl = mbh->b_next;
651 mbh->b_next = NULL;
652 mbh->b_this_page = (struct buffer_head *)1;
653
654 /*
655 * prepare mirrored mbh (fields ordered for max mem throughput):
656 */
657 mbh->b_blocknr = bh->b_rsector;
658 mbh->b_dev = conf->mirrors[i].dev;
659 mbh->b_rdev = conf->mirrors[i].dev;
660 mbh->b_rsector = bh->b_rsector;
661 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
662 (1<<BH_Mapped) | (1<<BH_Lock);
663
664 atomic_set(&mbh->b_count, 1);
665 mbh->b_size = bh->b_size;
666 mbh->b_page = bh->b_page;
667 mbh->b_data = bh->b_data;
668 mbh->b_list = BUF_LOCKED;
669 mbh->b_end_io = raid1_end_request;
670 mbh->b_private = r1_bh;
671
/* push onto the per-request list of mirror writes */
672 mbh->b_next = r1_bh->mirror_bh_list;
673 r1_bh->mirror_bh_list = mbh;
674 sum_bhs++;
675 }
/* give back any heads that were not needed */
676 if (bhl) raid1_free_bh(conf,bhl);
677 if (!sum_bhs) {
678 /* Gag - all mirrors non-operational.. */
679 raid1_end_bh_io(r1_bh, 0);
680 return 0;
681 }
/* must be set before any write is submitted: completions decrement it */
682 md_atomic_set(&r1_bh->remaining, sum_bhs);
683
684 /*
685 * We have to be a bit careful about the semaphore above, thats
686 * why we start the requests separately. Since kmalloc() could
687 * fail, sleep and make_request() can sleep too, this is the
688 * safer solution. Imagine, end_request decreasing the semaphore
689 * before we could have set it up ... We could play tricks with
690 * the semaphore (presetting it and correcting at the end if
691 * sum_bhs is not 'n' but we have to do end_request by hand if
692 * all requests finish until we had a chance to set up the
693 * semaphore correctly ... lots of races).
694 */
/* "bh" is reused as a cursor; b_next is read before each submission */
695 bh = r1_bh->mirror_bh_list;
696 while(bh) {
697 struct buffer_head *bh2 = bh;
698 bh = bh->b_next;
699 generic_make_request(rw, bh2);
700 }
701 return (0);
702 }
703
704 static int raid1_status (char *page, mddev_t *mddev)
705 {
706 raid1_conf_t *conf = mddev_to_conf(mddev);
707 int sz = 0, i;
708
709 sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
710 conf->working_disks);
711 for (i = 0; i < conf->raid_disks; i++)
712 sz += sprintf (page+sz, "%s",
713 conf->mirrors[i].operational ? "U" : "_");
714 sz += sprintf (page+sz, "]");
715 return sz;
716 }
717
718 #define LAST_DISK KERN_ALERT \
719 "raid1: only one disk left and IO error.\n"
720
721 #define NO_SPARE_DISK KERN_ALERT \
722 "raid1: no spare disk left, degrading mirror level by one.\n"
723
724 #define DISK_FAILED KERN_ALERT \
725 "raid1: Disk failure on %s, disabling device. \n" \
726 " Operation continuing on %d devices\n"
727
728 #define START_SYNCING KERN_ALERT \
729 "raid1: start syncing spare disk.\n"
730
731 #define ALREADY_SYNCING KERN_INFO \
732 "raid1: syncing already in progress.\n"
733
734 static void mark_disk_bad (mddev_t *mddev, int failed)
735 {
736 raid1_conf_t *conf = mddev_to_conf(mddev);
737 struct mirror_info *mirror = conf->mirrors+failed;
738 mdp_super_t *sb = mddev->sb;
739
740 mirror->operational = 0;
741 mark_disk_faulty(sb->disks+mirror->number);
742 mark_disk_nonsync(sb->disks+mirror->number);
743 mark_disk_inactive(sb->disks+mirror->number);
744 if (!mirror->write_only)
745 sb->active_disks--;
746 sb->working_disks--;
747 sb->failed_disks++;
748 mddev->sb_dirty = 1;
749 md_wakeup_thread(conf->thread);
750 if (!mirror->write_only)
751 conf->working_disks--;
752 printk (DISK_FAILED, partition_name (mirror->dev),
753 conf->working_disks);
754 }
755
756 static int raid1_error (mddev_t *mddev, kdev_t dev)
757 {
758 raid1_conf_t *conf = mddev_to_conf(mddev);
759 struct mirror_info * mirrors = conf->mirrors;
760 int disks = MD_SB_DISKS;
761 int i;
762
763 /* Find the drive.
764 * If it is not operational, then we have already marked it as dead
765 * else if it is the last working disks, ignore the error, let the
766 * next level up know.
767 * else mark the drive as failed
768 */
769
770 for (i = 0; i < disks; i++)
771 if (mirrors[i].dev==dev && mirrors[i].operational)
772 break;
773 if (i == disks)
774 return 0;
775
776 if (i < conf->raid_disks && conf->working_disks == 1) {
777 /* Don't fail the drive, act as though we were just a
778 * normal single drive
779 */
780
781 return 1;
782 }
783 mark_disk_bad(mddev, i);
784 return 0;
785 }
786
787 #undef LAST_DISK
788 #undef NO_SPARE_DISK
789 #undef DISK_FAILED
790 #undef START_SYNCING
791
792
793 static void print_raid1_conf (raid1_conf_t *conf)
794 {
795 int i;
796 struct mirror_info *tmp;
797
798 printk("RAID1 conf printout:\n");
799 if (!conf) {
800 printk("(conf==NULL)\n");
801 return;
802 }
803 printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
804 conf->raid_disks, conf->nr_disks);
805
806 for (i = 0; i < MD_SB_DISKS; i++) {
807 tmp = conf->mirrors + i;
808 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
809 i, tmp->spare,tmp->operational,
810 tmp->number,tmp->raid_disk,tmp->used_slot,
811 partition_name(tmp->dev));
812 }
813 }
814
815 static void close_sync(raid1_conf_t *conf)
816 {
817 mddev_t *mddev = conf->mddev;
818 /* If reconstruction was interrupted, we need to close the "active" and "pending"
819 * holes.
820 * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
821 */
822 /* this is really needed when recovery stops too... */
823 spin_lock_irq(&conf->segment_lock);
824 conf->start_active = conf->start_pending;
825 conf->start_ready = conf->start_pending;
826 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
827 conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
828 conf->start_future = mddev->sb->size+1;
829 conf->cnt_pending = conf->cnt_future;
830 conf->cnt_future = 0;
831 conf->phase = conf->phase ^1;
832 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
833 conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
834 conf->phase = 0;
835 conf->cnt_future = conf->cnt_done;;
836 conf->cnt_done = 0;
837 spin_unlock_irq(&conf->segment_lock);
838 wake_up(&conf->wait_done);
839 }
840
/*
 * raid1_diskop - perform a hot disk operation on the array.
 *
 * mddev: the array.
 * d:     in: descriptor of the disk the operation refers to;
 *        out (DISKOP_SPARE_ACTIVE only): redirected to the descriptor that
 *        now describes the replaced disk.
 * state: one of DISKOP_SPARE_ACTIVE, DISKOP_SPARE_WRITE,
 *        DISKOP_SPARE_INACTIVE, DISKOP_HOT_REMOVE_DISK, DISKOP_HOT_ADD_DISK.
 *
 * The function works in two passes under conf->device_lock: the first
 * switch locates the mirror slot(s) involved, the second one applies the
 * change.  The configuration is printed before and after.
 *
 * Returns 0 on success, 1 on an inconsistency (after MD_BUG()), or -EBUSY
 * when asked to hot-remove a disk that is still operational.
 *
 * NOTE(review): close_sync() is called below while conf->device_lock is
 * held with interrupts disabled, and close_sync() waits via
 * wait_event_lock_irq() on a different lock - presumably that can sleep;
 * confirm that this is safe.
 */
841 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
842 {
843 int err = 0;
844 int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
845 raid1_conf_t *conf = mddev->private;
846 struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
847 mdp_super_t *sb = mddev->sb;
848 mdp_disk_t *failed_desc, *spare_desc, *added_desc;
849 mdk_rdev_t *spare_rdev, *failed_rdev;
850
851 print_raid1_conf(conf);
852 md_spin_lock_irq(&conf->device_lock);
853 /*
854 * find the disk ...
855 */
856 switch (state) {
857
858 case DISKOP_SPARE_ACTIVE:
859
860 /*
861 * Find the failed disk within the RAID1 configuration ...
862 * (this can only be in the first conf->working_disks part)
863 */
864 for (i = 0; i < conf->raid_disks; i++) {
865 tmp = conf->mirrors + i;
866 if ((!tmp->operational && !tmp->spare) ||
867 !tmp->used_slot) {
868 failed_disk = i;
869 break;
870 }
871 }
872 /*
873 * When we activate a spare disk we _must_ have a disk in
874 * the lower (active) part of the array to replace.
875 */
876 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
877 MD_BUG();
878 err = 1;
879 goto abort;
880 }
881 /* fall through */
882
883 case DISKOP_SPARE_WRITE:
884 case DISKOP_SPARE_INACTIVE:
885
886 /*
887 * Find the spare disk ... (can only be in the 'high'
888 * area of the array)
889 */
890 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
891 tmp = conf->mirrors + i;
892 if (tmp->spare && tmp->number == (*d)->number) {
893 spare_disk = i;
894 break;
895 }
896 }
897 if (spare_disk == -1) {
898 MD_BUG();
899 err = 1;
900 goto abort;
901 }
902 break;
903
904 case DISKOP_HOT_REMOVE_DISK:
905
/* find the used slot carrying this disk number; refuse if still in use */
906 for (i = 0; i < MD_SB_DISKS; i++) {
907 tmp = conf->mirrors + i;
908 if (tmp->used_slot && (tmp->number == (*d)->number)) {
909 if (tmp->operational) {
910 err = -EBUSY;
911 goto abort;
912 }
913 removed_disk = i;
914 break;
915 }
916 }
917 if (removed_disk == -1) {
918 MD_BUG();
919 err = 1;
920 goto abort;
921 }
922 break;
923
924 case DISKOP_HOT_ADD_DISK:
925
/* find the first unused slot in the 'high' (spare) area */
926 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
927 tmp = conf->mirrors + i;
928 if (!tmp->used_slot) {
929 added_disk = i;
930 break;
931 }
932 }
933 if (added_disk == -1) {
934 MD_BUG();
935 err = 1;
936 goto abort;
937 }
938 break;
939 }
940
/* second pass: apply the operation to the slot(s) located above */
941 switch (state) {
942 /*
943 * Switch the spare disk to write-only mode:
944 */
945 case DISKOP_SPARE_WRITE:
946 sdisk = conf->mirrors + spare_disk;
947 sdisk->operational = 1;
948 sdisk->write_only = 1;
949 break;
950 /*
951 * Deactivate a spare disk:
952 */
953 case DISKOP_SPARE_INACTIVE:
954 close_sync(conf);
955 sdisk = conf->mirrors + spare_disk;
956 sdisk->operational = 0;
957 sdisk->write_only = 0;
958 break;
959 /*
960 * Activate (mark read-write) the (now sync) spare disk,
961 * which means we switch it's 'raid position' (->raid_disk)
962 * with the failed disk. (only the first 'conf->nr_disks'
963 * slots are used for 'real' disks and we must preserve this
964 * property)
965 */
966 case DISKOP_SPARE_ACTIVE:
967 close_sync(conf);
968 sdisk = conf->mirrors + spare_disk;
969 fdisk = conf->mirrors + failed_disk;
970
971 spare_desc = &sb->disks[sdisk->number];
972 failed_desc = &sb->disks[fdisk->number];
973
/* sanity checks: descriptors and mirror slots must agree with each other */
974 if (spare_desc != *d) {
975 MD_BUG();
976 err = 1;
977 goto abort;
978 }
979
980 if (spare_desc->raid_disk != sdisk->raid_disk) {
981 MD_BUG();
982 err = 1;
983 goto abort;
984 }
985
986 if (sdisk->raid_disk != spare_disk) {
987 MD_BUG();
988 err = 1;
989 goto abort;
990 }
991
992 if (failed_desc->raid_disk != fdisk->raid_disk) {
993 MD_BUG();
994 err = 1;
995 goto abort;
996 }
997
998 if (fdisk->raid_disk != failed_disk) {
999 MD_BUG();
1000 err = 1;
1001 goto abort;
1002 }
1003
1004 /*
1005 * do the switch finally
1006 */
1007 spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1008 failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1009
1010 /* There must be a spare_rdev, but there may not be a
1011 * failed_rdev. That slot might be empty...
1012 */
/* NOTE(review): spare_rdev is dereferenced without a NULL check */
1013 spare_rdev->desc_nr = failed_desc->number;
1014 if (failed_rdev)
1015 failed_rdev->desc_nr = spare_desc->number;
1016
1017 xchg_values(*spare_desc, *failed_desc);
1018 xchg_values(*fdisk, *sdisk);
1019
1020 /*
1021 * (careful, 'failed' and 'spare' are switched from now on)
1022 *
1023 * we want to preserve linear numbering and we want to
1024 * give the proper raid_disk number to the now activated
1025 * disk. (this means we switch back these values)
1026 */
1027
1028 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1029 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1030 xchg_values(spare_desc->number, failed_desc->number);
1031 xchg_values(sdisk->number, fdisk->number);
1032
1033 *d = failed_desc;
1034
/* the slot now holding the old (failed) entry is freed if it has no device */
1035 if (sdisk->dev == MKDEV(0,0))
1036 sdisk->used_slot = 0;
1037 /*
1038 * this really activates the spare.
1039 */
1040 fdisk->spare = 0;
1041 fdisk->write_only = 0;
1042
1043 /*
1044 * if we activate a spare, we definitely replace a
1045 * non-operational disk slot in the 'low' area of
1046 * the disk array.
1047 */
1048
1049 conf->working_disks++;
1050
1051 break;
1052
1053 case DISKOP_HOT_REMOVE_DISK:
1054 rdisk = conf->mirrors + removed_disk;
1055
/* a slot in the 'low' area must never be flagged as a spare */
1056 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1057 MD_BUG();
1058 err = 1;
1059 goto abort;
1060 }
1061 rdisk->dev = MKDEV(0,0);
1062 rdisk->used_slot = 0;
1063 conf->nr_disks--;
1064 break;
1065
1066 case DISKOP_HOT_ADD_DISK:
1067 adisk = conf->mirrors + added_disk;
1068 added_desc = *d;
1069
1070 if (added_disk != added_desc->number) {
1071 MD_BUG();
1072 err = 1;
1073 goto abort;
1074 }
1075
/* the new disk starts out as an idle spare */
1076 adisk->number = added_desc->number;
1077 adisk->raid_disk = added_desc->raid_disk;
1078 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1079
1080 adisk->operational = 0;
1081 adisk->write_only = 0;
1082 adisk->spare = 1;
1083 adisk->used_slot = 1;
1084 adisk->head_position = 0;
1085 conf->nr_disks++;
1086
1087 break;
1088
1089 default:
1090 MD_BUG();
1091 err = 1;
1092 goto abort;
1093 }
/* common exit: reached on success as well as on error */
1094 abort:
1095 md_spin_unlock_irq(&conf->device_lock);
1096 if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1097 /* should move to "END_REBUILD" when such exists */
1098 raid1_shrink_buffers(conf);
1099
1100 print_raid1_conf(conf);
1101 return err;
1102 }
1103
1104
1105 #define IO_ERROR KERN_ALERT \
1106 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1107
1108 #define REDIRECT_SECTOR KERN_ERR \
1109 "raid1: %s: redirecting sector %lu to another mirror\n"
1110
1111 /*
1112 * This is a kernel thread which:
1113 *
1114 * 1. Retries failed read operations on working mirrors.
1115 * 2. Updates the raid superblock when problems are encountered.
1116 * 3. Performs writes following reads for array synchronising.
1117 */
1118 static void end_sync_write(struct buffer_head *bh, int uptodate);
1119 static void end_sync_read(struct buffer_head *bh, int uptodate);
1120
/*
 * raid1d - body of the per-array helper thread.
 *
 * Drains the global retry queue.  For every queued raid1_bh it first
 * writes out the superblock if mddev->sb_dirty is set, then acts on
 * r1_bh->cmd:
 *   SPECIAL     - a resync read has finished; if it succeeded, write the
 *                 data to the other mirrors, otherwise log the error and
 *                 report the block as done to the md layer.
 *   READ/READA  - a read failed; remap it with raid1_map() and resubmit,
 *                 or fail the master request if no other device is found.
 *
 * The "data" argument is not used.
 */
1121 static void raid1d (void *data)
1122 {
1123 struct raid1_bh *r1_bh;
1124 struct buffer_head *bh;
1125 unsigned long flags;
1126 mddev_t *mddev;
1127 kdev_t dev;
1128
1129
1130 for (;;) {
/* pop the head of the queue; an empty queue exits with the lock still held */
1131 md_spin_lock_irqsave(&retry_list_lock, flags);
1132 r1_bh = raid1_retry_list;
1133 if (!r1_bh)
1134 break;
1135 raid1_retry_list = r1_bh->next_r1;
1136 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1137
1138 mddev = r1_bh->mddev;
1139 if (mddev->sb_dirty) {
1140 printk(KERN_INFO "raid1: dirty sb detected, updating.\n");
1141 mddev->sb_dirty = 0;
1142 md_update_sb(mddev);
1143 }
1144 bh = &r1_bh->bh_req;
1145 switch(r1_bh->cmd) {
1146 case SPECIAL:
1147 /* have to allocate lots of bh structures and
1148 * schedule writes
1149 */
1150 if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1151 int i, sum_bhs = 0;
1152 int disks = MD_SB_DISKS;
1153 struct buffer_head *bhl, *mbh;
1154 raid1_conf_t *conf;
1155
1156 conf = mddev_to_conf(mddev);
1157 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1158 for (i = 0; i < disks ; i++) {
1159 if (!conf->mirrors[i].operational)
1160 continue;
1161 if (i==conf->last_used)
1162 /* we read from here, no need to write */
1163 continue;
1164 if (i < conf->raid_disks
1165 && !conf->resync_mirrors)
1166 /* don't need to write this,
1167 * we are just rebuilding */
1168 continue;
1169 mbh = bhl;
1170 if (!mbh) {
1171 MD_BUG();
1172 break;
1173 }
1174 bhl = mbh->b_next;
1175 mbh->b_this_page = (struct buffer_head *)1;
1176
1177
1178 /*
1179 * prepare mirrored bh (fields ordered for max mem throughput):
1180 */
1181 mbh->b_blocknr = bh->b_blocknr;
1182 mbh->b_dev = conf->mirrors[i].dev;
1183 mbh->b_rdev = conf->mirrors[i].dev;
1184 mbh->b_rsector = bh->b_blocknr;
1185 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
1186 (1<<BH_Mapped) | (1<<BH_Lock);
1187 atomic_set(&mbh->b_count, 1);
1188 mbh->b_size = bh->b_size;
1189 mbh->b_page = bh->b_page;
1190 mbh->b_data = bh->b_data;
1191 mbh->b_list = BUF_LOCKED;
1192 mbh->b_end_io = end_sync_write;
1193 mbh->b_private = r1_bh;
1194
1195 mbh->b_next = r1_bh->mirror_bh_list;
1196 r1_bh->mirror_bh_list = mbh;
1197
1198 sum_bhs++;
1199 }
/* the count must be in place before the first write is submitted */
1200 md_atomic_set(&r1_bh->remaining, sum_bhs);
1201 if (bhl) raid1_free_bh(conf, bhl);
1202 mbh = r1_bh->mirror_bh_list;
1203
1204 if (!sum_bhs) {
1205 /* nowhere to write this too... I guess we
1206 * must be done
1207 */
1208 sync_request_done(bh->b_blocknr, conf);
1209 md_done_sync(mddev, bh->b_size>>9, 0);
1210 raid1_free_buf(r1_bh);
1211 } else
1212 while (mbh) {
1213 struct buffer_head *bh1 = mbh;
1214 mbh = mbh->b_next;
1215 generic_make_request(WRITE, bh1);
/* NOTE(review): bh1 is read after submission - confirm it cannot already have completed and been freed */
1216 md_sync_acct(bh1->b_dev, bh1->b_size/512);
1217 }
1218 } else {
1219 /* There is no point trying a read-for-reconstruct
1220 * as reconstruct is about to be aborted
1221 */
1222
/* NOTE(review): unlike the !sum_bhs path above, this branch neither calls sync_request_done() nor raid1_free_buf() - confirm this is intended */
1223 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1224 md_done_sync(mddev, bh->b_size>>9, 0);
1225 }
1226
1227 break;
1228 case READ:
1229 case READA:
/* remember the failing device; raid1_map() overwrites bh->b_dev on success */
1230 dev = bh->b_dev;
1231 raid1_map (mddev, &bh->b_dev);
1232 if (bh->b_dev == dev) {
1233 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1234 raid1_end_bh_io(r1_bh, 0);
1235 } else {
1236 printk (REDIRECT_SECTOR,
1237 partition_name(bh->b_dev), bh->b_blocknr);
1238 bh->b_rdev = bh->b_dev;
1239 bh->b_rsector = bh->b_blocknr;
1240 generic_make_request (r1_bh->cmd, bh);
1241 }
1242 break;
1243 }
1244 }
/* pairs with the lock taken in the final (empty-queue) loop iteration */
1245 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1246 }
1247 #undef IO_ERROR
1248 #undef REDIRECT_SECTOR
1249
1250 /*
1251 * Private kernel thread to reconstruct mirrors after an unclean
1252 * shutdown.
1253 */
1254 static void raid1syncd (void *data)
1255 {
1256 raid1_conf_t *conf = data;
1257 mddev_t *mddev = conf->mddev;
1258
1259 if (!conf->resync_mirrors)
1260 return;
1261 if (conf->resync_mirrors == 2)
1262 return;
1263 down(&mddev->recovery_sem);
1264 if (!md_do_sync(mddev, NULL)) {
1265 /*
1266 * Only if everything went Ok.
1267 */
1268 conf->resync_mirrors = 0;
1269 }
1270
1271 close_sync(conf);
1272
1273 up(&mddev->recovery_sem);
1274 raid1_shrink_buffers(conf);
1275 }
1276
1277 /*
1278 * perform a "sync" on one "block"
1279 *
1280 * We need to make sure that no normal I/O request - particularly write
1281 * requests - conflict with active sync requests.
1282 * This is achieved by conceptually dividing the device space into a
1283 * number of sections:
1284 * DONE: 0 .. a-1 These blocks are in-sync
1285 * ACTIVE: a.. b-1 These blocks may have active sync requests, but
1286 * no normal IO requests
1287 * READY: b .. c-1 These blocks have no normal IO requests - sync
1288 * request may be happening
1289 * PENDING: c .. d-1 These blocks may have IO requests, but no new
1290 * ones will be added
1291 * FUTURE: d .. end These blocks are not to be considered yet. IO may
1292 * be happening, but not sync
1293 *
1294 * We keep a
1295 * phase which flips (0 or 1) each time d moves and
1296 * a count of:
1297 * z = active io requests in FUTURE since d moved - marked with
1298 * current phase
1299 * y = active io requests in FUTURE before d moved, or PENDING -
1300 * marked with previous phase
1301 * x = active sync requests in READY
1302 * w = active sync requests in ACTIVE
1303 * v = active io requests in DONE
1304 *
1305 * Normally, a=b=c=d=0 and z= active io requests
1306 * or a=b=c=d=END and v= active io requests
1307 * Allowed changes to a,b,c,d:
1308 * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
1309 * B: y==0 -> c=d
1310 * C: b=c, w+=x, x=0
1311 * D: w==0 -> a=b
1312 * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1313 *
1314 * At start of sync we apply A.
1315 * When y reaches 0, we apply B then A then begin sync requests
1316 * When sync point reaches c-1, we wait for y==0, and w==0, and
1317 * then apply B then A then D then C.
1318 * Finally, we apply E
1319 *
1320 * The sync request simply issues a "read" against a working drive
1321 * This is marked so that on completion the raid1d thread is woken to
1322 * issue suitable write requests
1323 */
1324
1325 static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1326 {
1327 raid1_conf_t *conf = mddev_to_conf(mddev);
1328 struct mirror_info *mirror;
1329 struct raid1_bh *r1_bh;
1330 struct buffer_head *bh;
1331 int bsize;
1332 int disk;
1333 int block_nr;
1334
1335 spin_lock_irq(&conf->segment_lock);
1336 if (!sector_nr) {
1337 /* initialize ...*/
1338 int buffs;
1339 conf->start_active = 0;
1340 conf->start_ready = 0;
1341 conf->start_pending = 0;
1342 conf->start_future = 0;
1343 conf->phase = 0;
1344 /* we want enough buffers to hold twice the window of 128*/
1345 buffs = 128 *2 / (PAGE_SIZE>>9);
1346 buffs = raid1_grow_buffers(conf, buffs);
1347 if (buffs < 2)
1348 goto nomem;
1349
1350 conf->window = buffs*(PAGE_SIZE>>9)/2;
1351 conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1352 conf->cnt_done = conf->cnt_pending = 0;
1353 if (conf->cnt_ready || conf->cnt_active)
1354 MD_BUG();
1355 }
1356 while (sector_nr >= conf->start_pending) {
1357 PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1358 sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1359 conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1360 wait_event_lock_irq(conf->wait_done,
1361 !conf->cnt_active,
1362 conf->segment_lock);
1363 wait_event_lock_irq(conf->wait_ready,
1364 !conf->cnt_pending,
1365 conf->segment_lock);
1366 conf->start_active = conf->start_ready;
1367 conf->start_ready = conf->start_pending;
1368 conf->start_pending = conf->start_future;
1369 conf->start_future = conf->start_future+conf->window;
1370 // Note: falling off the end is not a problem
1371 conf->phase = conf->phase ^1;
1372 conf->cnt_active = conf->cnt_ready;
1373 conf->cnt_ready = 0;
1374 conf->cnt_pending = conf->cnt_future;
1375 conf->cnt_future = 0;
1376 wake_up(&conf->wait_done);
1377 }
1378 conf->cnt_ready++;
1379 spin_unlock_irq(&conf->segment_lock);
1380
1381
1382 /* If reconstructing, and >1 working disc,
1383 * could dedicate one to rebuild and others to
1384 * service read requests ..
1385 */
1386 disk = conf->last_used;
1387 /* make sure disk is operational */
1388 while (!conf->mirrors[disk].operational) {
1389 if (disk <= 0) disk = conf->raid_disks;
1390 disk--;
1391 if (disk == conf->last_used)
1392 break;
1393 }
1394 conf->last_used = disk;
1395
1396 mirror = conf->mirrors+conf->last_used;
1397
1398 r1_bh = raid1_alloc_buf (conf);
1399 r1_bh->master_bh = NULL;
1400 r1_bh->mddev = mddev;
1401 r1_bh->cmd = SPECIAL;
1402 bh = &r1_bh->bh_req;
1403
1404 block_nr = sector_nr;
1405 bsize = 512;
1406 while (!(block_nr & 1) && bsize < PAGE_SIZE
1407 && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
1408 block_nr >>= 1;
1409 bsize <<= 1;
1410 }
1411 bh->b_size = bsize;
1412 bh->b_list = BUF_LOCKED;
1413 bh->b_dev = mirror->dev;
1414 bh->b_rdev = mirror->dev;
1415 bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1416 if (!bh->b_page)
1417 BUG();
1418 if (!bh->b_data)
1419 BUG();
1420 if (bh->b_data != page_address(bh->b_page))
1421 BUG();
1422 bh->b_end_io = end_sync_read;
1423 bh->b_private = r1_bh;
1424 bh->b_blocknr = sector_nr;
1425 bh->b_rsector = sector_nr;
1426 init_waitqueue_head(&bh->b_wait);
1427
1428 generic_make_request(READ, bh);
1429 md_sync_acct(bh->b_dev, bh->b_size/512);
1430
1431 return (bsize >> 9);
1432
1433 nomem:
1434 raid1_shrink_buffers(conf);
1435 spin_unlock_irq(&conf->segment_lock);
1436 return -ENOMEM;
1437 }
1438
1439 static void end_sync_read(struct buffer_head *bh, int uptodate)
1440 {
1441 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1442
1443 /* we have read a block, now it needs to be re-written,
1444 * or re-read if the read failed.
1445 * We don't do much here, just schedule handling by raid1d
1446 */
1447 if (!uptodate)
1448 md_error (r1_bh->mddev, bh->b_dev);
1449 else
1450 set_bit(R1BH_Uptodate, &r1_bh->state);
1451 raid1_reschedule_retry(r1_bh);
1452 }
1453
1454 static void end_sync_write(struct buffer_head *bh, int uptodate)
1455 {
1456 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1457
1458 if (!uptodate)
1459 md_error (r1_bh->mddev, bh->b_dev);
1460 if (atomic_dec_and_test(&r1_bh->remaining)) {
1461 mddev_t *mddev = r1_bh->mddev;
1462 unsigned long sect = bh->b_blocknr;
1463 int size = bh->b_size;
1464 raid1_free_buf(r1_bh);
1465 sync_request_done(sect, mddev_to_conf(mddev));
1466 md_done_sync(mddev,size>>9, uptodate);
1467 }
1468 }
1469
/*
 * printk() format strings for array start-up messages.  Each expands
 * to a log-level prefix followed by the message text.
 */
#define INVALID_LEVEL \
	KERN_WARNING "raid1: md%d: raid level not set to mirroring (%d)\n"

#define NO_SB \
	KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n"

#define ERRORS \
	KERN_ERR "raid1: disabled mirror %s (errors detected)\n"

#define NOT_IN_SYNC \
	KERN_ERR "raid1: disabled mirror %s (not in sync)\n"

#define INCONSISTENT \
	KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n"

#define ALREADY_RUNNING \
	KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n"

#define OPERATIONAL \
	KERN_INFO "raid1: device %s operational as mirror %d\n"

#define MEM_ERROR \
	KERN_ERR "raid1: couldn't allocate memory for md%d\n"

#define SPARE \
	KERN_INFO "raid1: spare disk %s\n"

#define NONE_OPERATIONAL \
	KERN_ERR "raid1: no operational mirrors for md%d\n"

#define ARRAY_IS_ACTIVE \
	KERN_INFO "raid1: raid set md%d active with %d out of %d mirrors\n"

#define THREAD_ERROR \
	KERN_ERR "raid1: couldn't allocate thread for md%d\n"

#define START_RESYNC \
	KERN_WARNING "raid1: raid set md%d not clean; reconstructing mirrors\n"
1508
1509 static int raid1_run (mddev_t *mddev)
1510 {
1511 raid1_conf_t *conf;
1512 int i, j, disk_idx;
1513 struct mirror_info *disk;
1514 mdp_super_t *sb = mddev->sb;
1515 mdp_disk_t *descriptor;
1516 mdk_rdev_t *rdev;
1517 struct md_list_head *tmp;
1518 int start_recovery = 0;
1519
1520 MOD_INC_USE_COUNT;
1521
1522 if (sb->level != 1) {
1523 printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1524 goto out;
1525 }
1526 /*
1527 * copy the already verified devices into our private RAID1
1528 * bookkeeping area. [whatever we allocate in raid1_run(),
1529 * should be freed in raid1_stop()]
1530 */
1531
1532 conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1533 mddev->private = conf;
1534 if (!conf) {
1535 printk(MEM_ERROR, mdidx(mddev));
1536 goto out;
1537 }
1538 memset(conf, 0, sizeof(*conf));
1539
1540 ITERATE_RDEV(mddev,rdev,tmp) {
1541 if (rdev->faulty) {
1542 printk(ERRORS, partition_name(rdev->dev));
1543 } else {
1544 if (!rdev->sb) {
1545 MD_BUG();
1546 continue;
1547 }
1548 }
1549 if (rdev->desc_nr == -1) {
1550 MD_BUG();
1551 continue;
1552 }
1553 descriptor = &sb->disks[rdev->desc_nr];
1554 disk_idx = descriptor->raid_disk;
1555 disk = conf->mirrors + disk_idx;
1556
1557 if (disk_faulty(descriptor)) {
1558 disk->number = descriptor->number;
1559 disk->raid_disk = disk_idx;
1560 disk->dev = rdev->dev;
1561 disk->sect_limit = MAX_WORK_PER_DISK;
1562 disk->operational = 0;
1563 disk->write_only = 0;
1564 disk->spare = 0;
1565 disk->used_slot = 1;
1566 disk->head_position = 0;
1567 continue;
1568 }
1569 if (disk_active(descriptor)) {
1570 if (!disk_sync(descriptor)) {
1571 printk(NOT_IN_SYNC,
1572 partition_name(rdev->dev));
1573 continue;
1574 }
1575 if ((descriptor->number > MD_SB_DISKS) ||
1576 (disk_idx > sb->raid_disks)) {
1577
1578 printk(INCONSISTENT,
1579 partition_name(rdev->dev));
1580 continue;
1581 }
1582 if (disk->operational) {
1583 printk(ALREADY_RUNNING,
1584 partition_name(rdev->dev),
1585 disk_idx);
1586 continue;
1587 }
1588 printk(OPERATIONAL, partition_name(rdev->dev),
1589 disk_idx);
1590 disk->number = descriptor->number;
1591 disk->raid_disk = disk_idx;
1592 disk->dev = rdev->dev;
1593 disk->sect_limit = MAX_WORK_PER_DISK;
1594 disk->operational = 1;
1595 disk->write_only = 0;
1596 disk->spare = 0;
1597 disk->used_slot = 1;
1598 disk->head_position = 0;
1599 conf->working_disks++;
1600 } else {
1601 /*
1602 * Must be a spare disk ..
1603 */
1604 printk(SPARE, partition_name(rdev->dev));
1605 disk->number = descriptor->number;
1606 disk->raid_disk = disk_idx;
1607 disk->dev = rdev->dev;
1608 disk->sect_limit = MAX_WORK_PER_DISK;
1609 disk->operational = 0;
1610 disk->write_only = 0;
1611 disk->spare = 1;
1612 disk->used_slot = 1;
1613 disk->head_position = 0;
1614 }
1615 }
1616 conf->raid_disks = sb->raid_disks;
1617 conf->nr_disks = sb->nr_disks;
1618 conf->mddev = mddev;
1619 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1620
1621 conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1622 init_waitqueue_head(&conf->wait_buffer);
1623 init_waitqueue_head(&conf->wait_done);
1624 init_waitqueue_head(&conf->wait_ready);
1625
1626 if (!conf->working_disks) {
1627 printk(NONE_OPERATIONAL, mdidx(mddev));
1628 goto out_free_conf;
1629 }
1630
1631
1632 /* pre-allocate some buffer_head structures.
1633 * As a minimum, 1 r1bh and raid_disks buffer_heads
1634 * would probably get us by in tight memory situations,
1635 * but a few more is probably a good idea.
1636 * For now, try NR_RESERVED_BUFS r1bh and
1637 * NR_RESERVED_BUFS*raid_disks bufferheads
1638 * This will allow at least NR_RESERVED_BUFS concurrent
1639 * reads or writes even if kmalloc starts failing
1640 */
1641 if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1642 raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1643 < NR_RESERVED_BUFS*conf->raid_disks) {
1644 printk(MEM_ERROR, mdidx(mddev));
1645 goto out_free_conf;
1646 }
1647
1648 for (i = 0; i < MD_SB_DISKS; i++) {
1649
1650 descriptor = sb->disks+i;
1651 disk_idx = descriptor->raid_disk;
1652 disk = conf->mirrors + disk_idx;
1653
1654 if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1655 !disk->used_slot) {
1656
1657 disk->number = descriptor->number;
1658 disk->raid_disk = disk_idx;
1659 disk->dev = MKDEV(0,0);
1660
1661 disk->operational = 0;
1662 disk->write_only = 0;
1663 disk->spare = 0;
1664 disk->used_slot = 1;
1665 disk->head_position = 0;
1666 }
1667 }
1668
1669 /*
1670 * find the first working one and use it as a starting point
1671 * to read balancing.
1672 */
1673 for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
1674 /* nothing */;
1675 conf->last_used = j;
1676
1677
1678 if (conf->working_disks != sb->raid_disks) {
1679 printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1680 start_recovery = 1;
1681 }
1682
1683 {
1684 const char * name = "raid1d";
1685
1686 conf->thread = md_register_thread(raid1d, conf, name);
1687 if (!conf->thread) {
1688 printk(THREAD_ERROR, mdidx(mddev));
1689 goto out_free_conf;
1690 }
1691 }
1692
1693 if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1694 const char * name = "raid1syncd";
1695
1696 conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1697 if (!conf->resync_thread) {
1698 printk(THREAD_ERROR, mdidx(mddev));
1699 goto out_free_conf;
1700 }
1701
1702 printk(START_RESYNC, mdidx(mddev));
1703 conf->resync_mirrors = 1;
1704 md_wakeup_thread(conf->resync_thread);
1705 }
1706
1707 /*
1708 * Regenerate the "device is in sync with the raid set" bit for
1709 * each device.
1710 */
1711 for (i = 0; i < MD_SB_DISKS; i++) {
1712 mark_disk_nonsync(sb->disks+i);
1713 for (j = 0; j < sb->raid_disks; j++) {
1714 if (!conf->mirrors[j].operational)
1715 continue;
1716 if (sb->disks[i].number == conf->mirrors[j].number)
1717 mark_disk_sync(sb->disks+i);
1718 }
1719 }
1720 sb->active_disks = conf->working_disks;
1721
1722 if (start_recovery)
1723 md_recover_arrays();
1724
1725
1726 printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1727 /*
1728 * Ok, everything is just fine now
1729 */
1730 return 0;
1731
1732 out_free_conf:
1733 raid1_shrink_r1bh(conf);
1734 raid1_shrink_bh(conf);
1735 raid1_shrink_buffers(conf);
1736 kfree(conf);
1737 mddev->private = NULL;
1738 out:
1739 MOD_DEC_USE_COUNT;
1740 return -EIO;
1741 }
1742
1743 #undef INVALID_LEVEL
1744 #undef NO_SB
1745 #undef ERRORS
1746 #undef NOT_IN_SYNC
1747 #undef INCONSISTENT
1748 #undef ALREADY_RUNNING
1749 #undef OPERATIONAL
1750 #undef SPARE
1751 #undef NONE_OPERATIONAL
1752 #undef ARRAY_IS_ACTIVE
1753
1754 static int raid1_stop_resync (mddev_t *mddev)
1755 {
1756 raid1_conf_t *conf = mddev_to_conf(mddev);
1757
1758 if (conf->resync_thread) {
1759 if (conf->resync_mirrors) {
1760 conf->resync_mirrors = 2;
1761 md_interrupt_thread(conf->resync_thread);
1762
1763 printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1764 return 1;
1765 }
1766 return 0;
1767 }
1768 return 0;
1769 }
1770
1771 static int raid1_restart_resync (mddev_t *mddev)
1772 {
1773 raid1_conf_t *conf = mddev_to_conf(mddev);
1774
1775 if (conf->resync_mirrors) {
1776 if (!conf->resync_thread) {
1777 MD_BUG();
1778 return 0;
1779 }
1780 conf->resync_mirrors = 1;
1781 md_wakeup_thread(conf->resync_thread);
1782 return 1;
1783 }
1784 return 0;
1785 }
1786
1787 static int raid1_stop (mddev_t *mddev)
1788 {
1789 raid1_conf_t *conf = mddev_to_conf(mddev);
1790
1791 md_unregister_thread(conf->thread);
1792 if (conf->resync_thread)
1793 md_unregister_thread(conf->resync_thread);
1794 raid1_shrink_r1bh(conf);
1795 raid1_shrink_bh(conf);
1796 raid1_shrink_buffers(conf);
1797 kfree(conf);
1798 mddev->private = NULL;
1799 MOD_DEC_USE_COUNT;
1800 return 0;
1801 }
1802
1803 static mdk_personality_t raid1_personality=
1804 {
1805 name: "raid1",
1806 make_request: raid1_make_request,
1807 run: raid1_run,
1808 stop: raid1_stop,
1809 status: raid1_status,
1810 error_handler: raid1_error,
1811 diskop: raid1_diskop,
1812 stop_resync: raid1_stop_resync,
1813 restart_resync: raid1_restart_resync,
1814 sync_request: raid1_sync_request
1815 };
1816
1817 static int md__init raid1_init (void)
1818 {
1819 return register_md_personality (RAID1, &raid1_personality);
1820 }
1821
1822 static void raid1_exit (void)
1823 {
1824 unregister_md_personality (RAID1);
1825 }
1826
1827 module_init(raid1_init);
1828 module_exit(raid1_exit);
1829
1830