File: /usr/src/linux/drivers/md/raid1.c

1     /*
2      * raid1.c : Multiple Devices driver for Linux
3      *
4      * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
5      *
6      * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7      *
8      * RAID-1 management functions.
9      *
10      * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11      *
12      * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13      * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14      *
15      * This program is free software; you can redistribute it and/or modify
16      * it under the terms of the GNU General Public License as published by
17      * the Free Software Foundation; either version 2, or (at your option)
18      * any later version.
19      *
20      * You should have received a copy of the GNU General Public License
21      * (for example /usr/src/linux/COPYING); if not, write to the Free
22      * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23      */
24     
25     #include <linux/module.h>
26     #include <linux/slab.h>
27     #include <linux/raid/raid1.h>
28     #include <asm/atomic.h>
29     
30     #define MAJOR_NR MD_MAJOR
31     #define MD_DRIVER
32     #define MD_PERSONALITY
33     
34     #define MAX_WORK_PER_DISK 128
35     
       /*
        * Scales the free-pool thresholds that a blocked allocator waits for in
        * raid1_alloc_bh() and raid1_alloc_r1bh().
        */
36     #define	NR_RESERVED_BUFS	32
37     
38     
39     /*
40      * The following can be used to debug the driver.
        * With RAID1_DEBUG non-zero, PRINTK() forwards to printk() and the
        * "inline" / "__inline__" keywords are defined away; otherwise PRINTK()
        * expands to an empty statement.
41      */
42     #define RAID1_DEBUG	0
43     
44     #if RAID1_DEBUG
45     #define PRINTK(x...)   printk(x)
46     #define inline
47     #define __inline__
48     #else
49     #define PRINTK(x...)  do { } while (0)
50     #endif
51     
52     
53     static mdk_personality_t raid1_personality;
       /*
        * retry_list_lock guards the retry list below.  raid1_retry_list is the
        * head; raid1_retry_tail points at the link field where the next entry
        * gets appended (see raid1_reschedule_retry()).
        */
54     static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
55     struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
56     
57     static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
58     {
59     	/* return a linked list (chained through b_next) of "cnt" struct
60     	 * buffer_heads.  don't take any off the free list unless we know we
61     	 * can get all we need, otherwise we could deadlock.
       	 * The outer loop only exits once cnt has reached 0, so a short list
       	 * is never returned; the function sleeps on conf->wait_buffer instead.
62     	 */
63     	struct buffer_head *bh=NULL;
64     
65     	while(cnt) {
66     		struct buffer_head *t;
67     		md_spin_lock_irq(&conf->device_lock);
       		/* all-or-nothing: the pool must cover everything still missing */
68     		if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
69     			while (cnt) {
70     				t = conf->freebh;
71     				conf->freebh = t->b_next;
72     				t->b_next = bh;
73     				bh = t;
74     				t->b_state = 0;
75     				conf->freebh_cnt--;
76     				cnt--;
77     			}
78     		md_spin_unlock_irq(&conf->device_lock);
79     		if (cnt == 0)
80     			break;
       		/* pool not usable: take a single bh from the slab, then go round again */
81     		t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
82     		if (t) {
83     			t->b_next = bh;
84     			bh = t;
85     			cnt--;
86     		} else {
87     			PRINTK("raid1: waiting for %d bh\n", cnt);
       			/* while the flag is set the pool branch above is refused;
       			 * note it is written here without holding device_lock
       			 */
88     			conf->freebh_blocked = 1;
89     			wait_disk_event(conf->wait_buffer,
90     					!conf->freebh_blocked ||
91     					conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
92     			conf->freebh_blocked = 0;
93     		}
94     	}
95     	return bh;
96     }
97     
98     static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
99     {
       	/* release a b_next-chained list of buffer_heads, then wake whoever
       	 * sleeps on conf->wait_buffer.  a NULL list just does the wakeup.
       	 */
100     	unsigned long flags;
101     	spin_lock_irqsave(&conf->device_lock, flags);
102     	while (bh) {
103     		struct buffer_head *t = bh;
104     		bh=bh->b_next;
        		/* raid1_grow_bh() sets b_pprev on pool members, so a NULL
        		 * b_pprev is treated as "came from the slab" and is freed.
        		 * NOTE(review): raid1_alloc_bh() never clears b_pprev on its
        		 * slab allocations; presumably fresh objects have it NULL -
        		 * confirm.
        		 */
105     		if (t->b_pprev == NULL)
106     			kmem_cache_free(bh_cachep, t);
107     		else {
108     			t->b_next= conf->freebh;
109     			conf->freebh = t;
110     			conf->freebh_cnt++;
111     		}
112     	}
113     	spin_unlock_irqrestore(&conf->device_lock, flags);
114     	wake_up(&conf->wait_buffer);
115     }
116     
117     static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
118     {
119     	/*
120     	 * Add up to "cnt" buffer_heads to the reserved pool; the return
121     	 * value is how many were added (short if the slab runs dry).
122     	 */
123     	int added;
124     
125     	for (added = 0; added < cnt; added++) {
126     		struct buffer_head *fresh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
127     		if (fresh == NULL)
128     			break;
129     		md_spin_lock_irq(&conf->device_lock);
130     		fresh->b_pprev = &conf->freebh;	/* marks it as pool-owned */
131     		fresh->b_next = conf->freebh;
132     		conf->freebh = fresh;
133     		conf->freebh_cnt++;
134     		md_spin_unlock_irq(&conf->device_lock);
135     	}
136     	return added;
137     }
138     
139     static void raid1_shrink_bh(raid1_conf_t *conf)
140     {
141     	/* hand every pooled buffer_head back to the slab cache */
142     	struct buffer_head *victim;
143     
144     	md_spin_lock_irq(&conf->device_lock);
145     	while ((victim = conf->freebh) != NULL) {
146     		conf->freebh_cnt--;
147     		conf->freebh = victim->b_next;
148     		kmem_cache_free(bh_cachep, victim);
149     	}
150     	md_spin_unlock_irq(&conf->device_lock);
151     }
152     		
153     
154     static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
155     {
        	/* get a raid1_bh: from the conf->freer1 pool if possible, else
        	 * from kmalloc.  never returns NULL - when both fail it sleeps
        	 * on conf->wait_buffer and tries again.
        	 */
156     	struct raid1_bh *r1_bh = NULL;
157     
158     	do {
159     		md_spin_lock_irq(&conf->device_lock);
160     		if (!conf->freer1_blocked && conf->freer1) {
161     			r1_bh = conf->freer1;
162     			conf->freer1 = r1_bh->next_r1;
163     			conf->freer1_cnt--;
164     			r1_bh->next_r1 = NULL;
        			/* PreAlloc makes raid1_free_r1bh() put it back on the
        			 * pool.  only these fields are reset for a pool entry;
        			 * the kmalloc path below zeroes the whole struct.
        			 */
165     			r1_bh->state = (1 << R1BH_PreAlloc);
166     			r1_bh->bh_req.b_state = 0;
167     		}
168     		md_spin_unlock_irq(&conf->device_lock);
169     		if (r1_bh)
170     			return r1_bh;
171     		r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
172     		if (r1_bh) {
173     			memset(r1_bh, 0, sizeof(*r1_bh));
174     			return r1_bh;
175     		}
        		/* while set, the pool branch above is refused; the flag is
        		 * written here without holding device_lock
        		 */
176     		conf->freer1_blocked = 1;
177     		wait_disk_event(conf->wait_buffer,
178     				!conf->freer1_blocked ||
179     				conf->freer1_cnt > NR_RESERVED_BUFS/2
180     			);
181     		conf->freer1_blocked = 0;
182     	} while (1);
183     }
184     
185     static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
186     {
        	/* release r1_bh together with the buffer_heads on its
        	 * mirror_bh_list.  r1_bh->mddev must be set: it is used to
        	 * locate the conf.
        	 */
187     	struct buffer_head *bh = r1_bh->mirror_bh_list;
188     	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
189     
190     	r1_bh->mirror_bh_list = NULL;
191     
        	/* pool members (PreAlloc bit set) go back on conf->freer1,
        	 * everything else was kmalloc'ed and is kfree'd
        	 */
192     	if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
193     		unsigned long flags;
194     		spin_lock_irqsave(&conf->device_lock, flags);
195     		r1_bh->next_r1 = conf->freer1;
196     		conf->freer1 = r1_bh;
197     		conf->freer1_cnt++;
198     		spin_unlock_irqrestore(&conf->device_lock, flags);
199     		/* don't need to wakeup wait_buffer because
200     		 *  raid1_free_bh below will do that
201     		 */
202     	} else {
203     		kfree(r1_bh);
204     	}
        	/* bh and conf were read before r1_bh was released, so this is
        	 * safe after the kfree above
        	 */
205     	raid1_free_bh(conf, bh);
206     }
207     
208     static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
209     {
210     	/* pre-allocate up to "cnt" raid1_bh structs; returns how many we got */
211     	int made;
212     
213     	for (made = 0; made < cnt; made++) {
214     		struct raid1_bh *fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
215     
216     		if (fresh == NULL)
217     			break;
218     		memset(fresh, 0, sizeof(*fresh));
219     		fresh->mddev = conf->mddev;
220     		set_bit(R1BH_PreAlloc, &fresh->state);
221     		/* the PreAlloc bit makes the free routine queue it on conf->freer1 */
222     		raid1_free_r1bh(fresh);
223     	}
224     	return made;
225     }
226     
227     static void raid1_shrink_r1bh(raid1_conf_t *conf)
228     {
229     	struct raid1_bh *victim;	/* drains the whole conf->freer1 pool */
230     	md_spin_lock_irq(&conf->device_lock);
231     	for (victim = conf->freer1; victim; victim = conf->freer1) {
232     		conf->freer1_cnt--;
233     		conf->freer1 = victim->next_r1;
234     		kfree(victim);
235     	}
236     	md_spin_unlock_irq(&conf->device_lock);
237     }
238     
239     
240     
241     static inline void raid1_free_buf(struct raid1_bh *r1_bh)
242     {
        	/* put r1_bh back on conf->freebuf and release the buffer_heads
        	 * on its mirror_bh_list.  the wake_up() done by raid1_free_bh()
        	 * is also what unblocks a sleeper in raid1_alloc_buf().
        	 */
243     	unsigned long flags;
244     	struct buffer_head *bh = r1_bh->mirror_bh_list;
245     	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
246     	r1_bh->mirror_bh_list = NULL;
247     	
248     	spin_lock_irqsave(&conf->device_lock, flags);
249     	r1_bh->next_r1 = conf->freebuf;
250     	conf->freebuf = r1_bh;
251     	spin_unlock_irqrestore(&conf->device_lock, flags);
252     	raid1_free_bh(conf, bh);
253     }
254     
255     static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
256     {
        	/* take one entry off conf->freebuf, waiting on conf->wait_buffer
        	 * until the list is non-empty.  there is no allocation fallback
        	 * here, so this never returns NULL.
        	 */
257     	struct raid1_bh *r1_bh;
258     
259     	md_spin_lock_irq(&conf->device_lock);
        	/* presumably wait_event_lock_irq() drops device_lock while it
        	 * sleeps and re-takes it before returning - confirm against the
        	 * macro definition
        	 */
260     	wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
261     	r1_bh = conf->freebuf;
262     	conf->freebuf = r1_bh->next_r1;
263     	r1_bh->next_r1= NULL;
264     	md_spin_unlock_irq(&conf->device_lock);
265     
266     	return r1_bh;
267     }
268     
269     static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
270     {
271     	/* Build the new entries on a private chain because GFP_KERNEL may
272     	 * sleep; only the final splice onto conf->freebuf takes device_lock.
273     	 * Returns how many raid1_bh+page pairs were added (may be < cnt). */
274     	struct raid1_bh *head = NULL, **tail = &head, *r1_bh;
275     	int i;
276     	for (i = 0; i < cnt; i++) {
277     		struct page *page = alloc_page(GFP_KERNEL);
278     		if (!page)
279     			break;
280     		r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
281     		if (!r1_bh) {
282     			__free_page(page);	/* don't leak the page we just got */
283     			break;
284     		}
285     		memset(r1_bh, 0, sizeof(*r1_bh));
286     		r1_bh->bh_req.b_page = page;
287     		r1_bh->bh_req.b_data = page_address(page);
288     		*tail = r1_bh;		/* append to the private chain */
289     		tail = &r1_bh->next_r1;
290     	}
291     	md_spin_lock_irq(&conf->device_lock);
292     	*tail = conf->freebuf;		/* old pool follows the new entries */
293     	conf->freebuf = head;
294     	md_spin_unlock_irq(&conf->device_lock);
295     	return i;
296     }
297     
298     static void raid1_shrink_buffers (raid1_conf_t *conf)
299     {
300     	struct raid1_bh *victim;	/* frees each pooled raid1_bh and its page */
301     	md_spin_lock_irq(&conf->device_lock);
302     	while ((victim = conf->freebuf) != NULL) {
303     		conf->freebuf = victim->next_r1;
304     		__free_page(victim->bh_req.b_page);
305     		kfree(victim);
306     	}
307     	md_spin_unlock_irq(&conf->device_lock);
308     }
309     
310     static int raid1_map (mddev_t *mddev, kdev_t *rdev)
311     {
312     	/*
313     	 * Store the device of the first operational mirror in *rdev and
314     	 * return 0; no read balancing is attempted here.  When nothing is
315     	 * operational, log an error and return -1 with *rdev untouched.
316     	 */
317     	raid1_conf_t *conf = mddev_to_conf(mddev);
318     	struct mirror_info *slot = conf->mirrors;
319     	struct mirror_info *end = slot + MD_SB_DISKS;
320     
321     	while (slot < end && !slot->operational)
322     		slot++;
323     	if (slot == end) {
324     		printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
325     		return -1;
326     	}
327     	*rdev = slot->dev;
328     	return 0;
329     }
330     
331     static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
332     {
        	/* append r1_bh to the tail of the global retry list and wake this
        	 * array's thread (conf->thread) to process it.
        	 */
333     	unsigned long flags;
334     	mddev_t *mddev = r1_bh->mddev;
335     	raid1_conf_t *conf = mddev_to_conf(mddev);
336     
337     	md_spin_lock_irqsave(&retry_list_lock, flags);
        	/* an empty list means the tail pointer is stale: aim it at the head */
338     	if (raid1_retry_list == NULL)
339     		raid1_retry_tail = &raid1_retry_list;
340     	*raid1_retry_tail = r1_bh;
341     	raid1_retry_tail = &r1_bh->next_r1;
342     	r1_bh->next_r1 = NULL;
343     	md_spin_unlock_irqrestore(&retry_list_lock, flags);
344     	md_wakeup_thread(conf->thread);
345     }
346     
347     
348     static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
349     {
        	/* undo the per-region accounting for a finished normal request.
        	 * "phase" is the phase the request was tagged with when it was
        	 * counted (raid1_end_bh_io() passes its R1BH_SyncPhase bit).
        	 */
350     	unsigned long flags;
351     	spin_lock_irqsave(&conf->segment_lock, flags);
        	/* below start_active -> "done" region;
        	 * at/after start_future and still in the same phase -> "future";
        	 * anything else is counted as "pending", and the last pending
        	 * request wakes sleepers on wait_ready
        	 */
352     	if (sector < conf->start_active)
353     		conf->cnt_done--;
354     	else if (sector >= conf->start_future && conf->phase == phase)
355     		conf->cnt_future--;
356     	else if (!--conf->cnt_pending)
357     		wake_up(&conf->wait_ready);
358     
359     	spin_unlock_irqrestore(&conf->segment_lock, flags);
360     }
361     
362     static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
363     {
        	/* undo the per-region accounting for a finished sync request.
        	 * sectors below start_active are not counted here.
        	 */
364     	unsigned long flags;
365     	spin_lock_irqsave(&conf->segment_lock, flags);
366     	if (sector >= conf->start_ready)
367     		--conf->cnt_ready;
368     	else if (sector >= conf->start_active) {
        		/* last request of the active window: move start_active
        		 * up to start_ready and wake sleepers on wait_done
        		 */
369     		if (!--conf->cnt_active) {
370     			conf->start_active = conf->start_ready;
371     			wake_up(&conf->wait_done);
372     		}
373     	}
374     	spin_unlock_irqrestore(&conf->segment_lock, flags);
375     }
376     
377     /*
378      * raid1_end_bh_io() is called when we have finished servicing a mirrored
379      * operation and are ready to return a success/failure code to the buffer
380      * cache layer.
381      */
382     static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
383     {
        	/* three steps, in this order: drop the region accounting for the
        	 * master bh's sector, complete the master bh through its own
        	 * b_end_io with "uptodate", then release r1_bh (and its mirror
        	 * buffer_heads) via raid1_free_r1bh().
        	 */
384     	struct buffer_head *bh = r1_bh->master_bh;
385     
386     	io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
387     			test_bit(R1BH_SyncPhase, &r1_bh->state));
388     
389     	bh->b_end_io(bh, uptodate);
390     	raid1_free_r1bh(r1_bh);
391     }
392     void raid1_end_request (struct buffer_head *bh, int uptodate)
393     {
        	/* b_end_io handler installed by raid1_make_request() on every
        	 * per-mirror buffer_head; bh->b_private carries the owning
        	 * raid1_bh.
        	 */
394     	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
395     
396     	/*
397     	 * this branch is our 'one mirror IO has finished' event handler:
398     	 */
399     	if (!uptodate)
400     		md_error (r1_bh->mddev, bh->b_dev);
401     	else
402     		/*
403     		 * Set R1BH_Uptodate in our master buffer_head, so that
404     		 * we will return a good error code to the higher
405     		 * levels even if IO on some other mirrored buffer fails.
406     		 *
407     		 * The 'master' represents the complex operation to 
408     		 * user-side. So if something waits for IO, then it will
409     		 * wait for the 'master' buffer_head.
410     		 */
411     		set_bit (R1BH_Uptodate, &r1_bh->state);
412     
413     	/*
414     	 * We split up the read and write side, imho they are 
415     	 * conceptually different.
416     	 */
417     
418     	if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
419     		/*
420     		 * we have only one buffer_head on the read side
421     		 */
422     		
423     		if (uptodate) {
424     			raid1_end_bh_io(r1_bh, uptodate);
425     			return;
426     		}
427     		/*
428     		 * oops, read error: r1_bh is not released here, it is
        		 * handed to the retry list instead
429     		 */
430     		printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", 
431     			 partition_name(bh->b_dev), bh->b_blocknr);
432     		raid1_reschedule_retry(r1_bh);
433     		return;
434     	}
435     
436     	/*
437     	 * WRITE:
438     	 *
439     	 * Let's see if all mirrored write operations have finished 
440     	 * already.  The last one to finish completes the master bh,
        	 * reporting success if any mirror write set R1BH_Uptodate.
441     	 */
442     
443     	if (atomic_dec_and_test(&r1_bh->remaining))
444     		raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
445     }
446     
447     /*
448      * This routine returns the disk from which the requested read should
449      * be done. It bookkeeps the last read position for every disk
450      * in array and when new read requests come, the disk which last
451      * position is nearest to the request, is chosen.
452      *
453      * TODO: now if there are 2 mirrors in the same 2 devices, performance
454      * degrades dramatically because position is mirror, not device based.
455      * This should be changed to be device based. Also atomic sequential
456      * reads should be somehow balanced.
457      */
458     
459     static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
460     {
        	/* pick the mirror index to read bh from.  side effects on every
        	 * path (label rb_out): the chosen mirror's head_position is set
        	 * to the end of this request, conf->last_used is updated and
        	 * conf->sect_count grows by the request size in sectors.
        	 */
461     	int new_disk = conf->last_used;
462     	const int sectors = bh->b_size >> 9;
463     	const unsigned long this_sector = bh->b_rsector;
464     	int disk = new_disk;
465     	unsigned long new_distance;
466     	unsigned long current_distance;
467     	
468     	/*
469     	 * Check if it is sane at all to balance: while resync_mirrors is
        	 * set we simply stay on last_used
470     	 */
471     	
472     	if (conf->resync_mirrors)
473     		goto rb_out;
474     	
475     
476     	/* make sure that disk is operational; walks the slots downwards,
        	 * wrapping from 0 to raid_disks-1 */
477     	while( !conf->mirrors[new_disk].operational) {
478     		if (new_disk <= 0) new_disk = conf->raid_disks;
479     		new_disk--;
480     		if (new_disk == disk) {
481     			/*
482     			 * This means no working disk was found
483     			 * Nothing much to do, lets not change anything
484     			 * and hope for the best...
485     			 */
486     			
487     			new_disk = conf->last_used;
488     
489     			goto rb_out;
490     		}
491     	}
492     	disk = new_disk;
493     	/* now disk == new_disk == starting point for search */
494     	
495     	/*
496     	 * Don't touch anything for sequential reads.
497     	 */
498     
499     	if (this_sector == conf->mirrors[new_disk].head_position)
500     		goto rb_out;
501     	
502     	/*
503     	 * If reads have been done only on a single disk
504     	 * for a time, lets give another disk a chance.
505     	 * This is for kicking those idling disks so that
506     	 * they would find work near some hotspot.
507     	 */
508     	
509     	if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
510     		conf->sect_count = 0;
511     
        		/* step to the next readable mirror; if we come all the
        		 * way round, stay on the one we started from
        		 */
512     		do {
513     			if (new_disk<=0)
514     				new_disk = conf->raid_disks;
515     			new_disk--;
516     			if (new_disk == disk)
517     				break;
518     		} while ((conf->mirrors[new_disk].write_only) ||
519     			 (!conf->mirrors[new_disk].operational));
520     
521     		goto rb_out;
522     	}
523     	
        	/* NOTE(review): this_sector is unsigned long; whether abs() copes
        	 * with the full width of the difference depends on how abs() is
        	 * defined in this tree - confirm.
        	 */
524     	current_distance = abs(this_sector -
525     				conf->mirrors[disk].head_position);
526     	
527     	/* Find the disk which is closest */
528     	
        	/* the scan ends when it reaches conf->last_used; write_only and
        	 * non-operational mirrors are skipped
        	 */
529     	do {
530     		if (disk <= 0)
531     			disk = conf->raid_disks;
532     		disk--;
533     		
534     		if ((conf->mirrors[disk].write_only) ||
535     				(!conf->mirrors[disk].operational))
536     			continue;
537     		
538     		new_distance = abs(this_sector -
539     					conf->mirrors[disk].head_position);
540     		
541     		if (new_distance < current_distance) {
542     			conf->sect_count = 0;
543     			current_distance = new_distance;
544     			new_disk = disk;
545     		}
546     	} while (disk != conf->last_used);
547     
548     rb_out:
549     	conf->mirrors[new_disk].head_position = this_sector + sectors;
550     
551     	conf->last_used = new_disk;
552     	conf->sect_count += sectors;
553     
554     	return new_disk;
555     }
556     
557     static int raid1_make_request (mddev_t *mddev, int rw,
558     			       struct buffer_head * bh)
559     {
        	/* entry point for one locked buffer_head aimed at the array.
        	 * READ/READA: a copy of bh is sent to the mirror chosen by
        	 * raid1_read_balance().  anything else is handled as a write: one
        	 * private buffer_head per operational mirror is submitted.
        	 * every path visible here returns 0; completion is reported
        	 * through raid1_end_request().
        	 */
560     	raid1_conf_t *conf = mddev_to_conf(mddev);
561     	struct buffer_head *bh_req, *bhl;
562     	struct raid1_bh * r1_bh;
563     	int disks = MD_SB_DISKS;
564     	int i, sum_bhs = 0;
565     	struct mirror_info *mirror;
566     
567     	if (!buffer_locked(bh))
568     		BUG();
569     	
570     /*
571      * make_request() can abort the operation when READA is being
572      * used and no empty request is available.
573      *
574      * Currently, just replace the command with READ/WRITE.
575      */
576     	if (rw == READA)
577     		rw = READ;
578     
        	/* may sleep; never returns NULL */
579     	r1_bh = raid1_alloc_r1bh (conf);
580     
        	/* wait until the sector lies outside [start_active, start_future),
        	 * then count the request in the matching region.  "future"
        	 * requests are tagged with the current phase so that
        	 * io_request_done() can tell which counter to drop later.
        	 */
581     	spin_lock_irq(&conf->segment_lock);
582     	wait_event_lock_irq(conf->wait_done,
583     			bh->b_rsector < conf->start_active ||
584     			bh->b_rsector >= conf->start_future,
585     			conf->segment_lock);
586     	if (bh->b_rsector < conf->start_active) 
587     		conf->cnt_done++;
588     	else {
589     		conf->cnt_future++;
590     		if (conf->phase)
591     			set_bit(R1BH_SyncPhase, &r1_bh->state);
592     	}
593     	spin_unlock_irq(&conf->segment_lock);
594     	
595     	/*
596     	 * i think the read and write branch should be separated completely,
597     	 * since we want to do read balancing on the read side for example.
598     	 * Alternative implementations? :) --mingo
599     	 */
600     
601     	r1_bh->master_bh = bh;
602     	r1_bh->mddev = mddev;
603     	r1_bh->cmd = rw;
604     
605     	if (rw == READ) {
606     		/*
607     		 * read balancing logic:
608     		 */
609     		mirror = conf->mirrors + raid1_read_balance(conf, bh);
610     
        		/* the request uses the bh embedded in r1_bh, started as a
        		 * byte copy of the master and then redirected to the mirror
        		 */
611     		bh_req = &r1_bh->bh_req;
612     		memcpy(bh_req, bh, sizeof(*bh));
613     		bh_req->b_blocknr = bh->b_rsector;
614     		bh_req->b_dev = mirror->dev;
615     		bh_req->b_rdev = mirror->dev;
616     	/*	bh_req->b_rsector = bh->n_rsector; */
617     		bh_req->b_end_io = raid1_end_request;
618     		bh_req->b_private = r1_bh;
619     		generic_make_request (rw, bh_req);
620     		return 0;
621     	}
622     
623     	/*
624     	 * WRITE:
625     	 */
626     
        	/* raid_disks buffer_heads are reserved, but the loop below looks
        	 * at all MD_SB_DISKS slots; running out trips MD_BUG() and stops
        	 * the loop
        	 */
627     	bhl = raid1_alloc_bh(conf, conf->raid_disks);
628     	for (i = 0; i < disks; i++) {
629     		struct buffer_head *mbh;
630     		if (!conf->mirrors[i].operational) 
631     			continue;
632      
633     	/*
634     	 * We should use a private pool (size depending on NR_REQUEST),
635     	 * to avoid writes filling up the memory with bhs
636     	 *
637      	 * Such pools are much faster than kmalloc anyways (so we waste
638      	 * almost nothing by not using the master bh when writing and
639      	 * win alot of cleanness) but for now we are cool enough. --mingo
640      	 *
641     	 * It's safe to sleep here, buffer heads cannot be used in a shared
642      	 * manner in the write branch. Look how we lock the buffer at the
643      	 * beginning of this function to grok the difference ;)
644     	 */
645      		mbh = bhl;
646     		if (mbh == NULL) {
647     			MD_BUG();
648     			break;
649     		}
650     		bhl = mbh->b_next;
651     		mbh->b_next = NULL;
652     		mbh->b_this_page = (struct buffer_head *)1;
653     		
654      	/*
655      	 * prepare mirrored mbh (fields ordered for max mem throughput):
        	 * it shares the master's data (b_page/b_data) and size
656      	 */
657     		mbh->b_blocknr    = bh->b_rsector;
658     		mbh->b_dev        = conf->mirrors[i].dev;
659     		mbh->b_rdev	  = conf->mirrors[i].dev;
660     		mbh->b_rsector	  = bh->b_rsector;
661     		mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
662     						(1<<BH_Mapped) | (1<<BH_Lock);
663     
664     		atomic_set(&mbh->b_count, 1);
665      		mbh->b_size       = bh->b_size;
666      		mbh->b_page	  = bh->b_page;
667      		mbh->b_data	  = bh->b_data;
668      		mbh->b_list       = BUF_LOCKED;
669      		mbh->b_end_io     = raid1_end_request;
670      		mbh->b_private    = r1_bh;
671     
        		/* push onto r1_bh's list; raid1_free_r1bh() releases it later */
672     		mbh->b_next = r1_bh->mirror_bh_list;
673     		r1_bh->mirror_bh_list = mbh;
674     		sum_bhs++;
675     	}
        	/* give back whatever was reserved but not needed */
676     	if (bhl) raid1_free_bh(conf,bhl);
677     	if (!sum_bhs) {
678     		/* Gag - all mirrors non-operational.. */
679     		raid1_end_bh_io(r1_bh, 0);
680     		return 0;
681     	}
        	/* must be set before the first submission: raid1_end_request()
        	 * decrements it on every completion
        	 */
682     	md_atomic_set(&r1_bh->remaining, sum_bhs);
683     
684     	/*
685     	 * We have to be a bit careful about the semaphore above, thats
686     	 * why we start the requests separately. Since kmalloc() could
687     	 * fail, sleep and make_request() can sleep too, this is the
688     	 * safer solution. Imagine, end_request decreasing the semaphore
689     	 * before we could have set it up ... We could play tricks with
690     	 * the semaphore (presetting it and correcting at the end if
691     	 * sum_bhs is not 'n' but we have to do end_request by hand if
692     	 * all requests finish until we had a chance to set up the
693     	 * semaphore correctly ... lots of races).
        	 *
        	 * (the "semaphore" here presumably means the r1_bh->remaining
        	 * counter set just above - confirm)
694     	 */
        	/* b_next is read before submitting because the bh may already be
        	 * completed and released once generic_make_request() returns
        	 */
695     	bh = r1_bh->mirror_bh_list;
696     	while(bh) {
697     		struct buffer_head *bh2 = bh;
698     		bh = bh->b_next;
699     		generic_make_request(rw, bh2);
700     	}
701     	return (0);
702     }
703     
704     static int raid1_status (char *page, mddev_t *mddev)
705     {
706     	/* append " [raid/working] [U_...]" to page; returns the bytes written */
707     	raid1_conf_t *conf = mddev_to_conf(mddev);
708     	char *out = page;
709     	int slot;
710     
711     	out += sprintf (out, " [%d/%d] [", conf->raid_disks, conf->working_disks);
712     	for (slot = 0; slot < conf->raid_disks; slot++)
713     		out += sprintf (out, "%s", conf->mirrors[slot].operational ? "U" : "_");
714     	out += sprintf (out, "]");
715     	return out - page;
716     }
717     
718     #define LAST_DISK KERN_ALERT \
719     "raid1: only one disk left and IO error.\n"
720     
        /*
         * LAST_DISK above and the macros below are printk() message templates
         * (log level + text).  In the code between here and the #undef lines,
         * only DISK_FAILED is referenced, by mark_disk_bad(); it takes a device
         * name (%s) and the remaining working-disk count (%d).
         */
721     #define NO_SPARE_DISK KERN_ALERT \
722     "raid1: no spare disk left, degrading mirror level by one.\n"
723     
724     #define DISK_FAILED KERN_ALERT \
725     "raid1: Disk failure on %s, disabling device. \n" \
726     "	Operation continuing on %d devices\n"
727     
728     #define START_SYNCING KERN_ALERT \
729     "raid1: start syncing spare disk.\n"
730     
731     #define ALREADY_SYNCING KERN_INFO \
732     "raid1: syncing already in progress.\n"
733     
734     static void mark_disk_bad (mddev_t *mddev, int failed)
735     {
        	/* take mirror slot "failed" out of service: clear its operational
        	 * flag, update the superblock descriptor and counters, mark the
        	 * superblock dirty, wake the array's thread and log the failure.
        	 */
736     	raid1_conf_t *conf = mddev_to_conf(mddev);
737     	struct mirror_info *mirror = conf->mirrors+failed;
738     	mdp_super_t *sb = mddev->sb;
739     
740     	mirror->operational = 0;
741     	mark_disk_faulty(sb->disks+mirror->number);
742     	mark_disk_nonsync(sb->disks+mirror->number);
743     	mark_disk_inactive(sb->disks+mirror->number);
        	/* a write_only mirror is not counted in sb->active_disks or in
        	 * conf->working_disks, so those two are left alone for it
        	 */
744     	if (!mirror->write_only)
745     		sb->active_disks--;
746     	sb->working_disks--;
747     	sb->failed_disks++;
748     	mddev->sb_dirty = 1;
749     	md_wakeup_thread(conf->thread);
750     	if (!mirror->write_only)
751     		conf->working_disks--;
752     	printk (DISK_FAILED, partition_name (mirror->dev),
753     				 conf->working_disks);
754     }
755     
756     static int raid1_error (mddev_t *mddev, kdev_t dev)
757     {
758     	raid1_conf_t *conf = mddev_to_conf(mddev);
759     	int slot;
760     
761     	/*
762     	 * Look for an operational mirror backed by "dev".  A device that
763     	 * is already non-operational was marked dead earlier, so there is
764     	 * nothing left to do for it.
765     	 */
766     	for (slot = 0; slot < MD_SB_DISKS; slot++) {
767     		if (!conf->mirrors[slot].operational)
768     			continue;
769     		if (conf->mirrors[slot].dev == dev)
770     			break;
771     	}
772     	if (slot == MD_SB_DISKS)
773     		return 0;
774     
775     	/*
776     	 * Never fail the last working disk of the array proper: behave
777     	 * like a plain single drive and return 1 so the next level up
778     	 * gets to know about the error.
779     	 */
780     	if (conf->working_disks == 1 && slot < conf->raid_disks)
781     		return 1;
782     
783     	mark_disk_bad(mddev, slot);
784     	return 0;
785     }
786     
787     #undef LAST_DISK
788     #undef NO_SPARE_DISK
789     #undef DISK_FAILED
790     #undef START_SYNCING
        /* NOTE(review): ALREADY_SYNCING is defined with the group above but is
         * not undefined here - confirm whether later code still relies on it.
         */
791     
792     
793     static void print_raid1_conf (raid1_conf_t *conf)
794     {
795     	/* dump the array summary and every mirror slot to the kernel log */
796     	int slot;
797     
798     	printk("RAID1 conf printout:\n");
799     	if (conf == NULL) {
800     		printk("(conf==NULL)\n");
801     		return;
802     	}
803     	printk(" --- wd:%d rd:%d nd:%d\n",
804     		conf->working_disks, conf->raid_disks, conf->nr_disks);
805     
806     	for (slot = 0; slot < MD_SB_DISKS; slot++) {
807     		struct mirror_info *m = &conf->mirrors[slot];
808     
809     		printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
810     			slot, m->spare, m->operational, m->number,
811     			m->raid_disk, m->used_slot, partition_name(m->dev));
812     	}
813     }
814     
815     static void close_sync(raid1_conf_t *conf)
816     {
817     	mddev_t *mddev = conf->mddev;
818     	/* If reconstruction was interrupted, we need to close the "active" and "pending"
819     	 * holes.
820     	 * we know that there are no active rebuild requests, so cnt_active == cnt_ready ==0
        	 *
        	 * The function waits on conf->wait_ready twice, so it can sleep;
        	 * segment_lock is taken with spin_lock_irq and released with
        	 * spin_unlock_irq, which leaves interrupts enabled on return.
821     	 */
822     	/* this is really needed when recovery stops too... */
823     	spin_lock_irq(&conf->segment_lock);
        	/* step 1: collapse the active/ready windows onto start_pending
        	 * and wait for the pending requests to drain
        	 */
824     	conf->start_active = conf->start_pending;
825     	conf->start_ready = conf->start_pending;
826     	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
        	/* step 2: turn the old "future" requests into the new "pending"
        	 * set, flip the phase so they are accounted as pending when they
        	 * finish (see io_request_done()), and wait for them as well
        	 */
827     	conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
828     	conf->start_future = mddev->sb->size+1;
829     	conf->cnt_pending = conf->cnt_future;
830     	conf->cnt_future = 0;
831     	conf->phase = conf->phase ^1;
832     	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
        	/* step 3: reset every boundary to 0 and phase to 0; what was
        	 * counted as "done" becomes the new "future" count
        	 */
833     	conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
834     	conf->phase = 0;
835     	conf->cnt_future = conf->cnt_done;;
836     	conf->cnt_done = 0;
837     	spin_unlock_irq(&conf->segment_lock);
838     	wake_up(&conf->wait_done);
839     }
840     
841     static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
842     {
843     	int err = 0;
844     	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
845     	raid1_conf_t *conf = mddev->private;
846     	struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
847     	mdp_super_t *sb = mddev->sb;
848     	mdp_disk_t *failed_desc, *spare_desc, *added_desc;
849     	mdk_rdev_t *spare_rdev, *failed_rdev;
850     
851     	print_raid1_conf(conf);
852     	md_spin_lock_irq(&conf->device_lock);
853     	/*
854     	 * find the disk ...
855     	 */
856     	switch (state) {
857     
858     	case DISKOP_SPARE_ACTIVE:
859     
860     		/*
861     		 * Find the failed disk within the RAID1 configuration ...
862     		 * (this can only be in the first conf->working_disks part)
863     		 */
864     		for (i = 0; i < conf->raid_disks; i++) {
865     			tmp = conf->mirrors + i;
866     			if ((!tmp->operational && !tmp->spare) ||
867     					!tmp->used_slot) {
868     				failed_disk = i;
869     				break;
870     			}
871     		}
872     		/*
873     		 * When we activate a spare disk we _must_ have a disk in
874     		 * the lower (active) part of the array to replace. 
875     		 */
876     		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
877     			MD_BUG();
878     			err = 1;
879     			goto abort;
880     		}
881     		/* fall through */
882     
883     	case DISKOP_SPARE_WRITE:
884     	case DISKOP_SPARE_INACTIVE:
885     
886     		/*
887     		 * Find the spare disk ... (can only be in the 'high'
888     		 * area of the array)
889     		 */
890     		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
891     			tmp = conf->mirrors + i;
892     			if (tmp->spare && tmp->number == (*d)->number) {
893     				spare_disk = i;
894     				break;
895     			}
896     		}
897     		if (spare_disk == -1) {
898     			MD_BUG();
899     			err = 1;
900     			goto abort;
901     		}
902     		break;
903     
904     	case DISKOP_HOT_REMOVE_DISK:
905     
906     		for (i = 0; i < MD_SB_DISKS; i++) {
907     			tmp = conf->mirrors + i;
908     			if (tmp->used_slot && (tmp->number == (*d)->number)) {
909     				if (tmp->operational) {
910     					err = -EBUSY;
911     					goto abort;
912     				}
913     				removed_disk = i;
914     				break;
915     			}
916     		}
917     		if (removed_disk == -1) {
918     			MD_BUG();
919     			err = 1;
920     			goto abort;
921     		}
922     		break;
923     
924     	case DISKOP_HOT_ADD_DISK:
925     
926     		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
927     			tmp = conf->mirrors + i;
928     			if (!tmp->used_slot) {
929     				added_disk = i;
930     				break;
931     			}
932     		}
933     		if (added_disk == -1) {
934     			MD_BUG();
935     			err = 1;
936     			goto abort;
937     		}
938     		break;
939     	}
940     
941     	switch (state) {
942     	/*
943     	 * Switch the spare disk to write-only mode:
944     	 */
945     	case DISKOP_SPARE_WRITE:
946     		sdisk = conf->mirrors + spare_disk;
947     		sdisk->operational = 1;
948     		sdisk->write_only = 1;
949     		break;
950     	/*
951     	 * Deactivate a spare disk:
952     	 */
953     	case DISKOP_SPARE_INACTIVE:
954     		close_sync(conf);
955     		sdisk = conf->mirrors + spare_disk;
956     		sdisk->operational = 0;
957     		sdisk->write_only = 0;
958     		break;
959     	/*
960     	 * Activate (mark read-write) the (now sync) spare disk,
961     	 * which means we switch it's 'raid position' (->raid_disk)
962     	 * with the failed disk. (only the first 'conf->nr_disks'
963     	 * slots are used for 'real' disks and we must preserve this
964     	 * property)
965     	 */
966     	case DISKOP_SPARE_ACTIVE:
967     		close_sync(conf);
968     		sdisk = conf->mirrors + spare_disk;
969     		fdisk = conf->mirrors + failed_disk;
970     
971     		spare_desc = &sb->disks[sdisk->number];
972     		failed_desc = &sb->disks[fdisk->number];
973     
974     		if (spare_desc != *d) {
975     			MD_BUG();
976     			err = 1;
977     			goto abort;
978     		}
979     
980     		if (spare_desc->raid_disk != sdisk->raid_disk) {
981     			MD_BUG();
982     			err = 1;
983     			goto abort;
984     		}
985     			
986     		if (sdisk->raid_disk != spare_disk) {
987     			MD_BUG();
988     			err = 1;
989     			goto abort;
990     		}
991     
992     		if (failed_desc->raid_disk != fdisk->raid_disk) {
993     			MD_BUG();
994     			err = 1;
995     			goto abort;
996     		}
997     
998     		if (fdisk->raid_disk != failed_disk) {
999     			MD_BUG();
1000     			err = 1;
1001     			goto abort;
1002     		}
1003     
1004     		/*
1005     		 * do the switch finally
1006     		 */
1007     		spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1008     		failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1009     
1010     		/* There must be a spare_rdev, but there may not be a
1011     		 * failed_rdev.  That slot might be empty...
1012     		 */
1013     		spare_rdev->desc_nr = failed_desc->number;
1014     		if (failed_rdev)
1015     			failed_rdev->desc_nr = spare_desc->number;
1016     		
1017     		xchg_values(*spare_desc, *failed_desc);
1018     		xchg_values(*fdisk, *sdisk);
1019     
1020     		/*
1021     		 * (careful, 'failed' and 'spare' are switched from now on)
1022     		 *
1023     		 * we want to preserve linear numbering and we want to
1024     		 * give the proper raid_disk number to the now activated
1025     		 * disk. (this means we switch back these values)
1026     		 */
1027     	
1028     		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1029     		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1030     		xchg_values(spare_desc->number, failed_desc->number);
1031     		xchg_values(sdisk->number, fdisk->number);
1032     
1033     		*d = failed_desc;
1034     
1035     		if (sdisk->dev == MKDEV(0,0))
1036     			sdisk->used_slot = 0;
1037     		/*
1038     		 * this really activates the spare.
1039     		 */
1040     		fdisk->spare = 0;
1041     		fdisk->write_only = 0;
1042     
1043     		/*
1044     		 * if we activate a spare, we definitely replace a
1045     		 * non-operational disk slot in the 'low' area of
1046     		 * the disk array.
1047     		 */
1048     
1049     		conf->working_disks++;
1050     
1051     		break;
1052     
1053     	case DISKOP_HOT_REMOVE_DISK:
1054     		rdisk = conf->mirrors + removed_disk;
1055     
1056     		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1057     			MD_BUG();	
1058     			err = 1;
1059     			goto abort;
1060     		}
1061     		rdisk->dev = MKDEV(0,0);
1062     		rdisk->used_slot = 0;
1063     		conf->nr_disks--;
1064     		break;
1065     
1066     	case DISKOP_HOT_ADD_DISK:
1067     		adisk = conf->mirrors + added_disk;
1068     		added_desc = *d;
1069     
1070     		if (added_disk != added_desc->number) {
1071     			MD_BUG();	
1072     			err = 1;
1073     			goto abort;
1074     		}
1075     
1076     		adisk->number = added_desc->number;
1077     		adisk->raid_disk = added_desc->raid_disk;
1078     		adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1079     
1080     		adisk->operational = 0;
1081     		adisk->write_only = 0;
1082     		adisk->spare = 1;
1083     		adisk->used_slot = 1;
1084     		adisk->head_position = 0;
1085     		conf->nr_disks++;
1086     
1087     		break;
1088     
1089     	default:
1090     		MD_BUG();	
1091     		err = 1;
1092     		goto abort;
1093     	}
1094     abort:
1095     	md_spin_unlock_irq(&conf->device_lock);
1096     	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1097     		/* should move to "END_REBUILD" when such exists */
1098     		raid1_shrink_buffers(conf);
1099     
1100     	print_raid1_conf(conf);
1101     	return err;
1102     }
1103     
1104     
/*
 * printk() formats used by raid1d below.  Each expects a device name
 * string followed by an unsigned long block/sector number.
 */
#define IO_ERROR \
	KERN_ALERT "raid1: %s: unrecoverable I/O read error for block %lu\n"
#define REDIRECT_SECTOR \
	KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n"
1110     
1111     /*
1112      * This is a kernel thread which:
1113      *
1114      *	1.	Retries failed read operations on working mirrors.
1115      *	2.	Updates the raid superblock when problems encounter.
1116      *	3.	Performs writes following reads for array syncronising.
1117      */
1118     static void end_sync_write(struct buffer_head *bh, int uptodate);
1119     static void end_sync_read(struct buffer_head *bh, int uptodate);
1120     
1121     static void raid1d (void *data)
1122     {
1123     	struct raid1_bh *r1_bh;
1124     	struct buffer_head *bh;
1125     	unsigned long flags;
1126     	mddev_t *mddev;
1127     	kdev_t dev;
1128     
1129     
1130     	for (;;) {
1131     		md_spin_lock_irqsave(&retry_list_lock, flags);
1132     		r1_bh = raid1_retry_list;
1133     		if (!r1_bh)
1134     			break;
1135     		raid1_retry_list = r1_bh->next_r1;
1136     		md_spin_unlock_irqrestore(&retry_list_lock, flags);
1137     
1138     		mddev = r1_bh->mddev;
1139     		if (mddev->sb_dirty) {
1140     			printk(KERN_INFO "raid1: dirty sb detected, updating.\n");
1141     			mddev->sb_dirty = 0;
1142     			md_update_sb(mddev);
1143     		}
1144     		bh = &r1_bh->bh_req;
1145     		switch(r1_bh->cmd) {
1146     		case SPECIAL:
1147     			/* have to allocate lots of bh structures and
1148     			 * schedule writes
1149     			 */
1150     			if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1151     				int i, sum_bhs = 0;
1152     				int disks = MD_SB_DISKS;
1153     				struct buffer_head *bhl, *mbh;
1154     				raid1_conf_t *conf;
1155     				
1156     				conf = mddev_to_conf(mddev);
1157     				bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1158     				for (i = 0; i < disks ; i++) {
1159     					if (!conf->mirrors[i].operational)
1160     						continue;
1161     					if (i==conf->last_used)
1162     						/* we read from here, no need to write */
1163     						continue;
1164     					if (i < conf->raid_disks
1165     					    && !conf->resync_mirrors)
1166     						/* don't need to write this,
1167     						 * we are just rebuilding */
1168     						continue;
1169     					mbh = bhl;
1170     					if (!mbh) {
1171     						MD_BUG();
1172     						break;
1173     					}
1174     					bhl = mbh->b_next;
1175     					mbh->b_this_page = (struct buffer_head *)1;
1176     
1177     						
1178     				/*
1179     				 * prepare mirrored bh (fields ordered for max mem throughput):
1180     				 */
1181     					mbh->b_blocknr    = bh->b_blocknr;
1182     					mbh->b_dev        = conf->mirrors[i].dev;
1183     					mbh->b_rdev	  = conf->mirrors[i].dev;
1184     					mbh->b_rsector	  = bh->b_blocknr;
1185     					mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1186     						(1<<BH_Mapped) | (1<<BH_Lock);
1187     					atomic_set(&mbh->b_count, 1);
1188     					mbh->b_size       = bh->b_size;
1189     					mbh->b_page	  = bh->b_page;
1190     					mbh->b_data	  = bh->b_data;
1191     					mbh->b_list       = BUF_LOCKED;
1192     					mbh->b_end_io     = end_sync_write;
1193     					mbh->b_private    = r1_bh;
1194     
1195     					mbh->b_next = r1_bh->mirror_bh_list;
1196     					r1_bh->mirror_bh_list = mbh;
1197     
1198     					sum_bhs++;
1199     				}
1200     				md_atomic_set(&r1_bh->remaining, sum_bhs);
1201     				if (bhl) raid1_free_bh(conf, bhl);
1202     				mbh = r1_bh->mirror_bh_list;
1203     
1204     				if (!sum_bhs) {
1205     					/* nowhere to write this too... I guess we
1206     					 * must be done
1207     					 */
1208     					sync_request_done(bh->b_blocknr, conf);
1209     					md_done_sync(mddev, bh->b_size>>9, 0);
1210     					raid1_free_buf(r1_bh);
1211     				} else
1212     				while (mbh) {
1213     					struct buffer_head *bh1 = mbh;
1214     					mbh = mbh->b_next;
1215     					generic_make_request(WRITE, bh1);
1216     					md_sync_acct(bh1->b_dev, bh1->b_size/512);
1217     				}
1218     			} else {
1219     				/* There is no point trying a read-for-reconstruct
1220     				 * as reconstruct is about to be aborted
1221     				 */
1222     
1223     				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1224     				md_done_sync(mddev, bh->b_size>>9, 0);
1225     			}
1226     
1227     			break;
1228     		case READ:
1229     		case READA:
1230     			dev = bh->b_dev;
1231     			raid1_map (mddev, &bh->b_dev);
1232     			if (bh->b_dev == dev) {
1233     				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1234     				raid1_end_bh_io(r1_bh, 0);
1235     			} else {
1236     				printk (REDIRECT_SECTOR,
1237     					partition_name(bh->b_dev), bh->b_blocknr);
1238     				bh->b_rdev = bh->b_dev;
1239     				bh->b_rsector = bh->b_blocknr;
1240     				generic_make_request (r1_bh->cmd, bh);
1241     			}
1242     			break;
1243     		}
1244     	}
1245     	md_spin_unlock_irqrestore(&retry_list_lock, flags);
1246     }
1247     #undef IO_ERROR
1248     #undef REDIRECT_SECTOR
1249     
1250     /*
1251      * Private kernel thread to reconstruct mirrors after an unclean
1252      * shutdown.
1253      */
1254     static void raid1syncd (void *data)
1255     {
1256     	raid1_conf_t *conf = data;
1257     	mddev_t *mddev = conf->mddev;
1258     
1259     	if (!conf->resync_mirrors)
1260     		return;
1261     	if (conf->resync_mirrors == 2)
1262     		return;
1263     	down(&mddev->recovery_sem);
1264     	if (!md_do_sync(mddev, NULL)) {
1265     		/*
1266     		 * Only if everything went Ok.
1267     		 */
1268     		conf->resync_mirrors = 0;
1269     	}
1270     
1271     	close_sync(conf);
1272     
1273     	up(&mddev->recovery_sem);
1274     	raid1_shrink_buffers(conf);
1275     }
1276     
1277     /*
1278      * perform a "sync" on one "block"
1279      *
1280      * We need to make sure that no normal I/O request - particularly write
1281      * requests - conflict with active sync requests.
1282      * This is achieved by conceptually dividing the device space into a
1283      * number of sections:
1284      *  DONE: 0 .. a-1     These blocks are in-sync
1285      *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1286      *                     no normal IO requests
1287      *  READY: b .. c-1    These blocks have no normal IO requests - sync
1288      *                     request may be happening
1289      *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1290      *                     ones will be added
1291      *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1292      *                     be happening, but not sync
1293      *
1294      * We keep a
1295      *   phase    which flips (0 or 1) each time d moves and
1296      * a count of:
1297      *   z =  active io requests in FUTURE since d moved - marked with
1298      *        current phase
1299      *   y =  active io requests in FUTURE before d moved, or PENDING -
1300      *        marked with previous phase
1301      *   x =  active sync requests in READY
1302      *   w =  active sync requests in ACTIVE
1303      *   v =  active io requests in DONE
1304      *
1305      * Normally, a=b=c=d=0 and z= active io requests
1306      *   or a=b=c=d=END and v= active io requests
1307      * Allowed changes to a,b,c,d:
1308      * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1309      * B:  y==0 -> c=d
1310      * C:   b=c, w+=x, x=0
1311      * D:  w==0 -> a=b
1312      * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1313      *
1314      * At start of sync we apply A.
1315      * When y reaches 0, we apply B then A, then begin sync requests.
1316      * When the sync point reaches c-1, we wait for y==0 and w==0, and
1317      * then apply B then A then D then C.
1318      * Finally, we apply E
1319      *
1320      * The sync request simply issues a "read" against a working drive
1321      * This is marked so that on completion the raid1d thread is woken to
1322      * issue suitable write requests
1323      */
1324     
1325     static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1326     {
1327     	raid1_conf_t *conf = mddev_to_conf(mddev);
1328     	struct mirror_info *mirror;
1329     	struct raid1_bh *r1_bh;
1330     	struct buffer_head *bh;
1331     	int bsize;
1332     	int disk;
1333     	int block_nr;
1334     
1335     	spin_lock_irq(&conf->segment_lock);
1336     	if (!sector_nr) {
1337     		/* initialize ...*/
1338     		int buffs;
1339     		conf->start_active = 0;
1340     		conf->start_ready = 0;
1341     		conf->start_pending = 0;
1342     		conf->start_future = 0;
1343     		conf->phase = 0;
1344     		/* we want enough buffers to hold twice the window of 128*/
1345     		buffs = 128 *2 / (PAGE_SIZE>>9);
1346     		buffs = raid1_grow_buffers(conf, buffs);
1347     		if (buffs < 2)
1348     			goto nomem;
1349     		
1350     		conf->window = buffs*(PAGE_SIZE>>9)/2;
1351     		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1352     		conf->cnt_done = conf->cnt_pending = 0;
1353     		if (conf->cnt_ready || conf->cnt_active)
1354     			MD_BUG();
1355     	}
1356     	while (sector_nr >= conf->start_pending) {
1357     		PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1358     			sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1359     			conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1360     		wait_event_lock_irq(conf->wait_done,
1361     					!conf->cnt_active,
1362     					conf->segment_lock);
1363     		wait_event_lock_irq(conf->wait_ready,
1364     					!conf->cnt_pending,
1365     					conf->segment_lock);
1366     		conf->start_active = conf->start_ready;
1367     		conf->start_ready = conf->start_pending;
1368     		conf->start_pending = conf->start_future;
1369     		conf->start_future = conf->start_future+conf->window;
1370     		// Note: falling off the end is not a problem
1371     		conf->phase = conf->phase ^1;
1372     		conf->cnt_active = conf->cnt_ready;
1373     		conf->cnt_ready = 0;
1374     		conf->cnt_pending = conf->cnt_future;
1375     		conf->cnt_future = 0;
1376     		wake_up(&conf->wait_done);
1377     	}
1378     	conf->cnt_ready++;
1379     	spin_unlock_irq(&conf->segment_lock);
1380     		
1381     
1382     	/* If reconstructing, and >1 working disc,
1383     	 * could dedicate one to rebuild and others to
1384     	 * service read requests ..
1385     	 */
1386     	disk = conf->last_used;
1387     	/* make sure disk is operational */
1388     	while (!conf->mirrors[disk].operational) {
1389     		if (disk <= 0) disk = conf->raid_disks;
1390     		disk--;
1391     		if (disk == conf->last_used)
1392     			break;
1393     	}
1394     	conf->last_used = disk;
1395     	
1396     	mirror = conf->mirrors+conf->last_used;
1397     	
1398     	r1_bh = raid1_alloc_buf (conf);
1399     	r1_bh->master_bh = NULL;
1400     	r1_bh->mddev = mddev;
1401     	r1_bh->cmd = SPECIAL;
1402     	bh = &r1_bh->bh_req;
1403     
1404     	block_nr = sector_nr;
1405     	bsize = 512;
1406     	while (!(block_nr & 1) && bsize < PAGE_SIZE
1407     			&& (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
1408     		block_nr >>= 1;
1409     		bsize <<= 1;
1410     	}
1411     	bh->b_size = bsize;
1412     	bh->b_list = BUF_LOCKED;
1413     	bh->b_dev = mirror->dev;
1414     	bh->b_rdev = mirror->dev;
1415     	bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1416     	if (!bh->b_page)
1417     		BUG();
1418     	if (!bh->b_data)
1419     		BUG();
1420     	if (bh->b_data != page_address(bh->b_page))
1421     		BUG();
1422     	bh->b_end_io = end_sync_read;
1423     	bh->b_private = r1_bh;
1424     	bh->b_blocknr = sector_nr;
1425     	bh->b_rsector = sector_nr;
1426     	init_waitqueue_head(&bh->b_wait);
1427     
1428     	generic_make_request(READ, bh);
1429     	md_sync_acct(bh->b_dev, bh->b_size/512);
1430     
1431     	return (bsize >> 9);
1432     
1433     nomem:
1434     	raid1_shrink_buffers(conf);
1435     	spin_unlock_irq(&conf->segment_lock);
1436     	return -ENOMEM;
1437     }
1438     
1439     static void end_sync_read(struct buffer_head *bh, int uptodate)
1440     {
1441     	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1442     
1443     	/* we have read a block, now it needs to be re-written,
1444     	 * or re-read if the read failed.
1445     	 * We don't do much here, just schedule handling by raid1d
1446     	 */
1447     	if (!uptodate)
1448     		md_error (r1_bh->mddev, bh->b_dev);
1449     	else
1450     		set_bit(R1BH_Uptodate, &r1_bh->state);
1451     	raid1_reschedule_retry(r1_bh);
1452     }
1453     
1454     static void end_sync_write(struct buffer_head *bh, int uptodate)
1455     {
1456      	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1457     	
1458     	if (!uptodate)
1459      		md_error (r1_bh->mddev, bh->b_dev);
1460     	if (atomic_dec_and_test(&r1_bh->remaining)) {
1461     		mddev_t *mddev = r1_bh->mddev;
1462      		unsigned long sect = bh->b_blocknr;
1463     		int size = bh->b_size;
1464     		raid1_free_buf(r1_bh);
1465     		sync_request_done(sect, mddev_to_conf(mddev));
1466     		md_done_sync(mddev,size>>9, uptodate);
1467     	}
1468     }
1469     
/*
 * printk() formats for array start-up.  Each is a log-level prefix
 * followed by the message text.
 */
#define INVALID_LEVEL \
	KERN_WARNING "raid1: md%d: raid level not set to mirroring (%d)\n"
#define NO_SB \
	KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n"
#define ERRORS \
	KERN_ERR "raid1: disabled mirror %s (errors detected)\n"
#define NOT_IN_SYNC \
	KERN_ERR "raid1: disabled mirror %s (not in sync)\n"
#define INCONSISTENT \
	KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n"
#define ALREADY_RUNNING \
	KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n"
#define OPERATIONAL \
	KERN_INFO "raid1: device %s operational as mirror %d\n"
#define MEM_ERROR \
	KERN_ERR "raid1: couldn't allocate memory for md%d\n"
#define SPARE \
	KERN_INFO "raid1: spare disk %s\n"
#define NONE_OPERATIONAL \
	KERN_ERR "raid1: no operational mirrors for md%d\n"
#define ARRAY_IS_ACTIVE \
	KERN_INFO "raid1: raid set md%d active with %d out of %d mirrors\n"
#define THREAD_ERROR \
	KERN_ERR "raid1: couldn't allocate thread for md%d\n"
#define START_RESYNC \
	KERN_WARNING "raid1: raid set md%d not clean; reconstructing mirrors\n"
1508     
1509     static int raid1_run (mddev_t *mddev)
1510     {
1511     	raid1_conf_t *conf;
1512     	int i, j, disk_idx;
1513     	struct mirror_info *disk;
1514     	mdp_super_t *sb = mddev->sb;
1515     	mdp_disk_t *descriptor;
1516     	mdk_rdev_t *rdev;
1517     	struct md_list_head *tmp;
1518     	int start_recovery = 0;
1519     
1520     	MOD_INC_USE_COUNT;
1521     
1522     	if (sb->level != 1) {
1523     		printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1524     		goto out;
1525     	}
1526     	/*
1527     	 * copy the already verified devices into our private RAID1
1528     	 * bookkeeping area. [whatever we allocate in raid1_run(),
1529     	 * should be freed in raid1_stop()]
1530     	 */
1531     
1532     	conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1533     	mddev->private = conf;
1534     	if (!conf) {
1535     		printk(MEM_ERROR, mdidx(mddev));
1536     		goto out;
1537     	}
1538     	memset(conf, 0, sizeof(*conf));
1539     
1540     	ITERATE_RDEV(mddev,rdev,tmp) {
1541     		if (rdev->faulty) {
1542     			printk(ERRORS, partition_name(rdev->dev));
1543     		} else {
1544     			if (!rdev->sb) {
1545     				MD_BUG();
1546     				continue;
1547     			}
1548     		}
1549     		if (rdev->desc_nr == -1) {
1550     			MD_BUG();
1551     			continue;
1552     		}
1553     		descriptor = &sb->disks[rdev->desc_nr];
1554     		disk_idx = descriptor->raid_disk;
1555     		disk = conf->mirrors + disk_idx;
1556     
1557     		if (disk_faulty(descriptor)) {
1558     			disk->number = descriptor->number;
1559     			disk->raid_disk = disk_idx;
1560     			disk->dev = rdev->dev;
1561     			disk->sect_limit = MAX_WORK_PER_DISK;
1562     			disk->operational = 0;
1563     			disk->write_only = 0;
1564     			disk->spare = 0;
1565     			disk->used_slot = 1;
1566     			disk->head_position = 0;
1567     			continue;
1568     		}
1569     		if (disk_active(descriptor)) {
1570     			if (!disk_sync(descriptor)) {
1571     				printk(NOT_IN_SYNC,
1572     					partition_name(rdev->dev));
1573     				continue;
1574     			}
1575     			if ((descriptor->number > MD_SB_DISKS) ||
1576     					 (disk_idx > sb->raid_disks)) {
1577     
1578     				printk(INCONSISTENT,
1579     					partition_name(rdev->dev));
1580     				continue;
1581     			}
1582     			if (disk->operational) {
1583     				printk(ALREADY_RUNNING,
1584     					partition_name(rdev->dev),
1585     					disk_idx);
1586     				continue;
1587     			}
1588     			printk(OPERATIONAL, partition_name(rdev->dev),
1589      					disk_idx);
1590     			disk->number = descriptor->number;
1591     			disk->raid_disk = disk_idx;
1592     			disk->dev = rdev->dev;
1593     			disk->sect_limit = MAX_WORK_PER_DISK;
1594     			disk->operational = 1;
1595     			disk->write_only = 0;
1596     			disk->spare = 0;
1597     			disk->used_slot = 1;
1598     			disk->head_position = 0;
1599     			conf->working_disks++;
1600     		} else {
1601     		/*
1602     		 * Must be a spare disk ..
1603     		 */
1604     			printk(SPARE, partition_name(rdev->dev));
1605     			disk->number = descriptor->number;
1606     			disk->raid_disk = disk_idx;
1607     			disk->dev = rdev->dev;
1608     			disk->sect_limit = MAX_WORK_PER_DISK;
1609     			disk->operational = 0;
1610     			disk->write_only = 0;
1611     			disk->spare = 1;
1612     			disk->used_slot = 1;
1613     			disk->head_position = 0;
1614     		}
1615     	}
1616     	conf->raid_disks = sb->raid_disks;
1617     	conf->nr_disks = sb->nr_disks;
1618     	conf->mddev = mddev;
1619     	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1620     
1621     	conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1622     	init_waitqueue_head(&conf->wait_buffer);
1623     	init_waitqueue_head(&conf->wait_done);
1624     	init_waitqueue_head(&conf->wait_ready);
1625     
1626     	if (!conf->working_disks) {
1627     		printk(NONE_OPERATIONAL, mdidx(mddev));
1628     		goto out_free_conf;
1629     	}
1630     
1631     
1632     	/* pre-allocate some buffer_head structures.
1633     	 * As a minimum, 1 r1bh and raid_disks buffer_heads
1634     	 * would probably get us by in tight memory situations,
1635     	 * but a few more is probably a good idea.
1636     	 * For now, try NR_RESERVED_BUFS r1bh and
1637     	 * NR_RESERVED_BUFS*raid_disks bufferheads
1638     	 * This will allow at least NR_RESERVED_BUFS concurrent
1639     	 * reads or writes even if kmalloc starts failing
1640     	 */
1641     	if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1642     	    raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1643     	                      < NR_RESERVED_BUFS*conf->raid_disks) {
1644     		printk(MEM_ERROR, mdidx(mddev));
1645     		goto out_free_conf;
1646     	}
1647     
1648     	for (i = 0; i < MD_SB_DISKS; i++) {
1649     		
1650     		descriptor = sb->disks+i;
1651     		disk_idx = descriptor->raid_disk;
1652     		disk = conf->mirrors + disk_idx;
1653     
1654     		if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1655     				!disk->used_slot) {
1656     
1657     			disk->number = descriptor->number;
1658     			disk->raid_disk = disk_idx;
1659     			disk->dev = MKDEV(0,0);
1660     
1661     			disk->operational = 0;
1662     			disk->write_only = 0;
1663     			disk->spare = 0;
1664     			disk->used_slot = 1;
1665     			disk->head_position = 0;
1666     		}
1667     	}
1668     
1669     	/*
1670     	 * find the first working one and use it as a starting point
1671     	 * to read balancing.
1672     	 */
1673     	for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
1674     		/* nothing */;
1675     	conf->last_used = j;
1676     
1677     
1678     	if (conf->working_disks != sb->raid_disks) {
1679     		printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1680     		start_recovery = 1;
1681     	}
1682     
1683     	{
1684     		const char * name = "raid1d";
1685     
1686     		conf->thread = md_register_thread(raid1d, conf, name);
1687     		if (!conf->thread) {
1688     			printk(THREAD_ERROR, mdidx(mddev));
1689     			goto out_free_conf;
1690     		}
1691     	}
1692     
1693     	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
1694     		const char * name = "raid1syncd";
1695     
1696     		conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1697     		if (!conf->resync_thread) {
1698     			printk(THREAD_ERROR, mdidx(mddev));
1699     			goto out_free_conf;
1700     		}
1701     
1702     		printk(START_RESYNC, mdidx(mddev));
1703     		conf->resync_mirrors = 1;
1704     		md_wakeup_thread(conf->resync_thread);
1705     	}
1706     
1707     	/*
1708     	 * Regenerate the "device is in sync with the raid set" bit for
1709     	 * each device.
1710     	 */
1711     	for (i = 0; i < MD_SB_DISKS; i++) {
1712     		mark_disk_nonsync(sb->disks+i);
1713     		for (j = 0; j < sb->raid_disks; j++) {
1714     			if (!conf->mirrors[j].operational)
1715     				continue;
1716     			if (sb->disks[i].number == conf->mirrors[j].number)
1717     				mark_disk_sync(sb->disks+i);
1718     		}
1719     	}
1720     	sb->active_disks = conf->working_disks;
1721     
1722     	if (start_recovery)
1723     		md_recover_arrays();
1724     
1725     
1726     	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1727     	/*
1728     	 * Ok, everything is just fine now
1729     	 */
1730     	return 0;
1731     
1732     out_free_conf:
1733     	raid1_shrink_r1bh(conf);
1734     	raid1_shrink_bh(conf);
1735     	raid1_shrink_buffers(conf);
1736     	kfree(conf);
1737     	mddev->private = NULL;
1738     out:
1739     	MOD_DEC_USE_COUNT;
1740     	return -EIO;
1741     }
1742     
/* Drop every message macro that was defined for raid1_run(). */
#undef INVALID_LEVEL
#undef NO_SB
#undef ERRORS
#undef NOT_IN_SYNC
#undef INCONSISTENT
#undef ALREADY_RUNNING
#undef OPERATIONAL
#undef MEM_ERROR
#undef SPARE
#undef NONE_OPERATIONAL
#undef ARRAY_IS_ACTIVE
#undef THREAD_ERROR
#undef START_RESYNC
1753     
1754     static int raid1_stop_resync (mddev_t *mddev)
1755     {
1756     	raid1_conf_t *conf = mddev_to_conf(mddev);
1757     
1758     	if (conf->resync_thread) {
1759     		if (conf->resync_mirrors) {
1760     			conf->resync_mirrors = 2;
1761     			md_interrupt_thread(conf->resync_thread);
1762     
1763     			printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1764     			return 1;
1765     		}
1766     		return 0;
1767     	}
1768     	return 0;
1769     }
1770     
1771     static int raid1_restart_resync (mddev_t *mddev)
1772     {
1773     	raid1_conf_t *conf = mddev_to_conf(mddev);
1774     
1775     	if (conf->resync_mirrors) {
1776     		if (!conf->resync_thread) {
1777     			MD_BUG();
1778     			return 0;
1779     		}
1780     		conf->resync_mirrors = 1;
1781     		md_wakeup_thread(conf->resync_thread);
1782     		return 1;
1783     	}
1784     	return 0;
1785     }
1786     
1787     static int raid1_stop (mddev_t *mddev)
1788     {
1789     	raid1_conf_t *conf = mddev_to_conf(mddev);
1790     
1791     	md_unregister_thread(conf->thread);
1792     	if (conf->resync_thread)
1793     		md_unregister_thread(conf->resync_thread);
1794     	raid1_shrink_r1bh(conf);
1795     	raid1_shrink_bh(conf);
1796     	raid1_shrink_buffers(conf);
1797     	kfree(conf);
1798     	mddev->private = NULL;
1799     	MOD_DEC_USE_COUNT;
1800     	return 0;
1801     }
1802     
1803     static mdk_personality_t raid1_personality=
1804     {
1805     	name:		"raid1",
1806     	make_request:	raid1_make_request,
1807     	run:		raid1_run,
1808     	stop:		raid1_stop,
1809     	status:		raid1_status,
1810     	error_handler:	raid1_error,
1811     	diskop:		raid1_diskop,
1812     	stop_resync:	raid1_stop_resync,
1813     	restart_resync:	raid1_restart_resync,
1814     	sync_request:	raid1_sync_request
1815     };
1816     
1817     static int md__init raid1_init (void)
1818     {
1819     	return register_md_personality (RAID1, &raid1_personality);
1820     }
1821     
1822     static void raid1_exit (void)
1823     {
1824     	unregister_md_personality (RAID1);
1825     }
1826     
1827     module_init(raid1_init);
1828     module_exit(raid1_exit);
1829     
1830