File: /usr/src/linux/drivers/md/md.c

1     /*
2        md.c : Multiple Devices driver for Linux
3     	  Copyright (C) 1998, 1999, 2000 Ingo Molnar
4     
5          completely rewritten, based on the MD driver code from Marc Zyngier
6     
7        Changes:
8     
9        - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10        - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
11        - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
12        - kmod support by: Cyrus Durgin
13        - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
14        - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
15     
16        - lots of fixes and improvements to the RAID1/RAID5 and generic
17          RAID code (such as request based resynchronization):
18     
19          Neil Brown <neilb@cse.unsw.edu.au>.
20     
21        This program is free software; you can redistribute it and/or modify
22        it under the terms of the GNU General Public License as published by
23        the Free Software Foundation; either version 2, or (at your option)
24        any later version.
25     
26        You should have received a copy of the GNU General Public License
27        (for example /usr/src/linux/COPYING); if not, write to the Free
28        Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29     */
30     
31     #include <linux/module.h>
32     #include <linux/config.h>
33     #include <linux/raid/md.h>
34     #include <linux/sysctl.h>
35     #include <linux/raid/xor.h>
36     #include <linux/devfs_fs_kernel.h>
37     
38     #include <linux/init.h>
39     
40     #ifdef CONFIG_KMOD
41     #include <linux/kmod.h>
42     #endif
43     
44     #define __KERNEL_SYSCALLS__
45     #include <linux/unistd.h>
46     
47     #include <asm/unaligned.h>
48     
49     #define MAJOR_NR MD_MAJOR
50     #define MD_DRIVER
51     
52     #include <linux/blk.h>
53     
54     #define DEBUG 0
55     #if DEBUG
56     # define dprintk(x...) printk(x)
57     #else
58     # define dprintk(x...) do { } while(0)
59     #endif
60     
61     #ifndef MODULE
62     static void autostart_arrays (void);
63     #endif
64     
65     static mdk_personality_t *pers[MAX_PERSONALITY];
66     
67     /*
68      * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
69      * is 100 KB/sec, so the extra system load does not show up that much.
70      * Increase it if you want to have more _guaranteed_ speed. Note that
71      * the RAID driver will use the maximum available bandwidth if the IO
72      * subsystem is idle. There is also an 'absolute maximum' reconstruction
73      * speed limit - in case reconstruction slows down your system despite
74      * idle IO detection.
75      *
76      * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
77      */
78     
       /* Default values for the guaranteed (min) and absolute (max) resync speed limits. */
79     static int sysctl_speed_limit_min = 100;
80     static int sysctl_speed_limit_max = 100000;
81     
       /* NOTE(review): presumably the sysctl registration handle; it is not assigned in this part of the file. */
82     static struct ctl_table_header *raid_table_header;
83     
       /* Leaf entries: the two int limits, mode 0644, handled by proc_dointvec. */
84     static ctl_table raid_table[] = {
85     	{DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
86     	 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
87     	{DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
88     	 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
89     	{0}
90     };
91     
       /* "raid" directory (mode 0555) containing raid_table. */
92     static ctl_table raid_dir_table[] = {
93     	{DEV_RAID, "raid", NULL, 0, 0555, raid_table},
94     	{0}
95     };
96     
       /* "dev" directory (mode 0555) containing raid_dir_table. */
97     static ctl_table raid_root_table[] = {
98     	{CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
99     	{0}
100     };
101     
102     /*
103      * these have to be allocated separately because external
104      * subsystems want to have a pre-defined structure
105      */
106     struct hd_struct md_hd_struct[MAX_MD_DEVS];
107     static int md_blocksizes[MAX_MD_DEVS];
108     static int md_hardsect_sizes[MAX_MD_DEVS];
109     static int md_maxreadahead[MAX_MD_DEVS];
110     static mdk_thread_t *md_recovery_thread;
111     
112     int md_size[MAX_MD_DEVS];
113     
114     static struct block_device_operations md_fops;
115     static devfs_handle_t devfs_handle;
116     
       /*
        * gendisk descriptor for the MD major: its partition and size tables
        * point at md_hd_struct and md_size, and it covers MAX_MD_DEVS devices.
        * NOTE(review): minor_shift 0 / max_p 1 presumably means md devices are
        * not partitionable - confirm against the gendisk definition.
        */
117     static struct gendisk md_gendisk=
118     {
119     	major: MD_MAJOR,
120     	major_name: "md",
121     	minor_shift: 0,
122     	max_p: 1,
123     	part: md_hd_struct,
124     	sizes: md_size,
125     	nr_real: MAX_MD_DEVS,
126     	real_devices: NULL,
127     	next: NULL,
128     	fops: &md_fops,
129     };
130     
131     /*
132      * Enables iteration over all existing md arrays
133      */
134     static MD_LIST_HEAD(all_mddevs);
135     
136     /*
137      * The mapping between kdev and mddev is not necessarily a simple
138      * one! Eg. HSM uses several sub-devices to implement Logical
139      * Volumes. All these sub-devices map to the same mddev.
140      */
141     dev_mapping_t mddev_map[MAX_MD_DEVS];
142     
143     void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
144     {
145     	unsigned int minor = MINOR(dev);
146     
147     	if (MAJOR(dev) != MD_MAJOR) {
148     		MD_BUG();
149     		return;
150     	}
151     	if (mddev_map[minor].mddev != NULL) {
152     		MD_BUG();
153     		return;
154     	}
155     	mddev_map[minor].mddev = mddev;
156     	mddev_map[minor].data = data;
157     }
158     
159     void del_mddev_mapping (mddev_t * mddev, kdev_t dev)
160     {
161     	unsigned int minor = MINOR(dev);
162     
163     	if (MAJOR(dev) != MD_MAJOR) {
164     		MD_BUG();
165     		return;
166     	}
167     	if (mddev_map[minor].mddev != mddev) {
168     		MD_BUG();
169     		return;
170     	}
171     	mddev_map[minor].mddev = NULL;
172     	mddev_map[minor].data = NULL;
173     }
174     
175     static int md_make_request (request_queue_t *q, int rw, struct buffer_head * bh)
176     {
177     	mddev_t *mddev = kdev_to_mddev(bh->b_rdev);
178     
179     	if (mddev && mddev->pers)
180     		return mddev->pers->make_request(mddev, rw, bh);
181     	else {
182     		buffer_IO_error(bh);
183     		return 0;
184     	}
185     }
186     
187     static mddev_t * alloc_mddev (kdev_t dev)
188     {
189     	mddev_t *mddev;
190     
191     	if (MAJOR(dev) != MD_MAJOR) {
192     		MD_BUG();
193     		return 0;
194     	}
195     	mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
196     	if (!mddev)
197     		return NULL;
198     		
199     	memset(mddev, 0, sizeof(*mddev));
200     
201     	mddev->__minor = MINOR(dev);
202     	init_MUTEX(&mddev->reconfig_sem);
203     	init_MUTEX(&mddev->recovery_sem);
204     	init_MUTEX(&mddev->resync_sem);
205     	MD_INIT_LIST_HEAD(&mddev->disks);
206     	MD_INIT_LIST_HEAD(&mddev->all_mddevs);
207     	atomic_set(&mddev->active, 0);
208     
209     	/*
210     	 * The 'base' mddev is the one with data NULL.
211     	 * personalities can create additional mddevs
212     	 * if necessary.
213     	 */
214     	add_mddev_mapping(mddev, dev, 0);
215     	md_list_add(&mddev->all_mddevs, &all_mddevs);
216     
217     	MOD_INC_USE_COUNT;
218     
219     	return mddev;
220     }
221     
222     mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
223     {
224     	mdk_rdev_t * rdev;
225     	struct md_list_head *tmp;
226     
227     	ITERATE_RDEV(mddev,rdev,tmp) {
228     		if (rdev->desc_nr == nr)
229     			return rdev;
230     	}
231     	return NULL;
232     }
233     
234     mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
235     {
236     	struct md_list_head *tmp;
237     	mdk_rdev_t *rdev;
238     
239     	ITERATE_RDEV(mddev,rdev,tmp) {
240     		if (rdev->dev == dev)
241     			return rdev;
242     	}
243     	return NULL;
244     }
245     
246     static MD_LIST_HEAD(device_names);
247     
248     char * partition_name (kdev_t dev)
249     {
250     	struct gendisk *hd;
251     	static char nomem [] = "<nomem>";
252     	dev_name_t *dname;
253     	struct md_list_head *tmp = device_names.next;
254     
255     	while (tmp != &device_names) {
256     		dname = md_list_entry(tmp, dev_name_t, list);
257     		if (dname->dev == dev)
258     			return dname->name;
259     		tmp = tmp->next;
260     	}
261     
262     	dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
263     
264     	if (!dname)
265     		return nomem;
266     	/*
267     	 * ok, add this new device name to the list
268     	 */
269     	hd = get_gendisk (dev);
270     	dname->name = NULL;
271     	if (hd)
272     		dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
273     	if (!dname->name) {
274     		sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
275     		dname->name = dname->namebuf;
276     	}
277     
278     	dname->dev = dev;
279     	MD_INIT_LIST_HEAD(&dname->list);
280     	md_list_add(&dname->list, &device_names);
281     
282     	return dname->name;
283     }
284     
285     static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev,
286     						int persistent)
287     {
288     	unsigned int size = 0;
289     
290     	if (blk_size[MAJOR(dev)])
291     		size = blk_size[MAJOR(dev)][MINOR(dev)];
292     	if (persistent)
293     		size = MD_NEW_SIZE_BLOCKS(size);
294     	return size;
295     }
296     
297     static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent)
298     {
299     	unsigned int size;
300     
301     	size = calc_dev_sboffset(dev, mddev, persistent);
302     	if (!mddev->sb) {
303     		MD_BUG();
304     		return size;
305     	}
306     	if (mddev->sb->chunk_size)
307     		size &= ~(mddev->sb->chunk_size/1024 - 1);
308     	return size;
309     }
310     
311     static unsigned int zoned_raid_size (mddev_t *mddev)
312     {
313     	unsigned int mask;
314     	mdk_rdev_t * rdev;
315     	struct md_list_head *tmp;
316     
317     	if (!mddev->sb) {
318     		MD_BUG();
319     		return -EINVAL;
320     	}
321     	/*
322     	 * do size and offset calculations.
323     	 */
324     	mask = ~(mddev->sb->chunk_size/1024 - 1);
325     
326     	ITERATE_RDEV(mddev,rdev,tmp) {
327     		rdev->size &= mask;
328     		md_size[mdidx(mddev)] += rdev->size;
329     	}
330     	return 0;
331     }
332     
333     /*
334      * We check wether all devices are numbered from 0 to nb_dev-1. The
335      * order is guaranteed even after device name changes.
336      *
337      * Some personalities (raid0, linear) use this. Personalities that
338      * provide data have to be able to deal with loss of individual
339      * disks, so they do their checking themselves.
340      */
341     int md_check_ordering (mddev_t *mddev)
342     {
343     	int i, c;
344     	mdk_rdev_t *rdev;
345     	struct md_list_head *tmp;
346     
347     	/*
348     	 * First, all devices must be fully functional
349     	 */
350     	ITERATE_RDEV(mddev,rdev,tmp) {
351     		if (rdev->faulty) {
352     			printk("md: md%d's device %s faulty, aborting.\n",
353     				mdidx(mddev), partition_name(rdev->dev));
354     			goto abort;
355     		}
356     	}
357     
358     	c = 0;
359     	ITERATE_RDEV(mddev,rdev,tmp) {
360     		c++;
361     	}
362     	if (c != mddev->nb_dev) {
363     		MD_BUG();
364     		goto abort;
365     	}
366     	if (mddev->nb_dev != mddev->sb->raid_disks) {
367     		printk("md: md%d, array needs %d disks, has %d, aborting.\n",
368     			mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
369     		goto abort;
370     	}
371     	/*
372     	 * Now the numbering check
373     	 */
374     	for (i = 0; i < mddev->nb_dev; i++) {
375     		c = 0;
376     		ITERATE_RDEV(mddev,rdev,tmp) {
377     			if (rdev->desc_nr == i)
378     				c++;
379     		}
380     		if (!c) {
381     			printk("md: md%d, missing disk #%d, aborting.\n",
382     				mdidx(mddev), i);
383     			goto abort;
384     		}
385     		if (c > 1) {
386     			printk("md: md%d, too many disks #%d, aborting.\n",
387     				mdidx(mddev), i);
388     			goto abort;
389     		}
390     	}
391     	return 0;
392     abort:
393     	return 1;
394     }
395     
396     static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
397     {
398     	if (disk_active(disk)) {
399     		sb->working_disks--;
400     	} else {
401     		if (disk_spare(disk)) {
402     			sb->spare_disks--;
403     			sb->working_disks--;
404     		} else	{
405     			sb->failed_disks--;
406     		}
407     	}
408     	sb->nr_disks--;
409     	disk->major = 0;
410     	disk->minor = 0;
411     	mark_disk_removed(disk);
412     }
413     
414     #define BAD_MAGIC KERN_ERR \
415     "md: invalid raid superblock magic on %s\n"
416     
417     #define BAD_MINOR KERN_ERR \
418     "md: %s: invalid raid minor (%x)\n"
419     
420     #define OUT_OF_MEM KERN_ALERT \
421     "md: out of memory.\n"
422     
423     #define NO_SB KERN_ERR \
424     "md: disabled device %s, could not read superblock.\n"
425     
426     #define BAD_CSUM KERN_WARNING \
427     "md: invalid superblock checksum on %s\n"
428     
429     static int alloc_array_sb (mddev_t * mddev)
430     {
431     	if (mddev->sb) {
432     		MD_BUG();
433     		return 0;
434     	}
435     
436     	mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
437     	if (!mddev->sb)
438     		return -ENOMEM;
439     	md_clear_page(mddev->sb);
440     	return 0;
441     }
442     
443     static int alloc_disk_sb (mdk_rdev_t * rdev)
444     {
445     	if (rdev->sb)
446     		MD_BUG();
447     
448     	rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
449     	if (!rdev->sb) {
450     		printk (OUT_OF_MEM);
451     		return -EINVAL;
452     	}
453     	md_clear_page(rdev->sb);
454     
455     	return 0;
456     }
457     
458     static void free_disk_sb (mdk_rdev_t * rdev)
459     {
460     	if (rdev->sb) {
461     		free_page((unsigned long) rdev->sb);
462     		rdev->sb = NULL;
463     		rdev->sb_offset = 0;
464     		rdev->size = 0;
465     	} else {
466     		if (!rdev->faulty)
467     			MD_BUG();
468     	}
469     }
470     
471     static int read_disk_sb (mdk_rdev_t * rdev)
472     {
473     	int ret = -EINVAL;
474     	struct buffer_head *bh = NULL;
475     	kdev_t dev = rdev->dev;
476     	mdp_super_t *sb;
477     	unsigned long sb_offset;
478     
479     	if (!rdev->sb) {
480     		MD_BUG();
481     		goto abort;
482     	}	
483     	
484     	/*
485     	 * Calculate the position of the superblock,
486     	 * it's at the end of the disk
487     	 */
488     	sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
489     	rdev->sb_offset = sb_offset;
490     	printk("(read) %s's sb offset: %ld", partition_name(dev), sb_offset);
491     	fsync_dev(dev);
492     	set_blocksize (dev, MD_SB_BYTES);
493     	bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
494     
495     	if (bh) {
496     		sb = (mdp_super_t *) bh->b_data;
497     		memcpy (rdev->sb, sb, MD_SB_BYTES);
498     	} else {
499     		printk (NO_SB,partition_name(rdev->dev));
500     		goto abort;
501     	}
502     	printk(" [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
503     	ret = 0;
504     abort:
505     	if (bh)
506     		brelse (bh);
507     	return ret;
508     }
509     
510     static unsigned int calc_sb_csum (mdp_super_t * sb)
511     {
512     	unsigned int disk_csum, csum;
513     
514     	disk_csum = sb->sb_csum;
515     	sb->sb_csum = 0;
516     	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
517     	sb->sb_csum = disk_csum;
518     	return csum;
519     }
520     
521     /*
522      * Check one RAID superblock for generic plausibility
523      */
524     
525     static int check_disk_sb (mdk_rdev_t * rdev)
526     {
527     	mdp_super_t *sb;
528     	int ret = -EINVAL;
529     
530     	sb = rdev->sb;
531     	if (!sb) {
532     		MD_BUG();
533     		goto abort;
534     	}
535     
536     	if (sb->md_magic != MD_SB_MAGIC) {
537     		printk (BAD_MAGIC, partition_name(rdev->dev));
538     		goto abort;
539     	}
540     
541     	if (sb->md_minor >= MAX_MD_DEVS) {
542     		printk (BAD_MINOR, partition_name(rdev->dev),
543     							sb->md_minor);
544     		goto abort;
545     	}
546     
547     	if (calc_sb_csum(sb) != sb->sb_csum)
548     		printk(BAD_CSUM, partition_name(rdev->dev));
549     	ret = 0;
550     abort:
551     	return ret;
552     }
553     
554     static kdev_t dev_unit(kdev_t dev)
555     {
556     	unsigned int mask;
557     	struct gendisk *hd = get_gendisk(dev);
558     
559     	if (!hd)
560     		return 0;
561     	mask = ~((1 << hd->minor_shift) - 1);
562     
563     	return MKDEV(MAJOR(dev), MINOR(dev) & mask);
564     }
565     
566     static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
567     {
568     	struct md_list_head *tmp;
569     	mdk_rdev_t *rdev;
570     
571     	ITERATE_RDEV(mddev,rdev,tmp)
572     		if (dev_unit(rdev->dev) == dev_unit(dev))
573     			return rdev;
574     
575     	return NULL;
576     }
577     
578     static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
579     {
580     	struct md_list_head *tmp;
581     	mdk_rdev_t *rdev;
582     
583     	ITERATE_RDEV(mddev1,rdev,tmp)
584     		if (match_dev_unit(mddev2, rdev->dev))
585     			return 1;
586     
587     	return 0;
588     }
589     
590     static MD_LIST_HEAD(all_raid_disks);
591     static MD_LIST_HEAD(pending_raid_disks);
592     
593     static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
594     {
595     	mdk_rdev_t *same_pdev;
596     
597     	if (rdev->mddev) {
598     		MD_BUG();
599     		return;
600     	}
601     	same_pdev = match_dev_unit(mddev, rdev->dev);
602     	if (same_pdev)
603     		printk( KERN_WARNING
604     "md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
605     "     protection against single-disk failure might be compromised.\n",
606      			mdidx(mddev), partition_name(rdev->dev),
607     				partition_name(same_pdev->dev));
608     		
609     	md_list_add(&rdev->same_set, &mddev->disks);
610     	rdev->mddev = mddev;
611     	mddev->nb_dev++;
612     	printk("md: bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
613     }
614     
615     static void unbind_rdev_from_array (mdk_rdev_t * rdev)
616     {
617     	if (!rdev->mddev) {
618     		MD_BUG();
619     		return;
620     	}
621     	md_list_del(&rdev->same_set);
622     	MD_INIT_LIST_HEAD(&rdev->same_set);
623     	rdev->mddev->nb_dev--;
624     	printk("md: unbind<%s,%d>\n", partition_name(rdev->dev),
625     						 rdev->mddev->nb_dev);
626     	rdev->mddev = NULL;
627     }
628     
629     /*
630      * prevent the device from being mounted, repartitioned or
631      * otherwise reused by a RAID array (or any other kernel
632      * subsystem), by opening the device. [simply getting an
633      * inode is not enough, the SCSI module usage code needs
634      * an explicit open() on the device]
635      */
636     static int lock_rdev (mdk_rdev_t *rdev)
637     {
638     	int err = 0;
639     	struct block_device *bdev;
640     
641     	bdev = bdget(rdev->dev);
642     	if (bdev == NULL)
643     		return -ENOMEM;
644     	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
645     	if (!err)
646     		rdev->bdev = bdev;
647     	return err;
648     }
649     
650     static void unlock_rdev (mdk_rdev_t *rdev)
651     {
652     	struct block_device *bdev = rdev->bdev;
653     	rdev->bdev = NULL;
654     	if (!bdev)
655     		MD_BUG();
656     	blkdev_put(bdev, BDEV_RAW);
657     }
658     
659     void md_autodetect_dev (kdev_t dev);
660     
661     static void export_rdev (mdk_rdev_t * rdev)
662     {
663     	printk("md: export_rdev(%s)\n",partition_name(rdev->dev));
664     	if (rdev->mddev)
665     		MD_BUG();
666     	unlock_rdev(rdev);
667     	free_disk_sb(rdev);
668     	md_list_del(&rdev->all);
669     	MD_INIT_LIST_HEAD(&rdev->all);
670     	if (rdev->pending.next != &rdev->pending) {
671     		printk("md: (%s was pending)\n",partition_name(rdev->dev));
672     		md_list_del(&rdev->pending);
673     		MD_INIT_LIST_HEAD(&rdev->pending);
674     	}
675     #ifndef MODULE
676     	md_autodetect_dev(rdev->dev);
677     #endif
678     	rdev->dev = 0;
679     	rdev->faulty = 0;
680     	kfree(rdev);
681     }
682     
683     static void kick_rdev_from_array (mdk_rdev_t * rdev)
684     {
685     	unbind_rdev_from_array(rdev);
686     	export_rdev(rdev);
687     }
688     
689     static void export_array (mddev_t *mddev)
690     {
691     	struct md_list_head *tmp;
692     	mdk_rdev_t *rdev;
693     	mdp_super_t *sb = mddev->sb;
694     
695     	if (mddev->sb) {
696     		mddev->sb = NULL;
697     		free_page((unsigned long) sb);
698     	}
699     
700     	ITERATE_RDEV(mddev,rdev,tmp) {
701     		if (!rdev->mddev) {
702     			MD_BUG();
703     			continue;
704     		}
705     		kick_rdev_from_array(rdev);
706     	}
707     	if (mddev->nb_dev)
708     		MD_BUG();
709     }
710     
711     static void free_mddev (mddev_t *mddev)
712     {
713     	if (!mddev) {
714     		MD_BUG();
715     		return;
716     	}
717     
718     	export_array(mddev);
719     	md_size[mdidx(mddev)] = 0;
720     	md_hd_struct[mdidx(mddev)].nr_sects = 0;
721     
722     	/*
723     	 * Make sure nobody else is using this mddev
724     	 * (careful, we rely on the global kernel lock here)
725     	 */
726     	while (md_atomic_read(&mddev->resync_sem.count) != 1)
727     		schedule();
728     	while (md_atomic_read(&mddev->recovery_sem.count) != 1)
729     		schedule();
730     
731     	del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
732     	md_list_del(&mddev->all_mddevs);
733     	MD_INIT_LIST_HEAD(&mddev->all_mddevs);
734     	kfree(mddev);
735     	MOD_DEC_USE_COUNT;
736     }
737     
738     #undef BAD_CSUM
739     #undef BAD_MAGIC
740     #undef OUT_OF_MEM
741     #undef NO_SB
742     
743     static void print_desc(mdp_disk_t *desc)
744     {
745     	printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
746     		partition_name(MKDEV(desc->major,desc->minor)),
747     		desc->major,desc->minor,desc->raid_disk,desc->state);
748     }
749     
750     static void print_sb(mdp_super_t *sb)
751     {
752     	int i;
753     
754     	printk("md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
755     		sb->major_version, sb->minor_version, sb->patch_version,
756     		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
757     		sb->ctime);
758     	printk("md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
759     		sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
760     		sb->layout, sb->chunk_size);
761     	printk("md:     UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
762     		sb->utime, sb->state, sb->active_disks, sb->working_disks,
763     		sb->failed_disks, sb->spare_disks,
764     		sb->sb_csum, (unsigned long)sb->events_lo);
765     
766     	for (i = 0; i < MD_SB_DISKS; i++) {
767     		mdp_disk_t *desc;
768     
769     		desc = sb->disks + i;
770     		if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) {
771     			printk("     D %2d: ", i);
772     			print_desc(desc);
773     		}
774     	}
775     	printk("md:     THIS: ");
776     	print_desc(&sb->this_disk);
777     
778     }
779     
780     static void print_rdev(mdk_rdev_t *rdev)
781     {
782     	printk("md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
783     		partition_name(rdev->dev), partition_name(rdev->old_dev),
784     		rdev->size, rdev->faulty, rdev->desc_nr);
785     	if (rdev->sb) {
786     		printk("md: rdev superblock:\n");
787     		print_sb(rdev->sb);
788     	} else
789     		printk("md: no rdev superblock!\n");
790     }
791     
792     void md_print_devices (void)
793     {
794     	struct md_list_head *tmp, *tmp2;
795     	mdk_rdev_t *rdev;
796     	mddev_t *mddev;
797     
798     	printk("\n");
799     	printk("md:	**********************************\n");
800     	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n");
801     	printk("md:	**********************************\n");
802     	ITERATE_MDDEV(mddev,tmp) {
803     		printk("md%d: ", mdidx(mddev));
804     
805     		ITERATE_RDEV(mddev,rdev,tmp2)
806     			printk("<%s>", partition_name(rdev->dev));
807     
808     		if (mddev->sb) {
809     			printk(" array superblock:\n");
810     			print_sb(mddev->sb);
811     		} else
812     			printk(" no array superblock.\n");
813     
814     		ITERATE_RDEV(mddev,rdev,tmp2)
815     			print_rdev(rdev);
816     	}
817     	printk("md:	**********************************\n");
818     	printk("\n");
819     }
820     
821     static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
822     {
823     	int ret;
824     	mdp_super_t *tmp1, *tmp2;
825     
826     	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
827     	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
828     
829     	if (!tmp1 || !tmp2) {
830     		ret = 0;
831     		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
832     		goto abort;
833     	}
834     
835     	*tmp1 = *sb1;
836     	*tmp2 = *sb2;
837     
838     	/*
839     	 * nr_disks is not constant
840     	 */
841     	tmp1->nr_disks = 0;
842     	tmp2->nr_disks = 0;
843     
844     	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
845     		ret = 0;
846     	else
847     		ret = 1;
848     
849     abort:
850     	if (tmp1)
851     		kfree(tmp1);
852     	if (tmp2)
853     		kfree(tmp2);
854     
855     	return ret;
856     }
857     
858     static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
859     {
860     	if (	(rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
861     		(rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
862     		(rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
863     		(rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
864     
865     		return 1;
866     
867     	return 0;
868     }
869     
870     static mdk_rdev_t * find_rdev_all (kdev_t dev)
871     {
872     	struct md_list_head *tmp;
873     	mdk_rdev_t *rdev;
874     
875     	tmp = all_raid_disks.next;
876     	while (tmp != &all_raid_disks) {
877     		rdev = md_list_entry(tmp, mdk_rdev_t, all);
878     		if (rdev->dev == dev)
879     			return rdev;
880     		tmp = tmp->next;
881     	}
882     	return NULL;
883     }
884     
885     #define GETBLK_FAILED KERN_ERR \
886     "md: getblk failed for device %s\n"
887     
888     static int write_disk_sb(mdk_rdev_t * rdev)
889     {
890     	struct buffer_head *bh;
891     	kdev_t dev;
892     	unsigned long sb_offset, size;
893     	mdp_super_t *sb;
894     
895     	if (!rdev->sb) {
896     		MD_BUG();
897     		return 1;
898     	}
899     	if (rdev->faulty) {
900     		MD_BUG();
901     		return 1;
902     	}
903     	if (rdev->sb->md_magic != MD_SB_MAGIC) {
904     		MD_BUG();
905     		return 1;
906     	}
907     
908     	dev = rdev->dev;
909     	sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
910     	if (rdev->sb_offset != sb_offset) {
911     		printk("%s's sb offset has changed from %ld to %ld, skipping\n",
912     			partition_name(dev), rdev->sb_offset, sb_offset);
913     		goto skip;
914     	}
915     	/*
916     	 * If the disk went offline meanwhile and it's just a spare, then
917     	 * its size has changed to zero silently, and the MD code does
918     	 * not yet know that it's faulty.
919     	 */
920     	size = calc_dev_size(dev, rdev->mddev, 1);
921     	if (size != rdev->size) {
922     		printk("%s's size has changed from %ld to %ld since import, skipping\n",
923     			partition_name(dev), rdev->size, size);
924     		goto skip;
925     	}
926     
927     	printk("(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
928     	fsync_dev(dev);
929     	set_blocksize(dev, MD_SB_BYTES);
930     	bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
931     	if (!bh) {
932     		printk(GETBLK_FAILED, partition_name(dev));
933     		return 1;
934     	}
935     	memset(bh->b_data,0,bh->b_size);
936     	sb = (mdp_super_t *) bh->b_data;
937     	memcpy(sb, rdev->sb, MD_SB_BYTES);
938     
939     	mark_buffer_uptodate(bh, 1);
940     	mark_buffer_dirty(bh);
941     	ll_rw_block(WRITE, 1, &bh);
942     	wait_on_buffer(bh);
943     	brelse(bh);
944     	fsync_dev(dev);
945     skip:
946     	return 0;
947     }
948     #undef GETBLK_FAILED 
949     
950     static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
951     {
952     	int i, ok = 0;
953     	mdp_disk_t *desc;
954     
955     	for (i = 0; i < MD_SB_DISKS; i++) {
956     		desc = mddev->sb->disks + i;
957     #if 0
958     		if (disk_faulty(desc)) {
959     			if (MKDEV(desc->major,desc->minor) == rdev->dev)
960     				ok = 1;
961     			continue;
962     		}
963     #endif
964     		if (MKDEV(desc->major,desc->minor) == rdev->dev) {
965     			rdev->sb->this_disk = *desc;
966     			rdev->desc_nr = desc->number;
967     			ok = 1;
968     			break;
969     		}
970     	}
971     
972     	if (!ok) {
973     		MD_BUG();
974     	}
975     }
976     
977     static int sync_sbs(mddev_t * mddev)
978     {
979     	mdk_rdev_t *rdev;
980     	mdp_super_t *sb;
981     	struct md_list_head *tmp;
982     
983     	ITERATE_RDEV(mddev,rdev,tmp) {
984     		if (rdev->faulty || rdev->alias_device)
985     			continue;
986     		sb = rdev->sb;
987     		*sb = *mddev->sb;
988     		set_this_disk(mddev, rdev);
989     		sb->sb_csum = calc_sb_csum(sb);
990     	}
991     	return 0;
992     }
993     
994     int md_update_sb(mddev_t * mddev)
995     {
996     	int err, count = 100;
997     	struct md_list_head *tmp;
998     	mdk_rdev_t *rdev;
999     
1000     repeat:
1001     	mddev->sb->utime = CURRENT_TIME;
1002     	if ((++mddev->sb->events_lo)==0)
1003     		++mddev->sb->events_hi;
1004     
1005     	if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
1006     		/*
1007     		 * oops, this 64-bit counter should never wrap.
1008     		 * Either we are in around ~1 trillion A.C., assuming
1009     		 * 1 reboot per second, or we have a bug:
1010     		 */
1011     		MD_BUG();
1012     		mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
1013     	}
1014     	sync_sbs(mddev);
1015     
1016     	/*
1017     	 * do not write anything to disk if using
1018     	 * nonpersistent superblocks
1019     	 */
1020     	if (mddev->sb->not_persistent)
1021     		return 0;
1022     
1023     	printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
1024     					mdidx(mddev));
1025     
1026     	err = 0;
1027     	ITERATE_RDEV(mddev,rdev,tmp) {
1028     		printk("md: ");
1029     		if (rdev->faulty)
1030     			printk("(skipping faulty ");
1031     		if (rdev->alias_device)
1032     			printk("(skipping alias ");
1033     
1034     		printk("%s ", partition_name(rdev->dev));
1035     		if (!rdev->faulty && !rdev->alias_device) {
1036     			printk("[events: %08lx]",
1037     				(unsigned long)rdev->sb->events_lo);
1038     			err += write_disk_sb(rdev);
1039     		} else
1040     			printk(")\n");
1041     	}
1042     	if (err) {
1043     		if (--count) {
1044     			printk("md: errors occurred during superblock update, repeating\n");
1045     			goto repeat;
1046     		}
1047     		printk("md: excessive errors occurred during superblock update, exiting\n");
1048     	}
1049     	return 0;
1050     }
1051     
1052     /*
1053      * Import a device. If 'on_disk', then sanity check the superblock
1054      *
1055      * mark the device faulty if:
1056      *
1057      *   - the device is nonexistent (zero size)
1058      *   - the device has no valid superblock
1059      *
1060      * a faulty rdev _never_ has rdev->sb set.
1061      */
1062     static int md_import_device (kdev_t newdev, int on_disk)
1063     {
1064     	int err;
1065     	mdk_rdev_t *rdev;
1066     	unsigned int size;
1067     
1068     	if (find_rdev_all(newdev))
1069     		return -EEXIST;
1070     
1071     	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
1072     	if (!rdev) {
1073     		printk("md: could not alloc mem for %s!\n", partition_name(newdev));
1074     		return -ENOMEM;
1075     	}
1076     	memset(rdev, 0, sizeof(*rdev));
1077     
1078     	if (is_mounted(newdev)) {
1079     		printk("md: can not import %s, has active inodes!\n",
1080     			partition_name(newdev));
1081     		err = -EBUSY;
1082     		goto abort_free;
1083     	}
1084     
1085     	if ((err = alloc_disk_sb(rdev)))
1086     		goto abort_free;
1087     
1088     	rdev->dev = newdev;
1089     	if (lock_rdev(rdev)) {
1090     		printk("md: could not lock %s, zero-size? Marking faulty.\n",
1091     			partition_name(newdev));
1092     		err = -EINVAL;
1093     		goto abort_free;
1094     	}
1095     	rdev->desc_nr = -1;
1096     	rdev->faulty = 0;
1097     
1098     	size = 0;
1099     	if (blk_size[MAJOR(newdev)])
1100     		size = blk_size[MAJOR(newdev)][MINOR(newdev)];
1101     	if (!size) {
1102     		printk("md: %s has zero size, marking faulty!\n",
1103     				partition_name(newdev));
1104     		err = -EINVAL;
1105     		goto abort_free;
1106     	}
1107     
1108     	if (on_disk) {
1109     		if ((err = read_disk_sb(rdev))) {
1110     			printk("md: could not read %s's sb, not importing!\n",
1111     					partition_name(newdev));
1112     			goto abort_free;
1113     		}
1114     		if ((err = check_disk_sb(rdev))) {
1115     			printk("md: %s has invalid sb, not importing!\n",
1116     					partition_name(newdev));
1117     			goto abort_free;
1118     		}
1119     
1120     		if (rdev->sb->level != -4) {
1121     			rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
1122     						rdev->sb->this_disk.minor);
1123     			rdev->desc_nr = rdev->sb->this_disk.number;
1124     		} else {
1125     			rdev->old_dev = MKDEV(0, 0);
1126     			rdev->desc_nr = -1;
1127     		}
1128     	}
1129     	md_list_add(&rdev->all, &all_raid_disks);
1130     	MD_INIT_LIST_HEAD(&rdev->pending);
1131     
1132     	if (rdev->faulty && rdev->sb)
1133     		free_disk_sb(rdev);
1134     	return 0;
1135     
1136     abort_free:
1137     	if (rdev->sb) {
1138     		if (rdev->bdev)
1139     			unlock_rdev(rdev);
1140     		free_disk_sb(rdev);
1141     	}
1142     	kfree(rdev);
1143     	return err;
1144     }
1145     
1146     /*
1147      * Check a full RAID array for plausibility
1148      */
1149     
1150     #define INCONSISTENT KERN_ERR \
1151     "md: fatal superblock inconsistency in %s -- removing from array\n"
1152     
1153     #define OUT_OF_DATE KERN_ERR \
1154     "md: superblock update time inconsistency -- using the most recent one\n"
1155     
1156     #define OLD_VERSION KERN_ALERT \
1157     "md: md%d: unsupported raid array version %d.%d.%d\n"
1158     
1159     #define NOT_CLEAN_IGNORE KERN_ERR \
1160     "md: md%d: raid array is not clean -- starting background reconstruction\n"
1161     
1162     #define UNKNOWN_LEVEL KERN_ERR \
1163     "md: md%d: unsupported raid level %d\n"
1164     
1165     static int analyze_sbs (mddev_t * mddev)
1166     {
1167     	int out_of_date = 0, i, first;
1168     	struct md_list_head *tmp, *tmp2;
1169     	mdk_rdev_t *rdev, *rdev2, *freshest;
1170     	mdp_super_t *sb;
1171     
1172     	/*
1173     	 * Verify the RAID superblock on each real device
1174     	 */
1175     	ITERATE_RDEV(mddev,rdev,tmp) {
1176     		if (rdev->faulty) {
1177     			MD_BUG();
1178     			goto abort;
1179     		}
1180     		if (!rdev->sb) {
1181     			MD_BUG();
1182     			goto abort;
1183     		}
1184     		if (check_disk_sb(rdev))
1185     			goto abort;
1186     	}
1187     
1188     	/*
1189     	 * The superblock constant part has to be the same
1190     	 * for all disks in the array.
1191     	 */
1192     	sb = NULL;
1193     
1194     	ITERATE_RDEV(mddev,rdev,tmp) {
1195     		if (!sb) {
1196     			sb = rdev->sb;
1197     			continue;
1198     		}
1199     		if (!sb_equal(sb, rdev->sb)) {
1200     			printk (INCONSISTENT, partition_name(rdev->dev));
1201     			kick_rdev_from_array(rdev);
1202     			continue;
1203     		}
1204     	}
1205     
1206     	/*
1207     	 * OK, we have all disks and the array is ready to run. Let's
1208     	 * find the freshest superblock, that one will be the superblock
1209     	 * that represents the whole array.
1210     	 */
1211     	if (!mddev->sb)
1212     		if (alloc_array_sb(mddev))
1213     			goto abort;
1214     	sb = mddev->sb;
1215     	freshest = NULL;
1216     
1217     	ITERATE_RDEV(mddev,rdev,tmp) {
1218     		__u64 ev1, ev2;
1219     		/*
1220     		 * if the checksum is invalid, use the superblock
1221     		 * only as a last resort. (decrease it's age by
1222     		 * one event)
1223     		 */
1224     		if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
1225     			if (rdev->sb->events_lo || rdev->sb->events_hi)
1226     				if ((rdev->sb->events_lo--)==0)
1227     					rdev->sb->events_hi--;
1228     		}
1229     
1230     		printk("md: %s's event counter: %08lx\n", partition_name(rdev->dev),
1231     			(unsigned long)rdev->sb->events_lo);
1232     		if (!freshest) {
1233     			freshest = rdev;
1234     			continue;
1235     		}
1236     		/*
1237     		 * Find the newest superblock version
1238     		 */
1239     		ev1 = md_event(rdev->sb);
1240     		ev2 = md_event(freshest->sb);
1241     		if (ev1 != ev2) {
1242     			out_of_date = 1;
1243     			if (ev1 > ev2)
1244     				freshest = rdev;
1245     		}
1246     	}
1247     	if (out_of_date) {
1248     		printk(OUT_OF_DATE);
1249     		printk("md: freshest: %s\n", partition_name(freshest->dev));
1250     	}
1251     	memcpy (sb, freshest->sb, sizeof(*sb));
1252     
1253     	/*
1254     	 * at this point we have picked the 'best' superblock
1255     	 * from all available superblocks.
1256     	 * now we validate this superblock and kick out possibly
1257     	 * failed disks.
1258     	 */
1259     	ITERATE_RDEV(mddev,rdev,tmp) {
1260     		/*
1261     		 * Kick all non-fresh devices
1262     		 */
1263     		__u64 ev1, ev2;
1264     		ev1 = md_event(rdev->sb);
1265     		ev2 = md_event(sb);
1266     		++ev1;
1267     		if (ev1 < ev2) {
1268     			printk("md: kicking non-fresh %s from array!\n",
1269     						partition_name(rdev->dev));
1270     			kick_rdev_from_array(rdev);
1271     			continue;
1272     		}
1273     	}
1274     
1275     	/*
1276     	 * Fix up changed device names ... but only if this disk has a
1277     	 * recent update time. Use faulty checksum ones too.
1278     	 */
1279     	if (mddev->sb->level != -4)
1280     	ITERATE_RDEV(mddev,rdev,tmp) {
1281     		__u64 ev1, ev2, ev3;
1282     		if (rdev->faulty || rdev->alias_device) {
1283     			MD_BUG();
1284     			goto abort;
1285     		}
1286     		ev1 = md_event(rdev->sb);
1287     		ev2 = md_event(sb);
1288     		ev3 = ev2;
1289     		--ev3;
1290     		if ((rdev->dev != rdev->old_dev) &&
1291     			((ev1 == ev2) || (ev1 == ev3))) {
1292     			mdp_disk_t *desc;
1293     
1294     			printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
1295     			if (rdev->desc_nr == -1) {
1296     				MD_BUG();
1297     				goto abort;
1298     			}
1299     			desc = &sb->disks[rdev->desc_nr];
1300     			if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
1301     				MD_BUG();
1302     				goto abort;
1303     			}
1304     			desc->major = MAJOR(rdev->dev);
1305     			desc->minor = MINOR(rdev->dev);
1306     			desc = &rdev->sb->this_disk;
1307     			desc->major = MAJOR(rdev->dev);
1308     			desc->minor = MINOR(rdev->dev);
1309     		}
1310     	}
1311     
1312     	/*
1313     	 * Remove unavailable and faulty devices ...
1314     	 *
1315     	 * note that if an array becomes completely unrunnable due to
1316     	 * missing devices, we do not write the superblock back, so the
1317     	 * administrator has a chance to fix things up. The removal thus
1318     	 * only happens if it's nonfatal to the contents of the array.
1319     	 */
1320     	for (i = 0; i < MD_SB_DISKS; i++) {
1321     		int found;
1322     		mdp_disk_t *desc;
1323     		kdev_t dev;
1324     
1325     		desc = sb->disks + i;
1326     		dev = MKDEV(desc->major, desc->minor);
1327     
1328     		/*
1329     		 * We kick faulty devices/descriptors immediately.
1330     		 *
1331     		 * Note: multipath devices are a special case.  Since we
1332     		 * were able to read the superblock on the path, we don't
1333     		 * care if it was previously marked as faulty, it's up now
1334     		 * so enable it.
1335     		 */
1336     		if (disk_faulty(desc) && mddev->sb->level != -4) {
1337     			found = 0;
1338     			ITERATE_RDEV(mddev,rdev,tmp) {
1339     				if (rdev->desc_nr != desc->number)
1340     					continue;
1341     				printk("md%d: kicking faulty %s!\n",
1342     					mdidx(mddev),partition_name(rdev->dev));
1343     				kick_rdev_from_array(rdev);
1344     				found = 1;
1345     				break;
1346     			}
1347     			if (!found) {
1348     				if (dev == MKDEV(0,0))
1349     					continue;
1350     				printk("md%d: removing former faulty %s!\n",
1351     					mdidx(mddev), partition_name(dev));
1352     			}
1353     			remove_descriptor(desc, sb);
1354     			continue;
1355     		} else if (disk_faulty(desc)) {
1356     			/*
1357     			 * multipath entry marked as faulty, unfaulty it
1358     			 */
1359     			rdev = find_rdev(mddev, dev);
1360     			if(rdev)
1361     				mark_disk_spare(desc);
1362     			else
1363     				remove_descriptor(desc, sb);
1364     		}
1365     
1366     		if (dev == MKDEV(0,0))
1367     			continue;
1368     		/*
1369     		 * Is this device present in the rdev ring?
1370     		 */
1371     		found = 0;
1372     		ITERATE_RDEV(mddev,rdev,tmp) {
1373     			/*
1374     			 * Multi-path IO special-case: since we have no
1375     			 * this_disk descriptor at auto-detect time,
1376     			 * we cannot check rdev->number.
1377     			 * We can check the device though.
1378     			 */
1379     			if ((sb->level == -4) && (rdev->dev ==
1380     					MKDEV(desc->major,desc->minor))) {
1381     				found = 1;
1382     				break;
1383     			}
1384     			if (rdev->desc_nr == desc->number) {
1385     				found = 1;
1386     				break;
1387     			}
1388     		}
1389     		if (found)
1390     			continue;
1391     
1392     		printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev));
1393     		remove_descriptor(desc, sb);
1394     	}
1395     
1396     	/*
1397     	 * Double check wether all devices mentioned in the
1398     	 * superblock are in the rdev ring.
1399     	 */
1400     	first = 1;
1401     	for (i = 0; i < MD_SB_DISKS; i++) {
1402     		mdp_disk_t *desc;
1403     		kdev_t dev;
1404     
1405     		desc = sb->disks + i;
1406     		dev = MKDEV(desc->major, desc->minor);
1407     
1408     		if (dev == MKDEV(0,0))
1409     			continue;
1410     
1411     		if (disk_faulty(desc)) {
1412     			MD_BUG();
1413     			goto abort;
1414     		}
1415     
1416     		rdev = find_rdev(mddev, dev);
1417     		if (!rdev) {
1418     			MD_BUG();
1419     			goto abort;
1420     		}
1421     		/*
1422     		 * In the case of Multipath-IO, we have no
1423     		 * other information source to find out which
1424     		 * disk is which, only the position of the device
1425     		 * in the superblock:
1426     		 */
1427     		if (mddev->sb->level == -4) {
1428     			if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
1429     				MD_BUG();
1430     				goto abort;
1431     			}
1432     			rdev->desc_nr = i;
1433     			if (!first)
1434     				rdev->alias_device = 1;
1435     			else
1436     				first = 0;
1437     		}
1438     	}
1439      
1440     	/*
1441     	 * Kick all rdevs that are not in the
1442     	 * descriptor array:
1443     	 */
1444     	ITERATE_RDEV(mddev,rdev,tmp) {
1445     		if (rdev->desc_nr == -1)
1446     			kick_rdev_from_array(rdev);
1447     	}
1448      
1449     	/*
1450     	 * Do a final reality check.
1451     	 */
1452     	if (mddev->sb->level != -4) {
1453     		ITERATE_RDEV(mddev,rdev,tmp) {
1454     			if (rdev->desc_nr == -1) {
1455     				MD_BUG();
1456     				goto abort;
1457     			}
1458     			/*
1459     			 * is the desc_nr unique?
1460     			 */
1461     			ITERATE_RDEV(mddev,rdev2,tmp2) {
1462     				if ((rdev2 != rdev) &&
1463     						(rdev2->desc_nr == rdev->desc_nr)) {
1464     					MD_BUG();
1465     					goto abort;
1466     				}
1467     			}
1468     			/*
1469     			 * is the device unique?
1470     			 */
1471     			ITERATE_RDEV(mddev,rdev2,tmp2) {
1472     				if ((rdev2 != rdev) &&
1473     						(rdev2->dev == rdev->dev)) {
1474     					MD_BUG();
1475     					goto abort;
1476     				}
1477     			}
1478     		}
1479     	}
1480     
1481     	/*
1482     	 * Check if we can support this RAID array
1483     	 */
1484     	if (sb->major_version != MD_MAJOR_VERSION ||
1485     			sb->minor_version > MD_MINOR_VERSION) {
1486     
1487     		printk (OLD_VERSION, mdidx(mddev), sb->major_version,
1488     				sb->minor_version, sb->patch_version);
1489     		goto abort;
1490     	}
1491     
1492     	if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
1493     			(sb->level == 4) || (sb->level == 5)))
1494     		printk (NOT_CLEAN_IGNORE, mdidx(mddev));
1495     
1496     	return 0;
1497     abort:
1498     	return 1;
1499     }
1500     
1501     #undef INCONSISTENT
1502     #undef OUT_OF_DATE
1503     #undef OLD_VERSION
1504     #undef OLD_LEVEL
1505     
1506     static int device_size_calculation (mddev_t * mddev)
1507     {
1508     	int data_disks = 0, persistent;
1509     	unsigned int readahead;
1510     	mdp_super_t *sb = mddev->sb;
1511     	struct md_list_head *tmp;
1512     	mdk_rdev_t *rdev;
1513     
1514     	/*
1515     	 * Do device size calculation. Bail out if too small.
1516     	 * (we have to do this after having validated chunk_size,
1517     	 * because device size has to be modulo chunk_size)
1518     	 */
1519     	persistent = !mddev->sb->not_persistent;
1520     	ITERATE_RDEV(mddev,rdev,tmp) {
1521     		if (rdev->faulty)
1522     			continue;
1523     		if (rdev->size) {
1524     			MD_BUG();
1525     			continue;
1526     		}
1527     		rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
1528     		if (rdev->size < sb->chunk_size / 1024) {
1529     			printk (KERN_WARNING
1530     				"md: Dev %s smaller than chunk_size: %ldk < %dk\n",
1531     				partition_name(rdev->dev),
1532     				rdev->size, sb->chunk_size / 1024);
1533     			return -EINVAL;
1534     		}
1535     	}
1536     
1537     	switch (sb->level) {
1538     		case -4:
1539     			data_disks = 1;
1540     			break;
1541     		case -3:
1542     			data_disks = 1;
1543     			break;
1544     		case -2:
1545     			data_disks = 1;
1546     			break;
1547     		case -1:
1548     			zoned_raid_size(mddev);
1549     			data_disks = 1;
1550     			break;
1551     		case 0:
1552     			zoned_raid_size(mddev);
1553     			data_disks = sb->raid_disks;
1554     			break;
1555     		case 1:
1556     			data_disks = 1;
1557     			break;
1558     		case 4:
1559     		case 5:
1560     			data_disks = sb->raid_disks-1;
1561     			break;
1562     		default:
1563     			printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level);
1564     			goto abort;
1565     	}
1566     	if (!md_size[mdidx(mddev)])
1567     		md_size[mdidx(mddev)] = sb->size * data_disks;
1568     
1569     	readahead = MD_READAHEAD;
1570     	if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
1571     		readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
1572     		if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
1573     			readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
1574     	} else {
1575     		// (no multipath branch - it uses the default setting)
1576     		if (sb->level == -3)
1577     			readahead = 0;
1578     	}
1579     	md_maxreadahead[mdidx(mddev)] = readahead;
1580     
1581     	printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
1582     		mdidx(mddev), readahead*(PAGE_SIZE/1024));
1583     
1584     	printk(KERN_INFO
1585     		"md%d: %d data-disks, max readahead per data-disk: %ldk\n",
1586     			mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
1587     	return 0;
1588     abort:
1589     	return 1;
1590     }
1591     
1592     
1593     #define TOO_BIG_CHUNKSIZE KERN_ERR \
1594     "too big chunk_size: %d > %d\n"
1595     
1596     #define TOO_SMALL_CHUNKSIZE KERN_ERR \
1597     "too small chunk_size: %d < %ld\n"
1598     
1599     #define BAD_CHUNKSIZE KERN_ERR \
1600     "no chunksize specified, see 'man raidtab'\n"
1601     
1602     static int do_md_run (mddev_t * mddev)
1603     {
1604     	int pnum, err;
1605     	int chunk_size;
1606     	struct md_list_head *tmp;
1607     	mdk_rdev_t *rdev;
1608     
1609     
1610     	if (!mddev->nb_dev) {
1611     		MD_BUG();
1612     		return -EINVAL;
1613     	}
1614     
1615     	if (mddev->pers)
1616     		return -EBUSY;
1617     
1618     	/*
1619     	 * Resize disks to align partitions size on a given
1620     	 * chunk size.
1621     	 */
1622     	md_size[mdidx(mddev)] = 0;
1623     
1624     	/*
1625     	 * Analyze all RAID superblock(s)
1626     	 */
1627     	if (analyze_sbs(mddev)) {
1628     		MD_BUG();
1629     		return -EINVAL;
1630     	}
1631     
1632     	chunk_size = mddev->sb->chunk_size;
1633     	pnum = level_to_pers(mddev->sb->level);
1634     
1635     	mddev->param.chunk_size = chunk_size;
1636     	mddev->param.personality = pnum;
1637     
1638     	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
1639     		if (!chunk_size) {
1640     			/*
1641     			 * 'default chunksize' in the old md code used to
1642     			 * be PAGE_SIZE, baaad.
1643     			 * we abort here to be on the safe side. We dont
1644     			 * want to continue the bad practice.
1645     			 */
1646     			printk(BAD_CHUNKSIZE);
1647     			return -EINVAL;
1648     		}
1649     		if (chunk_size > MAX_CHUNK_SIZE) {
1650     			printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
1651     			return -EINVAL;
1652     		}
1653     		/*
1654     		 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
1655     		 */
1656     		if ( (1 << ffz(~chunk_size)) != chunk_size) {
1657     			MD_BUG();
1658     			return -EINVAL;
1659     		}
1660     		if (chunk_size < PAGE_SIZE) {
1661     			printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
1662     			return -EINVAL;
1663     		}
1664     	} else
1665     		if (chunk_size)
1666     			printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
1667     
1668     	if (pnum >= MAX_PERSONALITY) {
1669     		MD_BUG();
1670     		return -EINVAL;
1671     	}
1672     
1673     	if (!pers[pnum])
1674     	{
1675     #ifdef CONFIG_KMOD
1676     		char module_name[80];
1677     		sprintf (module_name, "md-personality-%d", pnum);
1678     		request_module (module_name);
1679     		if (!pers[pnum])
1680     #endif
1681     		{
1682     			printk(KERN_ERR "md: personality %d is not loaded!\n",
1683     				pnum);
1684     			return -EINVAL;
1685     		}
1686     	}
1687     
1688     	if (device_size_calculation(mddev))
1689     		return -EINVAL;
1690     
1691     	/*
1692     	 * Drop all container device buffers, from now on
1693     	 * the only valid external interface is through the md
1694     	 * device.
1695     	 * Also find largest hardsector size
1696     	 */
1697     	md_hardsect_sizes[mdidx(mddev)] = 512;
1698     	ITERATE_RDEV(mddev,rdev,tmp) {
1699     		if (rdev->faulty)
1700     			continue;
1701     		invalidate_device(rdev->dev, 1);
1702     		if (get_hardsect_size(rdev->dev)
1703     			> md_hardsect_sizes[mdidx(mddev)]) 
1704     			md_hardsect_sizes[mdidx(mddev)] =
1705     				get_hardsect_size(rdev->dev);
1706     	}
1707     	md_blocksizes[mdidx(mddev)] = 1024;
1708     	if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
1709     		md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
1710     	mddev->pers = pers[pnum];
1711     
1712     	err = mddev->pers->run(mddev);
1713     	if (err) {
1714     		printk("md: pers->run() failed ...\n");
1715     		mddev->pers = NULL;
1716     		return -EINVAL;
1717     	}
1718     
1719     	mddev->sb->state &= ~(1 << MD_SB_CLEAN);
1720     	md_update_sb(mddev);
1721     
1722     	/*
1723     	 * md_size has units of 1K blocks, which are
1724     	 * twice as large as sectors.
1725     	 */
1726     	md_hd_struct[mdidx(mddev)].start_sect = 0;
1727     	register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
1728     			1, &md_fops, md_size[mdidx(mddev)]<<1);
1729     
1730     	read_ahead[MD_MAJOR] = 1024;
1731     	return (0);
1732     }
1733     
1734     #undef TOO_BIG_CHUNKSIZE
1735     #undef BAD_CHUNKSIZE
1736     
1737     #define OUT(x) do { err = (x); goto out; } while (0)
1738     
1739     static int restart_array (mddev_t *mddev)
1740     {
1741     	int err = 0;
1742     
1743     	/*
1744     	 * Complain if it has no devices
1745     	 */
1746     	if (!mddev->nb_dev)
1747     		OUT(-ENXIO);
1748     
1749     	if (mddev->pers) {
1750     		if (!mddev->ro)
1751     			OUT(-EBUSY);
1752     
1753     		mddev->ro = 0;
1754     		set_device_ro(mddev_to_kdev(mddev), 0);
1755     
1756     		printk (KERN_INFO
1757     			"md: md%d switched to read-write mode.\n", mdidx(mddev));
1758     		/*
1759     		 * Kick recovery or resync if necessary
1760     		 */
1761     		md_recover_arrays();
1762     		if (mddev->pers->restart_resync)
1763     			mddev->pers->restart_resync(mddev);
1764     	} else {
1765     		printk (KERN_ERR "md: md%d has no personality assigned.\n",
1766     			mdidx(mddev));
1767     		err = -EINVAL;
1768     	}
1769     
1770     out:
1771     	return err;
1772     }
1773     
1774     #define STILL_MOUNTED KERN_WARNING \
1775     "md: md%d still mounted.\n"
1776     #define	STILL_IN_USE \
1777     "md: md%d still in use.\n"
1778     
1779     static int do_md_stop (mddev_t * mddev, int ro)
1780     {
1781     	int err = 0, resync_interrupted = 0;
1782     	kdev_t dev = mddev_to_kdev(mddev);
1783     
1784      	if (atomic_read(&mddev->active)>1) {
1785      		printk(STILL_IN_USE, mdidx(mddev));
1786      		OUT(-EBUSY);
1787      	}
1788     
1789     	if (mddev->pers) {
1790     		/*
1791     		 * It is safe to call stop here, it only frees private
1792     		 * data. Also, it tells us if a device is unstoppable
1793     		 * (eg. resyncing is in progress)
1794     		 */
1795     		if (mddev->pers->stop_resync)
1796     			if (mddev->pers->stop_resync(mddev))
1797     				resync_interrupted = 1;
1798     
1799     		if (mddev->recovery_running)
1800     			md_interrupt_thread(md_recovery_thread);
1801     
1802     		/*
1803     		 * This synchronizes with signal delivery to the
1804     		 * resync or reconstruction thread. It also nicely
1805     		 * hangs the process if some reconstruction has not
1806     		 * finished.
1807     		 */
1808     		down(&mddev->recovery_sem);
1809     		up(&mddev->recovery_sem);
1810     
1811     		invalidate_device(dev, 1);
1812     
1813     		if (ro) {
1814     			if (mddev->ro)
1815     				OUT(-ENXIO);
1816     			mddev->ro = 1;
1817     		} else {
1818     			if (mddev->ro)
1819     				set_device_ro(dev, 0);
1820     			if (mddev->pers->stop(mddev)) {
1821     				if (mddev->ro)
1822     					set_device_ro(dev, 1);
1823     				OUT(-EBUSY);
1824     			}
1825     			if (mddev->ro)
1826     				mddev->ro = 0;
1827     		}
1828     		if (mddev->sb) {
1829     			/*
1830     			 * mark it clean only if there was no resync
1831     			 * interrupted.
1832     			 */
1833     			if (!mddev->recovery_running && !resync_interrupted) {
1834     				printk("md: marking sb clean...\n");
1835     				mddev->sb->state |= 1 << MD_SB_CLEAN;
1836     			}
1837     			md_update_sb(mddev);
1838     		}
1839     		if (ro)
1840     			set_device_ro(dev, 1);
1841     	}
1842     
1843     	/*
1844     	 * Free resources if final stop
1845     	 */
1846     	if (!ro) {
1847     		printk (KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
1848     		free_mddev(mddev);
1849     
1850     	} else
1851     		printk (KERN_INFO
1852     			"md: md%d switched to read-only mode.\n", mdidx(mddev));
1853     out:
1854     	return err;
1855     }
1856     
1857     #undef OUT
1858     
1859     /*
1860      * We have to safely support old arrays too.
1861      */
1862     int detect_old_array (mdp_super_t *sb)
1863     {
1864     	if (sb->major_version > 0)
1865     		return 0;
1866     	if (sb->minor_version >= 90)
1867     		return 0;
1868     
1869     	return -EINVAL;
1870     }
1871     
1872     
1873     static void autorun_array (mddev_t *mddev)
1874     {
1875     	mdk_rdev_t *rdev;
1876     	struct md_list_head *tmp;
1877     	int err;
1878     
1879     	if (mddev->disks.prev == &mddev->disks) {
1880     		MD_BUG();
1881     		return;
1882     	}
1883     
1884     	printk("md: running: ");
1885     
1886     	ITERATE_RDEV(mddev,rdev,tmp) {
1887     		printk("<%s>", partition_name(rdev->dev));
1888     	}
1889     	printk("\n");
1890     
1891     	err = do_md_run (mddev);
1892     	if (err) {
1893     		printk("md :do_md_run() returned %d\n", err);
1894     		/*
1895     		 * prevent the writeback of an unrunnable array
1896     		 */
1897     		mddev->sb_dirty = 0;
1898     		do_md_stop (mddev, 0);
1899     	}
1900     }
1901     
1902     /*
1903      * lets try to run arrays based on all disks that have arrived
1904      * until now. (those are in the ->pending list)
1905      *
1906      * the method: pick the first pending disk, collect all disks with
1907      * the same UUID, remove all from the pending list and put them into
1908      * the 'same_array' list. Then order this list based on superblock
1909      * update time (freshest comes first), kick out 'old' disks and
1910      * compare superblocks. If everything's fine then run it.
1911      *
1912      * If "unit" is allocated, then bump its reference count
1913      */
1914     static void autorun_devices (kdev_t countdev)
1915     {
1916     	struct md_list_head candidates;
1917     	struct md_list_head *tmp;
1918     	mdk_rdev_t *rdev0, *rdev;
1919     	mddev_t *mddev;
1920     	kdev_t md_kdev;
1921     
1922     
1923     	printk("md: autorun ...\n");
1924     	while (pending_raid_disks.next != &pending_raid_disks) {
1925     		rdev0 = md_list_entry(pending_raid_disks.next,
1926     					 mdk_rdev_t, pending);
1927     
1928     		printk("md: considering %s ...\n", partition_name(rdev0->dev));
1929     		MD_INIT_LIST_HEAD(&candidates);
1930     		ITERATE_RDEV_PENDING(rdev,tmp) {
1931     			if (uuid_equal(rdev0, rdev)) {
1932     				if (!sb_equal(rdev0->sb, rdev->sb)) {
1933     					printk("md: %s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev));
1934     					continue;
1935     				}
1936     				printk("md:  adding %s ...\n", partition_name(rdev->dev));
1937     				md_list_del(&rdev->pending);
1938     				md_list_add(&rdev->pending, &candidates);
1939     			}
1940     		}
1941     		/*
1942     		 * now we have a set of devices, with all of them having
1943     		 * mostly sane superblocks. It's time to allocate the
1944     		 * mddev.
1945     		 */
1946     		md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
1947     		mddev = kdev_to_mddev(md_kdev);
1948     		if (mddev) {
1949     			printk("md: md%d already running, cannot run %s\n",
1950     				 mdidx(mddev), partition_name(rdev0->dev));
1951     			ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
1952     				export_rdev(rdev);
1953     			continue;
1954     		}
1955     		mddev = alloc_mddev(md_kdev);
1956      		if (mddev == NULL) {
1957      			printk("md: cannot allocate memory for md drive.\n");
1958      			break;
1959      		}
1960      		if (md_kdev == countdev)
1961      			atomic_inc(&mddev->active);
1962     		printk("md: created md%d\n", mdidx(mddev));
1963     		ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
1964     			bind_rdev_to_array(rdev, mddev);
1965     			md_list_del(&rdev->pending);
1966     			MD_INIT_LIST_HEAD(&rdev->pending);
1967     		}
1968     		autorun_array(mddev);
1969     	}
1970     	printk("md: ... autorun DONE.\n");
1971     }
1972     
1973     /*
1974      * import RAID devices based on one partition
1975      * if possible, the array gets run as well.
1976      */
1977     
1978     #define BAD_VERSION KERN_ERR \
1979     "md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
1980     
1981     #define OUT_OF_MEM KERN_ALERT \
1982     "md: out of memory.\n"
1983     
1984     #define NO_DEVICE KERN_ERR \
1985     "md: disabled device %s\n"
1986     
1987     #define AUTOADD_FAILED KERN_ERR \
1988     "md: auto-adding devices to md%d FAILED (error %d).\n"
1989     
1990     #define AUTOADD_FAILED_USED KERN_ERR \
1991     "md: cannot auto-add device %s to md%d, already used.\n"
1992     
1993     #define AUTORUN_FAILED KERN_ERR \
1994     "md: auto-running md%d FAILED (error %d).\n"
1995     
1996     #define MDDEV_BUSY KERN_ERR \
1997     "md: cannot auto-add to md%d, already running.\n"
1998     
1999     #define AUTOADDING KERN_INFO \
2000     "md: auto-adding devices to md%d, based on %s's superblock.\n"
2001     
2002     #define AUTORUNNING KERN_INFO \
2003     "md: auto-running md%d.\n"
2004     
2005     static int autostart_array (kdev_t startdev, kdev_t countdev)
2006     {
2007     	int err = -EINVAL, i;
2008     	mdp_super_t *sb = NULL;
2009     	mdk_rdev_t *start_rdev = NULL, *rdev;
2010     
2011     	if (md_import_device(startdev, 1)) {
2012     		printk("md: could not import %s!\n", partition_name(startdev));
2013     		goto abort;
2014     	}
2015     
2016     	start_rdev = find_rdev_all(startdev);
2017     	if (!start_rdev) {
2018     		MD_BUG();
2019     		goto abort;
2020     	}
2021     	if (start_rdev->faulty) {
2022     		printk("md: can not autostart based on faulty %s!\n",
2023     						partition_name(startdev));
2024     		goto abort;
2025     	}
2026     	md_list_add(&start_rdev->pending, &pending_raid_disks);
2027     
2028     	sb = start_rdev->sb;
2029     
2030     	err = detect_old_array(sb);
2031     	if (err) {
2032     		printk("md: array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n");
2033     		goto abort;
2034     	}
2035     
2036     	for (i = 0; i < MD_SB_DISKS; i++) {
2037     		mdp_disk_t *desc;
2038     		kdev_t dev;
2039     
2040     		desc = sb->disks + i;
2041     		dev = MKDEV(desc->major, desc->minor);
2042     
2043     		if (dev == MKDEV(0,0))
2044     			continue;
2045     		if (dev == startdev)
2046     			continue;
2047     		if (md_import_device(dev, 1)) {
2048     			printk("md: could not import %s, trying to run array nevertheless.\n", partition_name(dev));
2049     			continue;
2050     		}
2051     		rdev = find_rdev_all(dev);
2052     		if (!rdev) {
2053     			MD_BUG();
2054     			goto abort;
2055     		}
2056     		md_list_add(&rdev->pending, &pending_raid_disks);
2057     	}
2058     
2059     	/*
2060     	 * possibly return codes
2061     	 */
2062     	autorun_devices(countdev);
2063     	return 0;
2064     
2065     abort:
2066     	if (start_rdev)
2067     		export_rdev(start_rdev);
2068     	return err;
2069     }
2070     
2071     #undef BAD_VERSION
2072     #undef OUT_OF_MEM
2073     #undef NO_DEVICE
2074     #undef AUTOADD_FAILED_USED
2075     #undef AUTOADD_FAILED
2076     #undef AUTORUN_FAILED
2077     #undef AUTOADDING
2078     #undef AUTORUNNING
2079     
2080     
2081     static int get_version (void * arg)
2082     {
2083     	mdu_version_t ver;
2084     
2085     	ver.major = MD_MAJOR_VERSION;
2086     	ver.minor = MD_MINOR_VERSION;
2087     	ver.patchlevel = MD_PATCHLEVEL_VERSION;
2088     
2089     	if (md_copy_to_user(arg, &ver, sizeof(ver)))
2090     		return -EFAULT;
2091     
2092     	return 0;
2093     }
2094     
2095     #define SET_FROM_SB(x) info.x = mddev->sb->x
2096     static int get_array_info (mddev_t * mddev, void * arg)
2097     {
2098     	mdu_array_info_t info;
2099     
2100     	if (!mddev->sb) {
2101     		MD_BUG();
2102     		return -EINVAL;
2103     	}
2104     
2105     	SET_FROM_SB(major_version);
2106     	SET_FROM_SB(minor_version);
2107     	SET_FROM_SB(patch_version);
2108     	SET_FROM_SB(ctime);
2109     	SET_FROM_SB(level);
2110     	SET_FROM_SB(size);
2111     	SET_FROM_SB(nr_disks);
2112     	SET_FROM_SB(raid_disks);
2113     	SET_FROM_SB(md_minor);
2114     	SET_FROM_SB(not_persistent);
2115     
2116     	SET_FROM_SB(utime);
2117     	SET_FROM_SB(state);
2118     	SET_FROM_SB(active_disks);
2119     	SET_FROM_SB(working_disks);
2120     	SET_FROM_SB(failed_disks);
2121     	SET_FROM_SB(spare_disks);
2122     
2123     	SET_FROM_SB(layout);
2124     	SET_FROM_SB(chunk_size);
2125     
2126     	if (md_copy_to_user(arg, &info, sizeof(info)))
2127     		return -EFAULT;
2128     
2129     	return 0;
2130     }
2131     #undef SET_FROM_SB
2132     
2133     #define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
2134     static int get_disk_info (mddev_t * mddev, void * arg)
2135     {
2136     	mdu_disk_info_t info;
2137     	unsigned int nr;
2138     
2139     	if (!mddev->sb)
2140     		return -EINVAL;
2141     
2142     	if (md_copy_from_user(&info, arg, sizeof(info)))
2143     		return -EFAULT;
2144     
2145     	nr = info.number;
2146     	if (nr >= MD_SB_DISKS)
2147     		return -EINVAL;
2148     
2149     	SET_FROM_SB(major);
2150     	SET_FROM_SB(minor);
2151     	SET_FROM_SB(raid_disk);
2152     	SET_FROM_SB(state);
2153     
2154     	if (md_copy_to_user(arg, &info, sizeof(info)))
2155     		return -EFAULT;
2156     
2157     	return 0;
2158     }
2159     #undef SET_FROM_SB
2160     
2161     #define SET_SB(x) mddev->sb->disks[nr].x = info->x
2162     
2163     static int add_new_disk (mddev_t * mddev, mdu_disk_info_t *info)
2164     {
2165     	int err, size, persistent;
2166     	mdk_rdev_t *rdev;
2167     	unsigned int nr;
2168     	kdev_t dev;
2169     	dev = MKDEV(info->major,info->minor);
2170     
2171     	if (find_rdev_all(dev)) {
2172     		printk("md: device %s already used in a RAID array!\n",
2173     				partition_name(dev));
2174     		return -EBUSY;
2175     	}
2176     	if (!mddev->sb) {
2177     		/* expecting a device which has a superblock */
2178     		err = md_import_device(dev, 1);
2179     		if (err) {
2180     			printk("md: md_import_device returned %d\n", err);
2181     			return -EINVAL;
2182     		}
2183     		rdev = find_rdev_all(dev);
2184     		if (!rdev) {
2185     			MD_BUG();
2186     			return -EINVAL;
2187     		}
2188     		if (mddev->nb_dev) {
2189     			mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
2190     							mdk_rdev_t, same_set);
2191     			if (!uuid_equal(rdev0, rdev)) {
2192     				printk("md: %s has different UUID to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
2193     				export_rdev(rdev);
2194     				return -EINVAL;
2195     			}
2196     			if (!sb_equal(rdev0->sb, rdev->sb)) {
2197     				printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
2198     				export_rdev(rdev);
2199     				return -EINVAL;
2200     			}
2201     		}
2202     		bind_rdev_to_array(rdev, mddev);
2203     		return 0;
2204     	}
2205     
2206     	nr = info->number;
2207     	if (nr >= mddev->sb->nr_disks) {
2208     		MD_BUG();
2209     		return -EINVAL;
2210     	}
2211     
2212     
2213     	SET_SB(number);
2214     	SET_SB(major);
2215     	SET_SB(minor);
2216     	SET_SB(raid_disk);
2217     	SET_SB(state);
2218     
2219     	if ((info->state & (1<<MD_DISK_FAULTY))==0) {
2220     		err = md_import_device (dev, 0);
2221     		if (err) {
2222     			printk("md: error, md_import_device() returned %d\n", err);
2223     			return -EINVAL;
2224     		}
2225     		rdev = find_rdev_all(dev);
2226     		if (!rdev) {
2227     			MD_BUG();
2228     			return -EINVAL;
2229     		}
2230     
2231     		rdev->old_dev = dev;
2232     		rdev->desc_nr = info->number;
2233     
2234     		bind_rdev_to_array(rdev, mddev);
2235     
2236     		persistent = !mddev->sb->not_persistent;
2237     		if (!persistent)
2238     			printk("md: nonpersistent superblock ...\n");
2239     
2240     		size = calc_dev_size(dev, mddev, persistent);
2241     		rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2242     
2243     		if (!mddev->sb->size || (mddev->sb->size > size))
2244     			mddev->sb->size = size;
2245     	}
2246     
2247     	/*
2248     	 * sync all other superblocks with the main superblock
2249     	 */
2250     	sync_sbs(mddev);
2251     
2252     	return 0;
2253     }
2254     #undef SET_SB
2255     
2256     static int hot_generate_error (mddev_t * mddev, kdev_t dev)
2257     {
2258     	struct request_queue *q;
2259     	mdk_rdev_t *rdev;
2260     	mdp_disk_t *disk;
2261      
2262     	if (!mddev->pers)
2263     		return -ENODEV;
2264      
2265     	printk("md: trying to generate %s error in md%d ... \n",
2266     		partition_name(dev), mdidx(mddev));
2267      
2268     	rdev = find_rdev(mddev, dev);
2269     	if (!rdev) {
2270     		MD_BUG();
2271     		return -ENXIO;
2272     	}
2273      
2274     	if (rdev->desc_nr == -1) {
2275     		MD_BUG();
2276     		return -EINVAL;
2277     	}
2278     	disk = &mddev->sb->disks[rdev->desc_nr];
2279     	if (!disk_active(disk))
2280     		return -ENODEV;
2281      
2282     	q = blk_get_queue(rdev->dev);
2283     	if (!q) {
2284     		MD_BUG();
2285     		return -ENODEV;
2286     	}
2287     	printk("md: okay, generating error!\n");
2288     //	q->oneshot_error = 1; // disabled for now
2289      
2290     	return 0;
2291     }
2292     
2293     static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
2294     {
2295     	int err;
2296     	mdk_rdev_t *rdev;
2297     	mdp_disk_t *disk;
2298     
2299     	if (!mddev->pers)
2300     		return -ENODEV;
2301     
2302     	printk("md: trying to remove %s from md%d ... \n",
2303     		partition_name(dev), mdidx(mddev));
2304     
2305     	if (!mddev->pers->diskop) {
2306     		printk("md%d: personality does not support diskops!\n",
2307     								 mdidx(mddev));
2308     		return -EINVAL;
2309     	}
2310     
2311     	rdev = find_rdev(mddev, dev);
2312     	if (!rdev)
2313     		return -ENXIO;
2314     
2315     	if (rdev->desc_nr == -1) {
2316     		MD_BUG();
2317     		return -EINVAL;
2318     	}
2319     	disk = &mddev->sb->disks[rdev->desc_nr];
2320     	if (disk_active(disk)) {
2321     		MD_BUG();
2322     		goto busy;
2323     	}
2324     	if (disk_removed(disk)) {
2325     		MD_BUG();
2326     		return -EINVAL;
2327     	}
2328     	
2329     	err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
2330     	if (err == -EBUSY) {
2331     		MD_BUG();
2332     		goto busy;
2333     	}
2334     	if (err) {
2335     		MD_BUG();
2336     		return -EINVAL;
2337     	}
2338     
2339     	remove_descriptor(disk, mddev->sb);
2340     	kick_rdev_from_array(rdev);
2341     	mddev->sb_dirty = 1;
2342     	md_update_sb(mddev);
2343     
2344     	return 0;
2345     busy:
2346     	printk("md: cannot remove active disk %s from md%d ... \n",
2347     		partition_name(dev), mdidx(mddev));
2348     	return -EBUSY;
2349     }
2350     
2351     static int hot_add_disk (mddev_t * mddev, kdev_t dev)
2352     {
2353     	int i, err, persistent;
2354     	unsigned int size;
2355     	mdk_rdev_t *rdev;
2356     	mdp_disk_t *disk;
2357     
2358     	if (!mddev->pers)
2359     		return -ENODEV;
2360     
2361     	printk("md: trying to hot-add %s to md%d ... \n",
2362     		partition_name(dev), mdidx(mddev));
2363     
2364     	if (!mddev->pers->diskop) {
2365     		printk("md%d: personality does not support diskops!\n",
2366     								 mdidx(mddev));
2367     		return -EINVAL;
2368     	}
2369     
2370     	persistent = !mddev->sb->not_persistent;
2371     	size = calc_dev_size(dev, mddev, persistent);
2372     
2373     	if (size < mddev->sb->size) {
2374     		printk("md%d: disk size %d blocks < array size %d\n",
2375     				mdidx(mddev), size, mddev->sb->size);
2376     		return -ENOSPC;
2377     	}
2378     
2379     	rdev = find_rdev(mddev, dev);
2380     	if (rdev)
2381     		return -EBUSY;
2382     
2383     	err = md_import_device (dev, 0);
2384     	if (err) {
2385     		printk("md: error, md_import_device() returned %d\n", err);
2386     		return -EINVAL;
2387     	}
2388     	rdev = find_rdev_all(dev);
2389     	if (!rdev) {
2390     		MD_BUG();
2391     		return -EINVAL;
2392     	}
2393     	if (rdev->faulty) {
2394     		printk("md: can not hot-add faulty %s disk to md%d!\n",
2395     				partition_name(dev), mdidx(mddev));
2396     		err = -EINVAL;
2397     		goto abort_export;
2398     	}
2399     	bind_rdev_to_array(rdev, mddev);
2400     
2401     	/*
2402     	 * The rest should better be atomic, we can have disk failures
2403     	 * noticed in interrupt contexts ...
2404     	 */
2405     	rdev->old_dev = dev;
2406     	rdev->size = size;
2407     	rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
2408     
2409     	disk = mddev->sb->disks + mddev->sb->raid_disks;
2410     	for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
2411     		disk = mddev->sb->disks + i;
2412     
2413     		if (!disk->major && !disk->minor)
2414     			break;
2415     		if (disk_removed(disk))
2416     			break;
2417     	}
2418     	if (i == MD_SB_DISKS) {
2419     		printk("md%d: can not hot-add to full array!\n", mdidx(mddev));
2420     		err = -EBUSY;
2421     		goto abort_unbind_export;
2422     	}
2423     
2424     	if (disk_removed(disk)) {
2425     		/*
2426     		 * reuse slot
2427     		 */
2428     		if (disk->number != i) {
2429     			MD_BUG();
2430     			err = -EINVAL;
2431     			goto abort_unbind_export;
2432     		}
2433     	} else {
2434     		disk->number = i;
2435     	}
2436     
2437     	disk->raid_disk = disk->number;
2438     	disk->major = MAJOR(dev);
2439     	disk->minor = MINOR(dev);
2440     
2441     	if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
2442     		MD_BUG();
2443     		err = -EINVAL;
2444     		goto abort_unbind_export;
2445     	}
2446     
2447     	mark_disk_spare(disk);
2448     	mddev->sb->nr_disks++;
2449     	mddev->sb->spare_disks++;
2450     	mddev->sb->working_disks++;
2451     
2452     	mddev->sb_dirty = 1;
2453     
2454     	md_update_sb(mddev);
2455     
2456     	/*
2457     	 * Kick recovery, maybe this spare has to be added to the
2458     	 * array immediately.
2459     	 */
2460     	md_recover_arrays();
2461     
2462     	return 0;
2463     
2464     abort_unbind_export:
2465     	unbind_rdev_from_array(rdev);
2466     
2467     abort_export:
2468     	export_rdev(rdev);
2469     	return err;
2470     }
2471     
2472     #define SET_SB(x) mddev->sb->x = info->x
2473     static int set_array_info (mddev_t * mddev, mdu_array_info_t *info)
2474     {
2475     
2476     	if (alloc_array_sb(mddev))
2477     		return -ENOMEM;
2478     
2479     	mddev->sb->major_version = MD_MAJOR_VERSION;
2480     	mddev->sb->minor_version = MD_MINOR_VERSION;
2481     	mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
2482     	mddev->sb->ctime = CURRENT_TIME;
2483     
2484     	SET_SB(level);
2485     	SET_SB(size);
2486     	SET_SB(nr_disks);
2487     	SET_SB(raid_disks);
2488     	SET_SB(md_minor);
2489     	SET_SB(not_persistent);
2490     
2491     	SET_SB(state);
2492     	SET_SB(active_disks);
2493     	SET_SB(working_disks);
2494     	SET_SB(failed_disks);
2495     	SET_SB(spare_disks);
2496     
2497     	SET_SB(layout);
2498     	SET_SB(chunk_size);
2499     
2500     	mddev->sb->md_magic = MD_SB_MAGIC;
2501     
2502     	/*
2503     	 * Generate a 128 bit UUID
2504     	 */
2505     	get_random_bytes(&mddev->sb->set_uuid0, 4);
2506     	get_random_bytes(&mddev->sb->set_uuid1, 4);
2507     	get_random_bytes(&mddev->sb->set_uuid2, 4);
2508     	get_random_bytes(&mddev->sb->set_uuid3, 4);
2509     
2510     	return 0;
2511     }
2512     #undef SET_SB
2513     
2514     static int set_disk_info (mddev_t * mddev, void * arg)
2515     {
2516     	printk("md: not yet");
2517     	return -EINVAL;
2518     }
2519     
2520     static int clear_array (mddev_t * mddev)
2521     {
2522     	printk("md: not yet");
2523     	return -EINVAL;
2524     }
2525     
2526     static int write_raid_info (mddev_t * mddev)
2527     {
2528     	printk("md: not yet");
2529     	return -EINVAL;
2530     }
2531     
2532     static int protect_array (mddev_t * mddev)
2533     {
2534     	printk("md: not yet");
2535     	return -EINVAL;
2536     }
2537     
2538     static int unprotect_array (mddev_t * mddev)
2539     {
2540     	printk("md: not yet");
2541     	return -EINVAL;
2542     }
2543     
2544     static int set_disk_faulty (mddev_t *mddev, kdev_t dev)
2545     {
2546     	int ret;
2547     
2548     	ret = md_error(mddev, dev);
2549     	return ret;
2550     }
2551     
2552     static int md_ioctl (struct inode *inode, struct file *file,
2553     			unsigned int cmd, unsigned long arg)
2554     {
2555     	unsigned int minor;
2556     	int err = 0;
2557     	struct hd_geometry *loc = (struct hd_geometry *) arg;
2558     	mddev_t *mddev = NULL;
2559     	kdev_t dev;
2560     
2561     	if (!md_capable_admin())
2562     		return -EACCES;
2563     
2564     	dev = inode->i_rdev;
2565     	minor = MINOR(dev);
2566     	if (minor >= MAX_MD_DEVS) {
2567     		MD_BUG();
2568     		return -EINVAL;
2569     	}
2570     
2571     	/*
2572     	 * Commands dealing with the RAID driver but not any
2573     	 * particular array:
2574     	 */
2575     	switch (cmd)
2576     	{
2577     		case RAID_VERSION:
2578     			err = get_version((void *)arg);
2579     			goto done;
2580     
2581     		case PRINT_RAID_DEBUG:
2582     			err = 0;
2583     			md_print_devices();
2584     			goto done_unlock;
2585     
2586     #ifndef MODULE
2587     		case RAID_AUTORUN:
2588     			err = 0;
2589     			autostart_arrays();
2590     			goto done;
2591     #endif
2592     
2593     		case BLKGETSIZE:	/* Return device size */
2594     			if (!arg) {
2595     				err = -EINVAL;
2596     				MD_BUG();
2597     				goto abort;
2598     			}
2599     			err = md_put_user(md_hd_struct[minor].nr_sects,
2600     						(long *) arg);
2601     			goto done;
2602     
2603     		case BLKGETSIZE64:	/* Return device size */
2604     			err = md_put_user((u64)md_hd_struct[minor].nr_sects << 9,
2605     						(u64 *) arg);
2606     			goto done;
2607     
2608     		case BLKRAGET:
2609     		case BLKRASET:
2610     		case BLKFLSBUF:
2611     		case BLKBSZGET:
2612     		case BLKBSZSET:
2613     			err = blk_ioctl (dev, cmd, arg);
2614     			goto abort;
2615     
2616     		default:;
2617     	}
2618     
2619     	/*
2620     	 * Commands creating/starting a new array:
2621     	 */
2622     
2623     	mddev = kdev_to_mddev(dev);
2624     
2625     	switch (cmd)
2626     	{
2627     		case SET_ARRAY_INFO:
2628     		case START_ARRAY:
2629     			if (mddev) {
2630     				printk("md: array md%d already exists!\n",
2631     								mdidx(mddev));
2632     				err = -EEXIST;
2633     				goto abort;
2634     			}
2635     		default:;
2636     	}
2637     	switch (cmd)
2638     	{
2639     		case SET_ARRAY_INFO:
2640     			mddev = alloc_mddev(dev);
2641     			if (!mddev) {
2642     				err = -ENOMEM;
2643     				goto abort;
2644     			}
2645     			atomic_inc(&mddev->active);
2646     
2647     			/*
2648     			 * alloc_mddev() should possibly self-lock.
2649     			 */
2650     			err = lock_mddev(mddev);
2651     			if (err) {
2652     				printk("md: ioctl, reason %d, cmd %d\n", err, cmd);
2653     				goto abort;
2654     			}
2655     
2656     			if (mddev->sb) {
2657     				printk("md: array md%d already has a superblock!\n",
2658     					mdidx(mddev));
2659     				err = -EBUSY;
2660     				goto abort_unlock;
2661     			}
2662     			if (arg) {
2663     				mdu_array_info_t info;
2664     				if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
2665     					err = -EFAULT;
2666     					goto abort_unlock;
2667     				}
2668     				err = set_array_info(mddev, &info);
2669     				if (err) {
2670     					printk("md: couldnt set array info. %d\n", err);
2671     					goto abort_unlock;
2672     				}
2673     			}
2674     			goto done_unlock;
2675     
2676     		case START_ARRAY:
2677     			/*
2678     			 * possibly make it lock the array ...
2679     			 */
2680     			err = autostart_array((kdev_t)arg, dev);
2681     			if (err) {
2682     				printk("md: autostart %s failed!\n",
2683     					partition_name((kdev_t)arg));
2684     				goto abort;
2685     			}
2686     			goto done;
2687     
2688     		default:;
2689     	}
2690     
2691     	/*
2692     	 * Commands querying/configuring an existing array:
2693     	 */
2694     
2695     	if (!mddev) {
2696     		err = -ENODEV;
2697     		goto abort;
2698     	}
2699     	err = lock_mddev(mddev);
2700     	if (err) {
2701     		printk("md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
2702     		goto abort;
2703     	}
2704     	/* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
2705     	if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2706     		err = -ENODEV;
2707     		goto abort_unlock;
2708     	}
2709     
2710     	/*
2711     	 * Commands even a read-only array can execute:
2712     	 */
2713     	switch (cmd)
2714     	{
2715     		case GET_ARRAY_INFO:
2716     			err = get_array_info(mddev, (void *)arg);
2717     			goto done_unlock;
2718     
2719     		case GET_DISK_INFO:
2720     			err = get_disk_info(mddev, (void *)arg);
2721     			goto done_unlock;
2722     
2723     		case RESTART_ARRAY_RW:
2724     			err = restart_array(mddev);
2725     			goto done_unlock;
2726     
2727     		case STOP_ARRAY:
2728     			if (!(err = do_md_stop (mddev, 0)))
2729     				mddev = NULL;
2730     			goto done_unlock;
2731     
2732     		case STOP_ARRAY_RO:
2733     			err = do_md_stop (mddev, 1);
2734     			goto done_unlock;
2735     
2736     	/*
2737     	 * We have a problem here : there is no easy way to give a CHS
2738     	 * virtual geometry. We currently pretend that we have a 2 heads
2739     	 * 4 sectors (with a BIG number of cylinders...). This drives
2740     	 * dosfs just mad... ;-)
2741     	 */
2742     		case HDIO_GETGEO:
2743     			if (!loc) {
2744     				err = -EINVAL;
2745     				goto abort_unlock;
2746     			}
2747     			err = md_put_user (2, (char *) &loc->heads);
2748     			if (err)
2749     				goto abort_unlock;
2750     			err = md_put_user (4, (char *) &loc->sectors);
2751     			if (err)
2752     				goto abort_unlock;
2753     			err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
2754     						(short *) &loc->cylinders);
2755     			if (err)
2756     				goto abort_unlock;
2757     			err = md_put_user (md_hd_struct[minor].start_sect,
2758     						(long *) &loc->start);
2759     			goto done_unlock;
2760     	}
2761     
2762     	/*
2763     	 * The remaining ioctls are changing the state of the
2764     	 * superblock, so we do not allow read-only arrays
2765     	 * here:
2766     	 */
2767     	if (mddev->ro) {
2768     		err = -EROFS;
2769     		goto abort_unlock;
2770     	}
2771     
2772     	switch (cmd)
2773     	{
2774     		case CLEAR_ARRAY:
2775     			err = clear_array(mddev);
2776     			goto done_unlock;
2777     
2778     		case ADD_NEW_DISK:
2779     		{
2780     			mdu_disk_info_t info;
2781     			if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
2782     				err = -EFAULT;
2783     			else
2784     				err = add_new_disk(mddev, &info);
2785     			goto done_unlock;
2786     		}
2787     		case HOT_GENERATE_ERROR:
2788     			err = hot_generate_error(mddev, (kdev_t)arg);
2789     			goto done_unlock;
2790     		case HOT_REMOVE_DISK:
2791     			err = hot_remove_disk(mddev, (kdev_t)arg);
2792     			goto done_unlock;
2793     
2794     		case HOT_ADD_DISK:
2795     			err = hot_add_disk(mddev, (kdev_t)arg);
2796     			goto done_unlock;
2797     
2798     		case SET_DISK_INFO:
2799     			err = set_disk_info(mddev, (void *)arg);
2800     			goto done_unlock;
2801     
2802     		case WRITE_RAID_INFO:
2803     			err = write_raid_info(mddev);
2804     			goto done_unlock;
2805     
2806     		case UNPROTECT_ARRAY:
2807     			err = unprotect_array(mddev);
2808     			goto done_unlock;
2809     
2810     		case PROTECT_ARRAY:
2811     			err = protect_array(mddev);
2812     			goto done_unlock;
2813     
2814     		case SET_DISK_FAULTY:
2815     			err = set_disk_faulty(mddev, (kdev_t)arg);
2816     			goto done_unlock;
2817     
2818     		case RUN_ARRAY:
2819     		{
2820     /* The data is never used....
2821     			mdu_param_t param;
2822     			err = md_copy_from_user(&param, (mdu_param_t *)arg,
2823     							 sizeof(param));
2824     			if (err)
2825     				goto abort_unlock;
2826     */
2827     			err = do_md_run (mddev);
2828     			/*
2829     			 * we have to clean up the mess if
2830     			 * the array cannot be run for some
2831     			 * reason ...
2832     			 */
2833     			if (err) {
2834     				mddev->sb_dirty = 0;
2835     				if (!do_md_stop (mddev, 0))
2836     					mddev = NULL;
2837     			}
2838     			goto done_unlock;
2839     		}
2840     
2841     		default:
2842     			printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid);
2843     			err = -EINVAL;
2844     			goto abort_unlock;
2845     	}
2846     
2847     done_unlock:
2848     abort_unlock:
2849     	if (mddev)
2850     		unlock_mddev(mddev);
2851     
2852     	return err;
2853     done:
2854     	if (err)
2855     		printk("md: huh12?\n");
2856     abort:
2857     	return err;
2858     }
2859     
2860     static int md_open (struct inode *inode, struct file *file)
2861     {
2862     	/*
2863     	 * Always succeed, but increment the usage count
2864     	 */
2865     	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2866     	if (mddev)
2867     		atomic_inc(&mddev->active);
2868     	return (0);
2869     }
2870     
2871     static int md_release (struct inode *inode, struct file * file)
2872     {
2873     	mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
2874     	if (mddev)
2875     		atomic_dec(&mddev->active);
2876     	return 0;
2877     }
2878     
2879     static struct block_device_operations md_fops=
2880     {
2881     	open:		md_open,
2882     	release:	md_release,
2883     	ioctl:		md_ioctl,
2884     };
2885     
2886     
2887     int md_thread(void * arg)
2888     {
2889     	mdk_thread_t *thread = arg;
2890     
2891     	md_lock_kernel();
2892     
2893     	/*
2894     	 * Detach thread
2895     	 */
2896     
2897     	daemonize();
2898     
2899     	sprintf(current->comm, thread->name);
2900     	md_init_signals();
2901     	md_flush_signals();
2902     	thread->tsk = current;
2903     
2904     	/*
2905     	 * md_thread is a 'system-thread', it's priority should be very
2906     	 * high. We avoid resource deadlocks individually in each
2907     	 * raid personality. (RAID5 does preallocation) We also use RR and
2908     	 * the very same RT priority as kswapd, thus we will never get
2909     	 * into a priority inversion deadlock.
2910     	 *
2911     	 * we definitely have to have equal or higher priority than
2912     	 * bdflush, otherwise bdflush will deadlock if there are too
2913     	 * many dirty RAID5 blocks.
2914     	 */
2915     	current->policy = SCHED_OTHER;
2916     	current->nice = -20;
2917     	md_unlock_kernel();
2918     
2919     	complete(thread->event);
2920     	while (thread->run) {
2921     		void (*run)(void *data);
2922     		DECLARE_WAITQUEUE(wait, current);
2923     
2924     		add_wait_queue(&thread->wqueue, &wait);
2925     		set_task_state(current, TASK_INTERRUPTIBLE);
2926     		if (!test_bit(THREAD_WAKEUP, &thread->flags)) {
2927     			dprintk("md: thread %p went to sleep.\n", thread);
2928     			schedule();
2929     			dprintk("md: thread %p woke up.\n", thread);
2930     		}
2931     		current->state = TASK_RUNNING;
2932     		remove_wait_queue(&thread->wqueue, &wait);
2933     		clear_bit(THREAD_WAKEUP, &thread->flags);
2934     
2935     		run = thread->run;
2936     		if (run) {
2937     			run(thread->data);
2938     			run_task_queue(&tq_disk);
2939     		}
2940     		if (md_signal_pending(current)) {
2941     			printk("md: %8s(%d) flushing signals.\n", current->comm,
2942     				current->pid);
2943     			md_flush_signals();
2944     		}
2945     	}
2946     	complete(thread->event);
2947     	return 0;
2948     }
2949     
2950     void md_wakeup_thread(mdk_thread_t *thread)
2951     {
2952     	dprintk("md: waking up MD thread %p.\n", thread);
2953     	set_bit(THREAD_WAKEUP, &thread->flags);
2954     	wake_up(&thread->wqueue);
2955     }
2956     
2957     mdk_thread_t *md_register_thread (void (*run) (void *),
2958     						void *data, const char *name)
2959     {
2960     	mdk_thread_t *thread;
2961     	int ret;
2962     	struct completion event;
2963     	
2964     	thread = (mdk_thread_t *) kmalloc
2965     				(sizeof(mdk_thread_t), GFP_KERNEL);
2966     	if (!thread)
2967     		return NULL;
2968     	
2969     	memset(thread, 0, sizeof(mdk_thread_t));
2970     	md_init_waitqueue_head(&thread->wqueue);
2971     
2972     	init_completion(&event);	
2973     	thread->event = &event;
2974     	thread->run = run;
2975     	thread->data = data;
2976     	thread->name = name;
2977     	ret = kernel_thread(md_thread, thread, 0);
2978     	if (ret < 0) {
2979     		kfree(thread);
2980     		return NULL;
2981     	}
2982     	wait_for_completion(&event);
2983     	return thread;
2984     }
2985     
2986     void md_interrupt_thread (mdk_thread_t *thread)
2987     {
2988     	if (!thread->tsk) {
2989     		MD_BUG();
2990     		return;
2991     	}
2992     	printk("md: interrupting MD-thread pid %d\n", thread->tsk->pid);
2993     	send_sig(SIGKILL, thread->tsk, 1);
2994     }
2995     
2996     void md_unregister_thread (mdk_thread_t *thread)
2997     {
2998     	struct completion event;
2999     
3000     	init_completion(&event);
3001     	
3002     	thread->event = &event;
3003     	thread->run = NULL;
3004     	thread->name = NULL;
3005     	md_interrupt_thread(thread);
3006     	wait_for_completion(&event);
3007     	kfree(thread);
3008     }
3009     
3010     void md_recover_arrays (void)
3011     {
3012     	if (!md_recovery_thread) {
3013     		MD_BUG();
3014     		return;
3015     	}
3016     	md_wakeup_thread(md_recovery_thread);
3017     }
3018     
3019     
3020     int md_error (mddev_t *mddev, kdev_t rdev)
3021     {
3022     	mdk_rdev_t * rrdev;
3023     
3024     /*	printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3));
3025      */
3026     	if (!mddev) {
3027     		MD_BUG();
3028     		return 0;
3029     	}
3030     	rrdev = find_rdev(mddev, rdev);
3031     	if (!rrdev || rrdev->faulty)
3032     		return 0;
3033     	if (mddev->pers->error_handler == NULL
3034     			|| mddev->pers->error_handler(mddev,rdev) <= 0) {
3035     		free_disk_sb(rrdev);
3036     		rrdev->faulty = 1;
3037     	} else
3038     		return 1;
3039     	/*
3040     	 * if recovery was running, stop it now.
3041     	 */
3042     	if (mddev->pers->stop_resync)
3043     		mddev->pers->stop_resync(mddev);
3044     	if (mddev->recovery_running)
3045     		md_interrupt_thread(md_recovery_thread);
3046     	md_recover_arrays();
3047     
3048     	return 0;
3049     }
3050     
3051     static int status_unused (char * page)
3052     {
3053     	int sz = 0, i = 0;
3054     	mdk_rdev_t *rdev;
3055     	struct md_list_head *tmp;
3056     
3057     	sz += sprintf(page + sz, "unused devices: ");
3058     
3059     	ITERATE_RDEV_ALL(rdev,tmp) {
3060     		if (!rdev->same_set.next && !rdev->same_set.prev) {
3061     			/*
3062     			 * The device is not yet used by any array.
3063     			 */
3064     			i++;
3065     			sz += sprintf(page + sz, "%s ",
3066     				partition_name(rdev->dev));
3067     		}
3068     	}
3069     	if (!i)
3070     		sz += sprintf(page + sz, "<none>");
3071     
3072     	sz += sprintf(page + sz, "\n");
3073     	return sz;
3074     }
3075     
3076     
3077     static int status_resync (char * page, mddev_t * mddev)
3078     {
3079     	int sz = 0;
3080     	unsigned long max_blocks, resync, res, dt, db, rt;
3081     
3082     	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
3083     	max_blocks = mddev->sb->size;
3084     
3085     	/*
3086     	 * Should not happen.
3087     	 */		
3088     	if (!max_blocks) {
3089     		MD_BUG();
3090     		return 0;
3091     	}
3092     	res = (resync/1024)*1000/(max_blocks/1024 + 1);
3093     	{
3094     		int i, x = res/50, y = 20-x;
3095     		sz += sprintf(page + sz, "[");
3096     		for (i = 0; i < x; i++)
3097     			sz += sprintf(page + sz, "=");
3098     		sz += sprintf(page + sz, ">");
3099     		for (i = 0; i < y; i++)
3100     			sz += sprintf(page + sz, ".");
3101     		sz += sprintf(page + sz, "] ");
3102     	}
3103     	if (!mddev->recovery_running)
3104     		/*
3105     		 * true resync
3106     		 */
3107     		sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)",
3108     				res/10, res % 10, resync, max_blocks);
3109     	else
3110     		/*
3111     		 * recovery ...
3112     		 */
3113     		sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)",
3114     				res/10, res % 10, resync, max_blocks);
3115     
3116     	/*
3117     	 * We do not want to overflow, so the order of operands and
3118     	 * the * 100 / 100 trick are important. We do a +1 to be
3119     	 * safe against division by zero. We only estimate anyway.
3120     	 *
3121     	 * dt: time from mark until now
3122     	 * db: blocks written from mark until now
3123     	 * rt: remaining time
3124     	 */
3125     	dt = ((jiffies - mddev->resync_mark) / HZ);
3126     	if (!dt) dt++;
3127     	db = resync - (mddev->resync_mark_cnt/2);
3128     	rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
3129     	
3130     	sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
3131     
3132     	sz += sprintf(page + sz, " speed=%ldK/sec", db/dt);
3133     
3134     	return sz;
3135     }
3136     
3137     static int md_status_read_proc(char *page, char **start, off_t off,
3138     			int count, int *eof, void *data)
3139     {
3140     	int sz = 0, j, size;
3141     	struct md_list_head *tmp, *tmp2;
3142     	mdk_rdev_t *rdev;
3143     	mddev_t *mddev;
3144     
3145     	sz += sprintf(page + sz, "Personalities : ");
3146     	for (j = 0; j < MAX_PERSONALITY; j++)
3147     	if (pers[j])
3148     		sz += sprintf(page+sz, "[%s] ", pers[j]->name);
3149     
3150     	sz += sprintf(page+sz, "\n");
3151     
3152     
3153     	sz += sprintf(page+sz, "read_ahead ");
3154     	if (read_ahead[MD_MAJOR] == INT_MAX)
3155     		sz += sprintf(page+sz, "not set\n");
3156     	else
3157     		sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]);
3158     
3159     	ITERATE_MDDEV(mddev,tmp) {
3160     		sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev),
3161     						mddev->pers ? "" : "in");
3162     		if (mddev->pers) {
3163     			if (mddev->ro)	
3164     				sz += sprintf(page + sz, " (read-only)");
3165     			sz += sprintf(page + sz, " %s", mddev->pers->name);
3166     		}
3167     
3168     		size = 0;
3169     		ITERATE_RDEV(mddev,rdev,tmp2) {
3170     			sz += sprintf(page + sz, " %s[%d]",
3171     				partition_name(rdev->dev), rdev->desc_nr);
3172     			if (rdev->faulty) {
3173     				sz += sprintf(page + sz, "(F)");
3174     				continue;
3175     			}
3176     			size += rdev->size;
3177     		}
3178     
3179     		if (mddev->nb_dev) {
3180     			if (mddev->pers)
3181     				sz += sprintf(page + sz, "\n      %d blocks",
3182     						 md_size[mdidx(mddev)]);
3183     			else
3184     				sz += sprintf(page + sz, "\n      %d blocks", size);
3185     		}
3186     
3187     		if (!mddev->pers) {
3188     			sz += sprintf(page+sz, "\n");
3189     			continue;
3190     		}
3191     
3192     		sz += mddev->pers->status (page+sz, mddev);
3193     
3194     		sz += sprintf(page+sz, "\n      ");
3195     		if (mddev->curr_resync) {
3196     			sz += status_resync (page+sz, mddev);
3197     		} else {
3198     			if (md_atomic_read(&mddev->resync_sem.count) != 1)
3199     				sz += sprintf(page + sz, "	resync=DELAYED");
3200     		}
3201     		sz += sprintf(page + sz, "\n");
3202     	}
3203     	sz += status_unused (page + sz);
3204     
3205     	return sz;
3206     }
3207     
3208     int register_md_personality (int pnum, mdk_personality_t *p)
3209     {
3210     	if (pnum >= MAX_PERSONALITY) {
3211     		MD_BUG();
3212     		return -EINVAL;
3213     	}
3214     
3215     	if (pers[pnum]) {
3216     		MD_BUG();
3217     		return -EBUSY;
3218     	}
3219     
3220     	pers[pnum] = p;
3221     	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
3222     	return 0;
3223     }
3224     
3225     int unregister_md_personality (int pnum)
3226     {
3227     	if (pnum >= MAX_PERSONALITY) {
3228     		MD_BUG();
3229     		return -EINVAL;
3230     	}
3231     
3232     	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
3233     	pers[pnum] = NULL;
3234     	return 0;
3235     }
3236     
3237     mdp_disk_t *get_spare(mddev_t *mddev)
3238     {
3239     	mdp_super_t *sb = mddev->sb;
3240     	mdp_disk_t *disk;
3241     	mdk_rdev_t *rdev;
3242     	struct md_list_head *tmp;
3243     
3244     	ITERATE_RDEV(mddev,rdev,tmp) {
3245     		if (rdev->faulty)
3246     			continue;
3247     		if (!rdev->sb) {
3248     			MD_BUG();
3249     			continue;
3250     		}
3251     		disk = &sb->disks[rdev->desc_nr];
3252     		if (disk_faulty(disk)) {
3253     			MD_BUG();
3254     			continue;
3255     		}
3256     		if (disk_active(disk))
3257     			continue;
3258     		return disk;
3259     	}
3260     	return NULL;
3261     }
3262     
3263     static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
3264     void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
3265     {
3266     	unsigned int major = MAJOR(dev);
3267     	unsigned int index;
3268     
3269     	index = disk_index(dev);
3270     	if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3271     		return;
3272     
3273     	sync_io[major][index] += nr_sectors;
3274     }
3275     
3276     static int is_mddev_idle (mddev_t *mddev)
3277     {
3278     	mdk_rdev_t * rdev;
3279     	struct md_list_head *tmp;
3280     	int idle;
3281     	unsigned long curr_events;
3282     
3283     	idle = 1;
3284     	ITERATE_RDEV(mddev,rdev,tmp) {
3285     		int major = MAJOR(rdev->dev);
3286     		int idx = disk_index(rdev->dev);
3287     
3288     		if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
3289     			continue;
3290     
3291     		curr_events = kstat.dk_drive_rblk[major][idx] +
3292     						kstat.dk_drive_wblk[major][idx] ;
3293     		curr_events -= sync_io[major][idx];
3294     //		printk("md: events(major: %d, idx: %d): %ld\n", major, idx, curr_events);
3295     		if ((curr_events - rdev->last_events) > 32) {
3296     //			printk("!I(%ld)%x", curr_events - rdev->last_events, rdev->dev);
3297     			rdev->last_events = curr_events;
3298     			idle = 0;
3299     		}
3300     	}
3301     	return idle;
3302     }
3303     
3304     MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3305     
3306     void md_done_sync(mddev_t *mddev, int blocks, int ok)
3307     {
3308     	/* another "blocks" (512byte) blocks have been synced */
3309     	atomic_sub(blocks, &mddev->recovery_active);
3310     	wake_up(&mddev->recovery_wait);
3311     	if (!ok) {
3312     		// stop recovery, signal do_sync ....
3313     	}
3314     }
3315     
3316     #define SYNC_MARKS	10
3317     #define	SYNC_MARK_STEP	(3*HZ)
3318     int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
3319     {
3320     	mddev_t *mddev2;
3321     	unsigned int max_sectors, currspeed,
3322     		j, window, err, serialize;
3323     	unsigned long mark[SYNC_MARKS];
3324     	unsigned long mark_cnt[SYNC_MARKS];	
3325     	int last_mark,m;
3326     	struct md_list_head *tmp;
3327     	unsigned long last_check;
3328     
3329     
3330     	err = down_interruptible(&mddev->resync_sem);
3331     	if (err)
3332     		goto out_nolock;
3333     
3334     recheck:
3335     	serialize = 0;
3336     	ITERATE_MDDEV(mddev2,tmp) {
3337     		if (mddev2 == mddev)
3338     			continue;
3339     		if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
3340     			printk(KERN_INFO "md: delaying resync of md%d until md%d has finished resync (they share one or more physical units)\n", mdidx(mddev), mdidx(mddev2));
3341     			serialize = 1;
3342     			break;
3343     		}
3344     	}
3345     	if (serialize) {
3346     		interruptible_sleep_on(&resync_wait);
3347     		if (md_signal_pending(current)) {
3348     			md_flush_signals();
3349     			err = -EINTR;
3350     			goto out;
3351     		}
3352     		goto recheck;
3353     	}
3354     
3355     	mddev->curr_resync = 1;
3356     
3357     	max_sectors = mddev->sb->size<<1;
3358     
3359     	printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
3360     	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
3361     						sysctl_speed_limit_min);
3362     	printk(KERN_INFO "md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max);
3363     
3364     	/*
3365     	 * Resync has low priority.
3366     	 */
3367     	current->nice = 19;
3368     
3369     	is_mddev_idle(mddev); /* this also initializes IO event counters */
3370     	for (m = 0; m < SYNC_MARKS; m++) {
3371     		mark[m] = jiffies;
3372     		mark_cnt[m] = 0;
3373     	}
3374     	last_mark = 0;
3375     	mddev->resync_mark = mark[last_mark];
3376     	mddev->resync_mark_cnt = mark_cnt[last_mark];
3377     
3378     	/*
3379     	 * Tune reconstruction:
3380     	 */
3381     	window = MAX_READAHEAD*(PAGE_SIZE/512);
3382     	printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",window/2,max_sectors/2);
3383     
3384     	atomic_set(&mddev->recovery_active, 0);
3385     	init_waitqueue_head(&mddev->recovery_wait);
3386     	last_check = 0;
3387     	for (j = 0; j < max_sectors;) {
3388     		int sectors;
3389     
3390     		sectors = mddev->pers->sync_request(mddev, j);
3391     
3392     		if (sectors < 0) {
3393     			err = sectors;
3394     			goto out;
3395     		}
3396     		atomic_add(sectors, &mddev->recovery_active);
3397     		j += sectors;
3398     		mddev->curr_resync = j;
3399     
3400     		if (last_check + window > j)
3401     			continue;
3402     
3403     		last_check = j;
3404     		
3405     		run_task_queue(&tq_disk);
3406     
3407     	repeat:
3408     		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
3409     			/* step marks */
3410     			int next = (last_mark+1) % SYNC_MARKS;
3411     			
3412     			mddev->resync_mark = mark[next];
3413     			mddev->resync_mark_cnt = mark_cnt[next];
3414     			mark[next] = jiffies;
3415     			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
3416     			last_mark = next;
3417     		}
3418     
3419     
3420     		if (md_signal_pending(current)) {
3421     			/*
3422     			 * got a signal, exit.
3423     			 */
3424     			mddev->curr_resync = 0;
3425     			printk("md: md_do_sync() got signal ... exiting\n");
3426     			md_flush_signals();
3427     			err = -EINTR;
3428     			goto out;
3429     		}
3430     
3431     		/*
3432     		 * this loop exits only if either when we are slower than
3433     		 * the 'hard' speed limit, or the system was IO-idle for
3434     		 * a jiffy.
3435     		 * the system might be non-idle CPU-wise, but we only care
3436     		 * about not overloading the IO subsystem. (things like an
3437     		 * e2fsck being done on the RAID array should execute fast)
3438     		 */
3439     		if (md_need_resched(current))
3440     			schedule();
3441     
3442     		currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
3443     
3444     		if (currspeed > sysctl_speed_limit_min) {
3445     			current->nice = 19;
3446     
3447     			if ((currspeed > sysctl_speed_limit_max) ||
3448     					!is_mddev_idle(mddev)) {
3449     				current->state = TASK_INTERRUPTIBLE;
3450     				md_schedule_timeout(HZ/4);
3451     				goto repeat;
3452     			}
3453     		} else
3454     			current->nice = -20;
3455     	}
3456     	printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
3457     	err = 0;
3458     	/*
3459     	 * this also signals 'finished resyncing' to md_stop
3460     	 */
3461     out:
3462     	wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
3463     	up(&mddev->resync_sem);
3464     out_nolock:
3465     	mddev->curr_resync = 0;
3466     	wake_up(&resync_wait);
3467     	return err;
3468     }
3469     
3470     
3471     /*
3472      * This is a kernel thread which syncs a spare disk with the active array
3473      *
3474      * the amount of foolproofing might seem to be a tad excessive, but an
3475      * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
3476      * of my root partition with the first 0.5 gigs of my /home partition ... so
3477      * i'm a bit nervous ;)
3478      */
3479     void md_do_recovery (void *data)
3480     {
3481     	int err;
3482     	mddev_t *mddev;
3483     	mdp_super_t *sb;
3484     	mdp_disk_t *spare;
3485     	struct md_list_head *tmp;
3486     
3487     	printk(KERN_INFO "md: recovery thread got woken up ...\n");
3488     restart:
3489     	ITERATE_MDDEV(mddev,tmp) {
3490     		sb = mddev->sb;
3491     		if (!sb)
3492     			continue;
3493     		if (mddev->recovery_running)
3494     			continue;
3495     		if (sb->active_disks == sb->raid_disks)
3496     			continue;
3497     		if (!sb->spare_disks) {
3498     			printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev));
3499     			continue;
3500     		}
3501     		/*
3502     		 * now here we get the spare and resync it.
3503     		 */
3504     		if ((spare = get_spare(mddev)) == NULL)
3505     			continue;
3506     		printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3507     		if (!mddev->pers->diskop)
3508     			continue;
3509     		if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
3510     			continue;
3511     		down(&mddev->recovery_sem);
3512     		mddev->recovery_running = 1;
3513     		err = md_do_sync(mddev, spare);
3514     		if (err == -EIO) {
3515     			printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
3516     			if (!disk_faulty(spare)) {
3517     				mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
3518     				mark_disk_faulty(spare);
3519     				mark_disk_nonsync(spare);
3520     				mark_disk_inactive(spare);
3521     				sb->spare_disks--;
3522     				sb->working_disks--;
3523     				sb->failed_disks++;
3524     			}
3525     		} else
3526     			if (disk_faulty(spare))
3527     				mddev->pers->diskop(mddev, &spare,
3528     						DISKOP_SPARE_INACTIVE);
3529     		if (err == -EINTR || err == -ENOMEM) {
3530     			/*
3531     			 * Recovery got interrupted, or ran out of mem ...
3532     			 * signal back that we have finished using the array.
3533     			 */
3534     			mddev->pers->diskop(mddev, &spare,
3535     							 DISKOP_SPARE_INACTIVE);
3536     			up(&mddev->recovery_sem);
3537     			mddev->recovery_running = 0;
3538     			continue;
3539     		} else {
3540     			mddev->recovery_running = 0;
3541     			up(&mddev->recovery_sem);
3542     		}
3543     		if (!disk_faulty(spare)) {
3544     			/*
3545     			 * the SPARE_ACTIVE diskop possibly changes the
3546     			 * pointer too
3547     			 */
3548     			mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
3549     			mark_disk_sync(spare);
3550     			mark_disk_active(spare);
3551     			sb->active_disks++;
3552     			sb->spare_disks--;
3553     		}
3554     		mddev->sb_dirty = 1;
3555     		md_update_sb(mddev);
3556     		goto restart;
3557     	}
3558     	printk(KERN_INFO "md: recovery thread finished ...\n");
3559     	
3560     }
3561     
3562     int md_notify_reboot(struct notifier_block *this,
3563     					unsigned long code, void *x)
3564     {
3565     	struct md_list_head *tmp;
3566     	mddev_t *mddev;
3567     
3568     	if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
3569     					|| (code == MD_SYS_POWER_OFF)) {
3570     
3571     		printk(KERN_INFO "md: stopping all md devices.\n");
3572     
3573     		ITERATE_MDDEV(mddev,tmp)
3574     			do_md_stop (mddev, 1);
3575     		/*
3576     		 * certain more exotic SCSI devices are known to be
3577     		 * volatile wrt too early system reboots. While the
3578     		 * right place to handle this issue is the given
3579     		 * driver, we do want to have a safe RAID driver ...
3580     		 */
3581     		md_mdelay(1000*1);
3582     	}
3583     	return NOTIFY_DONE;
3584     }
3585     
3586     struct notifier_block md_notifier = {
3587     	notifier_call:	md_notify_reboot,
3588     	next:		NULL,
3589     	priority:	INT_MAX, /* before any real devices */
3590     };
3591     
3592     static void md_geninit (void)
3593     {
3594     	int i;
3595     
3596     	for(i = 0; i < MAX_MD_DEVS; i++) {
3597     		md_blocksizes[i] = 1024;
3598     		md_size[i] = 0;
3599     		md_hardsect_sizes[i] = 512;
3600     		md_maxreadahead[i] = MD_READAHEAD;
3601     	}
3602     	blksize_size[MAJOR_NR] = md_blocksizes;
3603     	blk_size[MAJOR_NR] = md_size;
3604     	max_readahead[MAJOR_NR] = md_maxreadahead;
3605     	hardsect_size[MAJOR_NR] = md_hardsect_sizes;
3606     
3607     	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3608     
3609     #ifdef CONFIG_PROC_FS
3610     	create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL);
3611     #endif
3612     }
3613     
3614     int md__init md_init (void)
3615     {
3616     	static char * name = "mdrecoveryd";
3617     	int minor;
3618     	
3619     	printk (KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
3620     			MD_MAJOR_VERSION, MD_MINOR_VERSION,
3621     			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3622     
3623     	if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
3624     	{
3625     		printk (KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
3626     		return (-1);
3627     	}
3628     	devfs_handle = devfs_mk_dir (NULL, "md", NULL);
3629     	/* we don't use devfs_register_series because we want to fill md_hd_struct */
3630     	for (minor=0; minor < MAX_MD_DEVS; ++minor) {
3631     		char devname[128];
3632     		sprintf (devname, "%u", minor);
3633     		md_hd_struct[minor].de = devfs_register (devfs_handle,
3634     			devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
3635     			S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
3636     	}
3637     
3638     	/* forward all md request to md_make_request */
3639     	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request);
3640     	
3641     
3642     	read_ahead[MAJOR_NR] = INT_MAX;
3643     
3644     	add_gendisk(&md_gendisk);
3645     
3646     	md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
3647     	if (!md_recovery_thread)
3648     		printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
3649     
3650     	md_register_reboot_notifier(&md_notifier);
3651     	raid_table_header = register_sysctl_table(raid_root_table, 1);
3652     
3653     	md_geninit();
3654     	return (0);
3655     }
3656     
3657     
3658     #ifndef MODULE
3659     
/*
 * When md (and any required personalities) are compiled into the kernel
 * (not as a module), arrays can be assembled at boot time in two ways:
 * with AUTODETECT, where specially marked partitions are registered with
 * md_autodetect_dev(), and with MD_BOOT, where the devices to be collected
 * are given on the boot line with md=.....
 * The code for that is here.
 */
3668     
/* State recorded by raid_setup() while parsing the "raid=" boot option. */
struct {
	int set;		/* set to 1 once raid_setup() has run */
	int noautodetect;	/* non-zero: md_run_setup() skips autostart_arrays() */
} raid_setup_args md__initdata;
3673     
3674     /*
3675      * Searches all registered partitions for autorun RAID arrays
3676      * at boot time.
3677      */
3678     static int detected_devices[128];
3679     static int dev_cnt;
3680     
3681     void md_autodetect_dev (kdev_t dev)
3682     {
3683     	if (dev_cnt >= 0 && dev_cnt < 127)
3684     		detected_devices[dev_cnt++] = dev;
3685     }
3686     
3687     
3688     static void autostart_arrays (void)
3689     {
3690     	mdk_rdev_t *rdev;
3691     	int i;
3692     
3693     	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
3694     
3695     	for (i = 0; i < dev_cnt; i++) {
3696     		kdev_t dev = detected_devices[i];
3697     
3698     		if (md_import_device(dev,1)) {
3699     			printk(KERN_ALERT "md: could not import %s!\n",
3700     				partition_name(dev));
3701     			continue;
3702     		}
3703     		/*
3704     		 * Sanity checks:
3705     		 */
3706     		rdev = find_rdev_all(dev);
3707     		if (!rdev) {
3708     			MD_BUG();
3709     			continue;
3710     		}
3711     		if (rdev->faulty) {
3712     			MD_BUG();
3713     			continue;
3714     		}
3715     		md_list_add(&rdev->pending, &pending_raid_disks);
3716     	}
3717     	dev_cnt = 0;
3718     
3719     	autorun_devices(-1);
3720     }
3721     
/*
 * Per-minor settings collected from "md=" boot options by md_setup() and
 * consumed by md_setup_drive().
 */
static struct {
	char device_set [MAX_MD_DEVS];	/* set to 1 by md_setup_drive() once a device name resolved */
	int pers[MAX_MD_DEVS];		/* personality; 0 selects the superblock ("persistent") path */
	int chunk[MAX_MD_DEVS];		/* chunk size, stored as 1 << (factor+12) */
	char *device_names[MAX_MD_DEVS];	/* comma separated device list; NULL if no md= was given */
} md_setup_args md__initdata;
3728     
3729     /*
3730      * Parse the command-line parameters given our kernel, but do not
3731      * actually try to invoke the MD device now; that is handled by
3732      * md_setup_drive after the low-level disk drivers have initialised.
3733      *
3734      * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
3735      *             assigns the task of parsing integer arguments to the
3736      *             invoked program now).  Added ability to initialise all
3737      *             the MD devices (by specifying multiple "md=" lines)
3738      *             instead of just one.  -- KTK
3739      * 18May2000: Added support for persistant-superblock arrays:
3740      *             md=n,0,factor,fault,device-list   uses RAID0 for device n
3741      *             md=n,-1,factor,fault,device-list  uses LINEAR for device n
3742      *             md=n,device-list      reads a RAID superblock from the devices
3743      *             elements in device-list are read by name_to_kdev_t so can be
3744      *             a hex number or something like /dev/hda1 /dev/sdb
3745      * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
3746      *		Shifted name_to_kdev_t() and related operations to md_set_drive()
3747      *		for later execution. Rewrote section to make devfs compatible.
3748      */
3749     static int md__init md_setup(char *str)
3750     {
3751     	int minor, level, factor, fault;
3752     	char *pername = "";
3753     	char *str1 = str;
3754     
3755     	if (get_option(&str, &minor) != 2) {	/* MD Number */
3756     		printk("md: Too few arguments supplied to md=.\n");
3757     		return 0;
3758     	}
3759     	if (minor >= MAX_MD_DEVS) {
3760     		printk ("md: md=%d, Minor device number too high.\n", minor);
3761     		return 0;
3762     	} else if (md_setup_args.device_names[minor]) {
3763     		printk ("md: md=%d, Specified more then once. Replacing previous definition.\n", minor);
3764     	}
3765     	switch (get_option(&str, &level)) {	/* RAID Personality */
3766     	case 2: /* could be 0 or -1.. */
3767     		if (level == 0 || level == -1) {
3768     			if (get_option(&str, &factor) != 2 ||	/* Chunk Size */
3769     					get_option(&str, &fault) != 2) {
3770     				printk("md: Too few arguments supplied to md=.\n");
3771     				return 0;
3772     			}
3773     			md_setup_args.pers[minor] = level;
3774     			md_setup_args.chunk[minor] = 1 << (factor+12);
3775     			switch(level) {
3776     			case -1:
3777     				level = LINEAR;
3778     				pername = "linear";
3779     				break;
3780     			case 0:
3781     				level = RAID0;
3782     				pername = "raid0";
3783     				break;
3784     			default:
3785     				printk ("md: The kernel has not been configured for raid%d"
3786     					" support!\n", level);
3787     				return 0;
3788     			}
3789     			md_setup_args.pers[minor] = level;
3790     			break;
3791     		}
3792     		/* FALL THROUGH */
3793     	case 1: /* the first device is numeric */
3794     		str = str1;
3795     		/* FALL THROUGH */
3796     	case 0:
3797     		md_setup_args.pers[minor] = 0;
3798     		pername="super-block";
3799     	}
3800     	
3801     	printk ("md: Will configure md%d (%s) from %s, below.\n",
3802     		minor, pername, str);
3803     	md_setup_args.device_names[minor] = str;
3804     			
3805     	return 1;
3806     }
3807     
3808     extern kdev_t name_to_kdev_t(char *line) md__init;
3809     void md__init md_setup_drive(void)
3810     {
3811     	int minor, i;
3812     	kdev_t dev;
3813     	mddev_t*mddev;
3814     	kdev_t devices[MD_SB_DISKS+1];
3815     
3816     	for (minor = 0; minor < MAX_MD_DEVS; minor++) {
3817     		int err = 0;
3818     		char *devname;
3819     		mdu_disk_info_t dinfo;
3820     
3821     		if ((devname = md_setup_args.device_names[minor]) == 0)	continue;
3822     	
3823     		for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
3824     
3825     			char *p;
3826     			void *handle;
3827     	
3828     			if ((p = strchr(devname, ',')) != NULL)
3829     				*p++ = 0;
3830     
3831     			dev = name_to_kdev_t(devname);
3832     			handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
3833     							DEVFS_SPECIAL_BLK, 1);
3834     			if (handle != 0) {
3835     				unsigned major, minor;
3836     				devfs_get_maj_min(handle, &major, &minor);
3837     				dev = MKDEV(major, minor);
3838     			}
3839     			if (dev == 0) {
3840     				printk ("md: Unknown device name: %s\n", devname);
3841     				break;
3842     			}
3843     			
3844     			devices[i] = dev;
3845     			md_setup_args.device_set[minor] = 1;
3846     			
3847     			devname = p;
3848     		}
3849     		devices[i] = 0;
3850     		
3851     		if (md_setup_args.device_set[minor] == 0)
3852     			continue;
3853     		
3854     		if (mddev_map[minor].mddev) {
3855     			printk("md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n", minor);
3856     			continue;
3857     		}
3858     		printk("md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
3859     		
3860     		mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
3861     		if (mddev == NULL) {
3862     			printk("md: kmalloc failed - cannot start array %d\n", minor);
3863     			continue;
3864     		}
3865     		if (md_setup_args.pers[minor]) {
3866     			/* non-persistent */
3867     			mdu_array_info_t ainfo;
3868     			ainfo.level = pers_to_level(md_setup_args.pers[minor]);
3869     			ainfo.size = 0;
3870     			ainfo.nr_disks =0;
3871     			ainfo.raid_disks =0;
3872     			ainfo.md_minor =minor;
3873     			ainfo.not_persistent = 1;
3874     
3875     			ainfo.state = (1 << MD_SB_CLEAN);
3876     			ainfo.active_disks = 0;
3877     			ainfo.working_disks = 0;
3878     			ainfo.failed_disks = 0;
3879     			ainfo.spare_disks = 0;
3880     			ainfo.layout = 0;
3881     			ainfo.chunk_size = md_setup_args.chunk[minor];
3882     			err = set_array_info(mddev, &ainfo);
3883     			for (i = 0; !err && (dev = devices[i]); i++) {
3884     				dinfo.number = i;
3885     				dinfo.raid_disk = i;
3886     				dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
3887     				dinfo.major = MAJOR(dev);
3888     				dinfo.minor = MINOR(dev);
3889     				mddev->sb->nr_disks++;
3890     				mddev->sb->raid_disks++;
3891     				mddev->sb->active_disks++;
3892     				mddev->sb->working_disks++;
3893     				err = add_new_disk (mddev, &dinfo);
3894     			}
3895     		} else {
3896     			/* persistent */
3897     			for (i = 0; (dev = devices[i]); i++) {
3898     				dinfo.major = MAJOR(dev);
3899     				dinfo.minor = MINOR(dev);
3900     				add_new_disk (mddev, &dinfo);
3901     			}
3902     		}
3903     		if (!err)
3904     			err = do_md_run(mddev);
3905     		if (err) {
3906     			mddev->sb_dirty = 0;
3907     			do_md_stop(mddev, 0);
3908     			printk("md: starting md%d failed\n", minor);
3909     		}
3910     	}
3911     }
3912     
3913     static int md__init raid_setup(char *str)
3914     {
3915     	int len, pos;
3916     
3917     	len = strlen(str) + 1;
3918     	pos = 0;
3919     
3920     	while (pos < len) {
3921     		char *comma = strchr(str+pos, ',');
3922     		int wlen;
3923     		if (comma)
3924     			wlen = (comma-str)-pos;
3925     		else	wlen = (len-1)-pos;
3926     
3927     		if (strncmp(str, "noautodetect", wlen) == 0)
3928     			raid_setup_args.noautodetect = 1;
3929     		pos += wlen+1;
3930     	}
3931     	raid_setup_args.set = 1;
3932     	return 1;
3933     }
3934     
3935     int md__init md_run_setup(void)
3936     {
3937     	if (raid_setup_args.noautodetect)
3938     		printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
3939     	else
3940     		autostart_arrays();
3941     	md_setup_drive();
3942     	return 0;
3943     }
3944     
/* boot command line: "raid=" is handled by raid_setup(), "md=" by md_setup() */
__setup("raid=", raid_setup);
__setup("md=", md_setup);

/* built-in initialisation entry points: md_init() and md_run_setup() */
__initcall(md_init);
__initcall(md_run_setup);
3950     
3951     #else /* It is a MODULE */
3952     
/* Module entry point: hands over to md_init() and returns its result. */
int init_module (void)
{
	int rc = md_init();

	return rc;
}
3957     
3958     static void free_device_names(void)
3959     {
3960     	while (device_names.next != &device_names) {
3961     		struct list_head *tmp = device_names.next;
3962     		list_del(tmp);
3963     		kfree(tmp);
3964     	}
3965     }
3966     
3967     
/*
 * Module exit: stops the recovery thread and undoes the registrations
 * made in md_init() and md_geninit() (devfs directory handle, block
 * major, reboot notifier, sysctl table, "mdstat" proc entry, gendisk and
 * the per-major block tables), then frees the device_names list.
 */
void cleanup_module (void)
{
	md_unregister_thread(md_recovery_thread);
	devfs_unregister(devfs_handle);

	devfs_unregister_blkdev(MAJOR_NR,"md");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("mdstat", NULL);
#endif

	del_gendisk(&md_gendisk);

	/* detach the md tables from the global per-major arrays */
	blk_dev[MAJOR_NR].queue = NULL;
	blksize_size[MAJOR_NR] = NULL;
	blk_size[MAJOR_NR] = NULL;
	max_readahead[MAJOR_NR] = NULL;
	hardsect_size[MAJOR_NR] = NULL;
	
	free_device_names();

}
3991     #endif
3992     
/* Symbols of this file that are exported through MD_EXPORT_SYMBOL(). */
MD_EXPORT_SYMBOL(md_size);
MD_EXPORT_SYMBOL(register_md_personality);
MD_EXPORT_SYMBOL(unregister_md_personality);
MD_EXPORT_SYMBOL(partition_name);
MD_EXPORT_SYMBOL(md_error);
MD_EXPORT_SYMBOL(md_do_sync);
MD_EXPORT_SYMBOL(md_sync_acct);
MD_EXPORT_SYMBOL(md_done_sync);
MD_EXPORT_SYMBOL(md_recover_arrays);
MD_EXPORT_SYMBOL(md_register_thread);
MD_EXPORT_SYMBOL(md_unregister_thread);
MD_EXPORT_SYMBOL(md_update_sb);
MD_EXPORT_SYMBOL(md_wakeup_thread);
MD_EXPORT_SYMBOL(md_print_devices);
MD_EXPORT_SYMBOL(find_rdev_nr);
MD_EXPORT_SYMBOL(md_interrupt_thread);
MD_EXPORT_SYMBOL(mddev_map);
MD_EXPORT_SYMBOL(md_check_ordering);
MD_EXPORT_SYMBOL(get_spare);
4012     
4013