From:	 Ingo Molnar <mingo@elte.hu>
To:	 Linus Torvalds <torvalds@transmeta.com>
Subject: [patch] multipath RAID personality, 2.4.10-pre9
Date:	 Fri, 14 Sep 2001 12:01:44 +0200 (CEST)
Cc:	 Neil Brown <neilb@cse.unsw.edu.au>, <linux-raid@vger.kernel.org>,
	 <linux-kernel@vger.kernel.org>


the attached patches implement multipath IO for Linux in the form of a
sw-RAID personality. Multipath IO is the ability of certain devices to
address the same physical disk over multiple 'IO paths'. The code ensures
that such paths can be defined and handled at runtime, and that a
transparent failover to the backup path(s) happens if an IO error
arrives on the primary path.
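
failover can also be triggered by hand for testing: marking the active
path faulty via the existing SET_DISK_FAULTY md ioctl invokes the same
error handler. (the patch adds a HOT_GENERATE_ERROR ioctl for this
purpose as well, but the actual error injection behind it is disabled
for now.) a minimal user-space sketch - the device names below are
examples only:

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/stat.h>
	#include <linux/major.h>
	#include <linux/raid/md_u.h>

	int main(void)
	{
		struct stat s;
		int fd = open("/dev/md0", O_RDONLY);	/* the multipath array */

		/* component device whose IO path we want to fail: */
		if (fd < 0 || stat("/dev/sdb1", &s) < 0)
			return 1;

		/* the component's kdev_t is the ioctl argument: */
		if (ioctl(fd, SET_DISK_FAULTY, (unsigned long)s.st_rdev) < 0)
			perror("SET_DISK_FAULTY");
		return 0;
	}

the error handler marks the failed path as faulty, and the multipathd
thread then redirects any outstanding IO to one of the remaining
operational paths.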

the attached patches add the multipath RAID personality in two stages:

 - multipath-generic-2.4.10-A0, generic bits and enhancements to the MD
   framework. The patch also fixes some raid-hotadd/hotremove and
   autodetection bugs.

 - multipath-2.4.10-A0, the multipath personality itself.

multipath-generic-2.4.10-A0 is standalone and results in a functional MD
layer. The multipath-2.4.10-A0 patch must be applied on top of this.

the latest raidtools are needed to define multipath devices in
/etc/raidtab. RH 7.1 and other distributions ship this new raidtools
package, but the latest version can also be downloaded from:

	http://redhat.com/~mingo/raidtools/raidtools-20010914.tar.gz

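for illustration, a two-path setup could be declared in /etc/raidtab
roughly like this (an untested sketch - the device names are examples,
and note that the multipath personality needs no chunk-size):

	raiddev /dev/md0
		raid-level		multipath
		nr-raid-disks		1
		nr-spare-disks		1
		device			/dev/sdb1
		raid-disk		0
		device			/dev/sdc1
		spare-disk		0

the personality activates one path and keeps all further paths as spare
IO paths that are switched in on failover.
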
comments, reports welcome,

	Ingo

--- linux/include/linux/raid/md_k.h.orig	Fri Sep 14 10:00:16 2001
+++ linux/include/linux/raid/md_k.h	Fri Sep 14 10:37:37 2001
@@ -17,17 +17,18 @@
 
 #define MD_RESERVED       0UL
 #define LINEAR            1UL
-#define STRIPED           2UL
-#define RAID0             STRIPED
+#define RAID0             2UL
 #define RAID1             3UL
 #define RAID5             4UL
 #define TRANSLUCENT       5UL
 #define HSM               6UL
-#define MAX_PERSONALITY   7UL
+#define MULTIPATH         7UL
+#define MAX_PERSONALITY   8UL
 
 static inline int pers_to_level (int pers)
 {
 	switch (pers) {
+		case MULTIPATH:		return -4;
 		case HSM:		return -3;
 		case TRANSLUCENT:	return -2;
 		case LINEAR:		return -1;
@@ -35,7 +36,7 @@
 		case RAID1:		return 1;
 		case RAID5:		return 5;
 	}
-	panic("pers_to_level()");
+	BUG();
 	return MD_RESERVED;
 }
 
@@ -171,6 +172,7 @@
 	mdp_super_t *sb;
 	unsigned long sb_offset;
 
+	int alias_device;		/* device alias to the same disk */
 	int faulty;			/* if faulty do not issue IO requests */
 	int desc_nr;			/* descriptor index in the superblock */
 };
@@ -258,6 +260,7 @@
 
 extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev);
 extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
+extern mdp_disk_t *get_spare(mddev_t *mddev);
 
 /*
  * iterates through some rdev ringlist. It's safe to remove the
--- linux/include/linux/raid/md_u.h.orig	Tue Aug 21 14:26:04 2001
+++ linux/include/linux/raid/md_u.h	Fri Sep 14 10:03:30 2001
@@ -35,6 +35,7 @@
 #define PROTECT_ARRAY		_IO (MD_MAJOR, 0x27)
 #define HOT_ADD_DISK		_IO (MD_MAJOR, 0x28)
 #define SET_DISK_FAULTY		_IO (MD_MAJOR, 0x29)
+#define HOT_GENERATE_ERROR	_IO (MD_MAJOR, 0x2a)
 
 /* usage */
 #define RUN_ARRAY		_IOW (MD_MAJOR, 0x30, mdu_param_t)
--- linux/drivers/md/md.c.orig	Fri Sep 14 10:39:24 2001
+++ linux/drivers/md/md.c	Fri Sep 14 10:43:17 2001
@@ -644,10 +644,9 @@
 	bdev = bdget(rdev->dev);
 	if (bdev == NULL)
 		return -ENOMEM;
-	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FILE);
-	if (!err) {
+	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+	if (!err)
 		rdev->bdev = bdev;
-	}
 	return err;
 }
 
@@ -655,7 +654,7 @@
 {
 	if (!rdev->bdev)
 		MD_BUG();
-	blkdev_put(rdev->bdev, BDEV_FILE);
+	blkdev_put(rdev->bdev, BDEV_RAW);
 	bdput(rdev->bdev);
 	rdev->bdev = NULL;
 }
@@ -771,8 +770,10 @@
 		mdp_disk_t *desc;
 
 		desc = sb->disks + i;
-		printk("md:     D %2d: ", i);
-		print_desc(desc);
+		if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) {
+			printk("     D %2d: ", i);
+			print_desc(desc);
+		}
 	}
 	printk("md:     THIS: ");
 	print_desc(&sb->this_disk);
@@ -830,6 +831,7 @@
 
 	if (!tmp1 || !tmp2) {
 		ret = 0;
+		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
 		goto abort;
 	}
 
@@ -910,7 +912,7 @@
 	sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
 	if (rdev->sb_offset != sb_offset) {
 		printk("%s's sb offset has changed from %ld to %ld, skipping\n",
-		       partition_name(dev), rdev->sb_offset, sb_offset);
+			partition_name(dev), rdev->sb_offset, sb_offset);
 		goto skip;
 	}
 	/*
@@ -921,7 +923,7 @@
 	size = calc_dev_size(dev, rdev->mddev, 1);
 	if (size != rdev->size) {
 		printk("%s's size has changed from %ld to %ld since import, skipping\n",
-		       partition_name(dev), rdev->size, size);
+			partition_name(dev), rdev->size, size);
 		goto skip;
 	}
 
@@ -982,7 +984,7 @@
 	struct md_list_head *tmp;
 
 	ITERATE_RDEV(mddev,rdev,tmp) {
-		if (rdev->faulty)
+		if (rdev->faulty || rdev->alias_device)
 			continue;
 		sb = rdev->sb;
 		*sb = *mddev->sb;
@@ -1029,8 +1031,11 @@
 		printk("md: ");
 		if (rdev->faulty)
 			printk("(skipping faulty ");
+		if (rdev->alias_device)
+			printk("(skipping alias ");
+
 		printk("%s ", partition_name(rdev->dev));
-		if (!rdev->faulty) {
+		if (!rdev->faulty && !rdev->alias_device) {
 			printk("[events: %08lx]",
 				(unsigned long)rdev->sb->events_lo);
 			err += write_disk_sb(rdev);
@@ -1115,9 +1120,14 @@
 			goto abort_free;
 		}
 
-		rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
-					rdev->sb->this_disk.minor);
-		rdev->desc_nr = rdev->sb->this_disk.number;
+		if (rdev->sb->level != -4) {
+			rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
+						rdev->sb->this_disk.minor);
+			rdev->desc_nr = rdev->sb->this_disk.number;
+		} else {
+			rdev->old_dev = MKDEV(0, 0);
+			rdev->desc_nr = -1;
+		}
 	}
 	md_list_add(&rdev->all, &all_raid_disks);
 	MD_INIT_LIST_HEAD(&rdev->pending);
@@ -1157,7 +1167,7 @@
 
 static int analyze_sbs (mddev_t * mddev)
 {
-	int out_of_date = 0, i;
+	int out_of_date = 0, i, first;
 	struct md_list_head *tmp, *tmp2;
 	mdk_rdev_t *rdev, *rdev2, *freshest;
 	mdp_super_t *sb;
@@ -1251,7 +1261,7 @@
 	 */
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		/*
-		 * Kick all non-fresh devices faulty
+		 * Kick all non-fresh devices
 		 */
 		__u64 ev1, ev2;
 		ev1 = md_event(rdev->sb);
@@ -1269,9 +1279,10 @@
 	 * Fix up changed device names ... but only if this disk has a
 	 * recent update time. Use faulty checksum ones too.
 	 */
+	if (mddev->sb->level != -4)
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		__u64 ev1, ev2, ev3;
-		if (rdev->faulty) { /* REMOVEME */
+		if (rdev->faulty || rdev->alias_device) {
 			MD_BUG();
 			goto abort;
 		}
@@ -1280,7 +1291,7 @@
 		ev3 = ev2;
 		--ev3;
 		if ((rdev->dev != rdev->old_dev) &&
-		    ((ev1 == ev2) || (ev1 == ev3))) {
+			((ev1 == ev2) || (ev1 == ev3))) {
 			mdp_disk_t *desc;
 
 			printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev));
@@ -1319,8 +1330,13 @@
 
 		/*
 		 * We kick faulty devices/descriptors immediately.
+		 *
+		 * Note: multipath devices are a special case.  Since we
+		 * were able to read the superblock on the path, we don't
+		 * care if it was previously marked as faulty, it's up now
+		 * so enable it.
 		 */
-		if (disk_faulty(desc)) {
+		if (disk_faulty(desc) && mddev->sb->level != -4) {
 			found = 0;
 			ITERATE_RDEV(mddev,rdev,tmp) {
 				if (rdev->desc_nr != desc->number)
@@ -1339,6 +1355,15 @@
 			}
 			remove_descriptor(desc, sb);
 			continue;
+		} else if (disk_faulty(desc)) {
+			/*
+			 * multipath entry marked as faulty, unfaulty it
+			 */
+			rdev = find_rdev(mddev, dev);
+			if(rdev)
+				mark_disk_spare(desc);
+			else
+				remove_descriptor(desc, sb);
 		}
 
 		if (dev == MKDEV(0,0))
@@ -1348,6 +1373,17 @@
 		 */
 		found = 0;
 		ITERATE_RDEV(mddev,rdev,tmp) {
+			/*
+			 * Multi-path IO special-case: since we have no
+			 * this_disk descriptor at auto-detect time,
+			 * we cannot check rdev->number.
+			 * We can check the device though.
+			 */
+			if ((sb->level == -4) && (rdev->dev ==
+					MKDEV(desc->major,desc->minor))) {
+				found = 1;
+				break;
+			}
 			if (rdev->desc_nr == desc->number) {
 				found = 1;
 				break;
@@ -1364,6 +1400,7 @@
 	 * Double check wether all devices mentioned in the
 	 * superblock are in the rdev ring.
 	 */
+	first = 1;
 	for (i = 0; i < MD_SB_DISKS; i++) {
 		mdp_disk_t *desc;
 		kdev_t dev;
@@ -1384,35 +1421,63 @@
 			MD_BUG();
 			goto abort;
 		}
-	}
-
-	/*
-	 * Do a final reality check.
-	 */
-	ITERATE_RDEV(mddev,rdev,tmp) {
-		if (rdev->desc_nr == -1) {
-			MD_BUG();
-			goto abort;
-		}
 		/*
-		 * is the desc_nr unique?
+		 * In the case of Multipath-IO, we have no
+		 * other information source to find out which
+		 * disk is which, only the position of the device
+		 * in the superblock:
 		 */
-		ITERATE_RDEV(mddev,rdev2,tmp2) {
-			if ((rdev2 != rdev) &&
-					(rdev2->desc_nr == rdev->desc_nr)) {
+		if (mddev->sb->level == -4) {
+			if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
 				MD_BUG();
 				goto abort;
 			}
+			rdev->desc_nr = i;
+			if (!first)
+				rdev->alias_device = 1;
+			else
+				first = 0;
 		}
-		/*
-		 * is the device unique?
-		 */
-		ITERATE_RDEV(mddev,rdev2,tmp2) {
-			if ((rdev2 != rdev) &&
-					(rdev2->dev == rdev->dev)) {
+	}
+ 
+	/*
+	 * Kick all rdevs that are not in the
+	 * descriptor array:
+	 */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->desc_nr == -1)
+			kick_rdev_from_array(rdev);
+	}
+ 
+	/*
+	 * Do a final reality check.
+	 */
+	if (mddev->sb->level != -4) {
+		ITERATE_RDEV(mddev,rdev,tmp) {
+			if (rdev->desc_nr == -1) {
 				MD_BUG();
 				goto abort;
 			}
+			/*
+			 * is the desc_nr unique?
+			 */
+			ITERATE_RDEV(mddev,rdev2,tmp2) {
+				if ((rdev2 != rdev) &&
+						(rdev2->desc_nr == rdev->desc_nr)) {
+					MD_BUG();
+					goto abort;
+				}
+			}
+			/*
+			 * is the device unique?
+			 */
+			ITERATE_RDEV(mddev,rdev2,tmp2) {
+				if ((rdev2 != rdev) &&
+						(rdev2->dev == rdev->dev)) {
+					MD_BUG();
+					goto abort;
+				}
+			}
 		}
 	}
 
@@ -1473,6 +1538,9 @@
 	}
 
 	switch (sb->level) {
+		case -4:
+			data_disks = 1;
+			break;
 		case -3:
 			data_disks = 1;
 			break;
@@ -1507,6 +1575,7 @@
 		if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
 			readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
 	} else {
+		// (no multipath branch - it uses the default setting)
 		if (sb->level == -3)
 			readahead = 0;
 	}
@@ -1569,38 +1638,41 @@
 	mddev->param.chunk_size = chunk_size;
 	mddev->param.personality = pnum;
 
-	if (chunk_size > MAX_CHUNK_SIZE) {
-		printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
-		return -EINVAL;
-	}
-	/*
-	 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
-	 */
-	if ( (1 << ffz(~chunk_size)) != chunk_size) {
-		MD_BUG();
-		return -EINVAL;
-	}
-	if (chunk_size < PAGE_SIZE) {
-		printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
-		return -EINVAL;
-	}
+	if ((pnum != MULTIPATH) && (pnum != RAID1) && (pnum != LINEAR)) {
+		if (!chunk_size) {
+			/*
+			 * 'default chunksize' in the old md code used to
+			 * be PAGE_SIZE, baaad.
+			 * we abort here to be on the safe side. We dont
+			 * want to continue the bad practice.
+			 */
+			printk(BAD_CHUNKSIZE);
+			return -EINVAL;
+		}
+		if (chunk_size > MAX_CHUNK_SIZE) {
+			printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+			return -EINVAL;
+		}
+		/*
+		 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
+		 */
+		if ( (1 << ffz(~chunk_size)) != chunk_size) {
+			MD_BUG();
+			return -EINVAL;
+		}
+		if (chunk_size < PAGE_SIZE) {
+			printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+			return -EINVAL;
+		}
+	} else
+		if (chunk_size)
+			printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
 
 	if (pnum >= MAX_PERSONALITY) {
 		MD_BUG();
 		return -EINVAL;
 	}
 
-	if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) {
-		/*
-		 * 'default chunksize' in the old md code used to
-		 * be PAGE_SIZE, baaad.
-		 * we abort here to be on the safe side. We dont
-		 * want to continue the bad practice.
-		 */
-		printk(BAD_CHUNKSIZE);
-		return -EINVAL;
-	}
-
 	if (!pers[pnum])
 	{
 #ifdef CONFIG_KMOD
@@ -1609,7 +1681,11 @@
 		request_module (module_name);
 		if (!pers[pnum])
 #endif
+		{
+			printk(KERN_ERR "md.c: personality %d is not loaded!\n",
+				pnum);
 			return -EINVAL;
+		}
 	}
 
 	if (device_size_calculation(mddev))
@@ -1627,7 +1703,7 @@
 			continue;
 		invalidate_device(rdev->dev, 1);
 		if (get_hardsect_size(rdev->dev)
-		    > md_hardsect_sizes[mdidx(mddev)]) 
+			> md_hardsect_sizes[mdidx(mddev)]) 
 			md_hardsect_sizes[mdidx(mddev)] =
 				get_hardsect_size(rdev->dev);
 	}
@@ -1652,7 +1728,7 @@
 	 */
 	md_hd_struct[mdidx(mddev)].start_sect = 0;
 	register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
-		      1, &md_fops, md_size[mdidx(mddev)]<<1);
+			1, &md_fops, md_size[mdidx(mddev)]<<1);
 
 	read_ahead[MD_MAJOR] = 1024;
 	return (0);
@@ -1688,8 +1764,11 @@
 		md_recover_arrays();
 		if (mddev->pers->restart_resync)
 			mddev->pers->restart_resync(mddev);
-	} else
+	} else {
+		printk (KERN_ERR "md.c: md%d has no personality assigned.\n",
+			mdidx(mddev));
 		err = -EINVAL;
+	}
 
 out:
 	return err;
@@ -1810,7 +1889,7 @@
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		printk("<%s>", partition_name(rdev->dev));
 	}
-	printk("\nmd: now!\n");
+	printk("\n");
 
 	err = do_md_run (mddev);
 	if (err) {
@@ -2021,8 +2100,10 @@
 {
 	mdu_array_info_t info;
 
-	if (!mddev->sb)
+	if (!mddev->sb) {
+		MD_BUG();
 		return -EINVAL;
+	}
 
 	SET_FROM_SB(major_version);
 	SET_FROM_SB(minor_version);
@@ -2109,7 +2190,7 @@
 		}
 		if (mddev->nb_dev) {
 			mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
-							  mdk_rdev_t, same_set);
+							mdk_rdev_t, same_set);
 			if (!uuid_equal(rdev0, rdev)) {
 				printk("md: %s has different UUID to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev));
 				export_rdev(rdev);
@@ -2126,8 +2207,11 @@
 	}
 
 	nr = info->number;
-	if (nr >= mddev->sb->nr_disks)
+	if (nr >= mddev->sb->nr_disks) {
+		MD_BUG();
 		return -EINVAL;
+	}
+
 
 	SET_SB(number);
 	SET_SB(major);
@@ -2155,8 +2239,6 @@
 		persistent = !mddev->sb->not_persistent;
 		if (!persistent)
 			printk("md: nonpersistent superblock ...\n");
-		if (!mddev->sb->chunk_size)
-			printk("md: no chunksize?\n");
 
 		size = calc_dev_size(dev, mddev, persistent);
 		rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
@@ -2174,6 +2256,43 @@
 }
 #undef SET_SB
 
+static int hot_generate_error (mddev_t * mddev, kdev_t dev)
+{
+	struct request_queue *q;
+	mdk_rdev_t *rdev;
+	mdp_disk_t *disk;
+ 
+	if (!mddev->pers)
+		return -ENODEV;
+ 
+	printk("trying to generate %s error in md%d ... \n",
+		partition_name(dev), mdidx(mddev));
+ 
+	rdev = find_rdev(mddev, dev);
+	if (!rdev) {
+		MD_BUG();
+		return -ENXIO;
+	}
+ 
+	if (rdev->desc_nr == -1) {
+		MD_BUG();
+		return -EINVAL;
+	}
+	disk = &mddev->sb->disks[rdev->desc_nr];
+	if (!disk_active(disk))
+		return -ENODEV;
+ 
+	q = blk_get_queue(rdev->dev);
+	if (!q) {
+		MD_BUG();
+		return -ENODEV;
+	}
+	printk("okay, generating error!\n");
+//	q->oneshot_error = 1; // disabled for now
+ 
+	return 0;
+}
+
 static int hot_remove_disk (mddev_t * mddev, kdev_t dev)
 {
 	int err;
@@ -2201,16 +2320,20 @@
 		return -EINVAL;
 	}
 	disk = &mddev->sb->disks[rdev->desc_nr];
-	if (disk_active(disk))
+	if (disk_active(disk)) {
+		MD_BUG();
 		goto busy;
+	}
 	if (disk_removed(disk)) {
 		MD_BUG();
 		return -EINVAL;
 	}
 	
 	err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
-	if (err == -EBUSY)
+	if (err == -EBUSY) {
+		MD_BUG();
 		goto busy;
+	}
 	if (err) {
 		MD_BUG();
 		return -EINVAL;
@@ -2425,7 +2548,6 @@
 {
 	int ret;
 
-	fsync_dev(mddev_to_kdev(mddev));
 	ret = md_error(mddev, dev);
 	return ret;
 }
@@ -2444,8 +2566,10 @@
 
 	dev = inode->i_rdev;
 	minor = MINOR(dev);
-	if (minor >= MAX_MD_DEVS)
+	if (minor >= MAX_MD_DEVS) {
+		MD_BUG();
 		return -EINVAL;
+	}
 
 	/*
 	 * Commands dealing with the RAID driver but not any
@@ -2469,16 +2593,17 @@
 			goto done;
 #endif
 
-		case BLKGETSIZE:   /* Return device size */
+		case BLKGETSIZE:	/* Return device size */
 			if (!arg) {
 				err = -EINVAL;
+				MD_BUG();
 				goto abort;
 			}
 			err = md_put_user(md_hd_struct[minor].nr_sects,
 						(long *) arg);
 			goto done;
 
-		case BLKGETSIZE64:   /* Return device size */
+		case BLKGETSIZE64:	/* Return device size */
 			err = md_put_user((u64)md_hd_struct[minor].nr_sects << 9,
 						(u64 *) arg);
 			goto done;
@@ -2533,7 +2658,7 @@
 
 			if (mddev->sb) {
 				printk("md: array md%d already has a superblock!\n",
-				       mdidx(mddev));
+					mdidx(mddev));
 				err = -EBUSY;
 				goto abort_unlock;
 			}
@@ -2662,6 +2787,9 @@
 				err = add_new_disk(mddev, &info);
 			goto done_unlock;
 		}
+		case HOT_GENERATE_ERROR:
+			err = hot_generate_error(mddev, (kdev_t)arg);
+			goto done_unlock;
 		case HOT_REMOVE_DISK:
 			err = hot_remove_disk(mddev, (kdev_t)arg);
 			goto done_unlock;
@@ -2807,7 +2935,8 @@
 		remove_wait_queue(&thread->wqueue, &wait);
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 
-		if ((run=thread->run)) {
+		run = thread->run;
+		if (run) {
 			run(thread->data);
 			run_task_queue(&tq_disk);
 		}
@@ -2905,7 +3034,7 @@
 	if (!rrdev || rrdev->faulty)
 		return 0;
 	if (mddev->pers->error_handler == NULL
-	    || mddev->pers->error_handler(mddev,rdev) <= 0) {
+			|| mddev->pers->error_handler(mddev,rdev) <= 0) {
 		free_disk_sb(rrdev);
 		rrdev->faulty = 1;
 	} else
@@ -2954,7 +3083,7 @@
 	unsigned long max_blocks, resync, res, dt, db, rt;
 
 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
-	max_blocks = mddev->sb->size;
+	max_blocks = mddev->sb->size << 1;
 
 	/*
 	 * Should not happen.
@@ -3081,28 +3210,34 @@
 
 int register_md_personality (int pnum, mdk_personality_t *p)
 {
-	if (pnum >= MAX_PERSONALITY)
+	if (pnum >= MAX_PERSONALITY) {
+		MD_BUG();
 		return -EINVAL;
+	}
 
-	if (pers[pnum])
+	if (pers[pnum]) {
+		MD_BUG();
 		return -EBUSY;
+	}
 
 	pers[pnum] = p;
-	printk(KERN_INFO "md: %s personality registered\n", p->name);
+	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
 	return 0;
 }
 
 int unregister_md_personality (int pnum)
 {
-	if (pnum >= MAX_PERSONALITY)
+	if (pnum >= MAX_PERSONALITY) {
+		MD_BUG();
 		return -EINVAL;
+	}
 
 	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
 	pers[pnum] = NULL;
 	return 0;
 }
 
-static mdp_disk_t *get_spare(mddev_t *mddev)
+mdp_disk_t *get_spare(mddev_t *mddev)
 {
 	mdp_super_t *sb = mddev->sb;
 	mdp_disk_t *disk;
@@ -3434,7 +3569,7 @@
 	mddev_t *mddev;
 
 	if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
-				  || (code == MD_SYS_POWER_OFF)) {
+					|| (code == MD_SYS_POWER_OFF)) {
 
 		printk(KERN_INFO "md: stopping all md devices.\n");
 
@@ -3565,7 +3700,7 @@
 
 		if (md_import_device(dev,1)) {
 			printk(KERN_ALERT "md: could not import %s!\n",
-			       partition_name(dev));
+				partition_name(dev));
 			continue;
 		}
 		/*
@@ -3634,7 +3769,7 @@
 	case 2: /* could be 0 or -1.. */
 		if (level == 0 || level == -1) {
 			if (get_option(&str, &factor) != 2 ||	/* Chunk Size */
-			    get_option(&str, &fault) != 2) {
+					get_option(&str, &fault) != 2) {
 				printk("md: Too few arguments supplied to md=.\n");
 				return 0;
 			}
@@ -3698,7 +3833,7 @@
 
 			dev = name_to_kdev_t(devname);
 			handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
-						    DEVFS_SPECIAL_BLK, 1);
+							DEVFS_SPECIAL_BLK, 1);
 			if (handle != 0) {
 				unsigned major, minor;
 				devfs_get_maj_min(handle, &major, &minor);
@@ -3876,4 +4011,5 @@
 MD_EXPORT_SYMBOL(md_interrupt_thread);
 MD_EXPORT_SYMBOL(mddev_map);
 MD_EXPORT_SYMBOL(md_check_ordering);
+MD_EXPORT_SYMBOL(get_spare);
 
--- linux/include/linux/raid/multipath.h.orig	Fri Sep 14 10:03:30 2001
+++ linux/include/linux/raid/multipath.h	Fri Sep 14 10:59:41 2001
@@ -0,0 +1,87 @@
+#ifndef _MULTIPATH_H
+#define _MULTIPATH_H
+
+#include <linux/raid/md.h>
+
+struct multipath_info {
+	int		number;
+	int		raid_disk;
+	kdev_t		dev;
+	int		sect_limit;
+	int		head_position;
+
+	/*
+	 * State bits:
+	 */
+	int		operational;
+	int		write_only;
+	int		spare;
+
+	int		used_slot;
+};
+
+struct multipath_private_data {
+	mddev_t			*mddev;
+	struct multipath_info	multipaths[MD_SB_DISKS];
+	int			nr_disks;
+	int			raid_disks;
+	int			working_disks;
+	mdk_thread_t		*thread;
+	struct multipath_info	*spare;
+	md_spinlock_t		device_lock;
+
+	/* buffer pool */
+	/* buffer_heads that we have pre-allocated have b_pprev -> &freebh
+	 * and are linked into a stack using b_next
+	 * multipath_bh that are pre-allocated have MPBH_PreAlloc set.
+	 * All these variable are protected by device_lock
+	 */
+	struct buffer_head	*freebh;
+	int			freebh_cnt;	/* how many are on the list */
+	struct multipath_bh	*freer1;
+	struct multipath_bh	*freebuf; 	/* each bh_req has a page allocated */
+	md_wait_queue_head_t	wait_buffer;
+
+	/* for use when syncing multipaths: */
+	unsigned long	start_active, start_ready,
+		start_pending, start_future;
+	int	cnt_done, cnt_active, cnt_ready,
+		cnt_pending, cnt_future;
+	int	phase;
+	int	window;
+	md_wait_queue_head_t	wait_done;
+	md_wait_queue_head_t	wait_ready;
+	md_spinlock_t		segment_lock;
+};
+
+typedef struct multipath_private_data multipath_conf_t;
+
+/*
+ * this is the only point in the RAID code where we violate
+ * C type safety. mddev->private is an 'opaque' pointer.
+ */
+#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private)
+
+/*
+ * this is our 'private' 'collective' MULTIPATH buffer head.
+ * it contains information about what kind of IO operations were started
+ * for this MULTIPATH operation, and about their status:
+ */
+
+struct multipath_bh {
+	atomic_t		remaining; /* 'have we finished' count,
+					    * used from IRQ handlers
+					    */
+	int			cmd;
+	unsigned long		state;
+	mddev_t			*mddev;
+	struct buffer_head	*master_bh;
+	struct buffer_head	*multipath_bh_list;
+	struct buffer_head	bh_req;
+	struct multipath_bh	*next_r1;	/* next for retry or in free list */
+};
+/* bits for multipath_bh.state */
+#define	MPBH_Uptodate	1
+#define	MPBH_SyncPhase	2
+#define	MPBH_PreAlloc	3	/* this was pre-allocated, add to free list */
+#endif
--- linux/drivers/md/Config.in.orig	Tue Aug 21 14:26:03 2001
+++ linux/drivers/md/Config.in	Fri Sep 14 10:03:26 2001
@@ -11,6 +11,7 @@
 dep_tristate '  RAID-0 (striping) mode' CONFIG_MD_RAID0 $CONFIG_BLK_DEV_MD
 dep_tristate '  RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD
 dep_tristate '  RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD
+dep_tristate '  Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
 
 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
 
--- linux/drivers/md/Makefile.orig	Fri Dec 29 23:07:22 2000
+++ linux/drivers/md/Makefile	Fri Sep 14 10:03:26 2001
@@ -17,6 +17,7 @@
 obj-$(CONFIG_MD_RAID0)		+= raid0.o
 obj-$(CONFIG_MD_RAID1)		+= raid1.o
 obj-$(CONFIG_MD_RAID5)		+= raid5.o xor.o
+obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_BLK_DEV_MD)	+= md.o
 obj-$(CONFIG_BLK_DEV_LVM)	+= lvm-mod.o
 
--- linux/drivers/md/multipath.c.orig	Fri Sep 14 10:03:28 2001
+++ linux/drivers/md/multipath.c	Fri Sep 14 10:03:28 2001
@@ -0,0 +1,1261 @@
+/*
+ * multipath.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * MULTIPATH management functions.
+ *
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/raid/multipath.h>
+#include <asm/atomic.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define MD_PERSONALITY
+
+#define MAX_WORK_PER_DISK 128
+
+/*
+ * The following can be used to debug the driver
+ */
+#define MULTIPATH_DEBUG	0
+
+#if MULTIPATH_DEBUG
+#define PRINTK(x...)   printk(x)
+#define inline
+#define __inline__
+#else
+#define PRINTK(x...)  do { } while (0)
+#endif
+
+
+static mdk_personality_t multipath_personality;
+static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
+struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail;
+
+static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state);
+
+struct buffer_head *multipath_alloc_bh(multipath_conf_t *conf, int cnt)
+{
+	/* return a linked list of "cnt" struct buffer_heads.
+	 * don't take any off the free list unless we know we can
+	 * get all we need, otherwise we could deadlock
+	 */
+	struct buffer_head *bh=NULL;
+
+	while(cnt) {
+		struct buffer_head *t;
+		md_spin_lock_irq(&conf->device_lock);
+		if (conf->freebh_cnt >= cnt)
+			while (cnt) {
+				t = conf->freebh;
+				conf->freebh = t->b_next;
+				t->b_next = bh;
+				bh = t;
+				t->b_state = 0;
+				conf->freebh_cnt--;
+				cnt--;
+			}
+		md_spin_unlock_irq(&conf->device_lock);
+		if (cnt == 0)
+			break;
+		t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_NOIO);
+		if (t) {
+			memset(t, 0, sizeof(*t));
+			t->b_next = bh;
+			bh = t;
+			cnt--;
+		} else {
+			PRINTK("waiting for %d bh\n", cnt);
+			wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
+		}
+	}
+	return bh;
+}
+
+static inline void multipath_free_bh(multipath_conf_t *conf, struct buffer_head *bh)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->device_lock, flags);
+	while (bh) {
+		struct buffer_head *t = bh;
+		bh=bh->b_next;
+		if (t->b_pprev == NULL)
+			kfree(t);
+		else {
+			t->b_next= conf->freebh;
+			conf->freebh = t;
+			conf->freebh_cnt++;
+		}
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	wake_up(&conf->wait_buffer);
+}
+
+static int multipath_grow_bh(multipath_conf_t *conf, int cnt)
+{
+	/* allocate cnt buffer_heads, possibly less if kalloc fails */
+	int i = 0;
+
+	while (i < cnt) {
+		struct buffer_head *bh;
+		bh = kmalloc(sizeof(*bh), GFP_KERNEL);
+		if (!bh) break;
+		memset(bh, 0, sizeof(*bh));
+
+		md_spin_lock_irq(&conf->device_lock);
+		bh->b_pprev = &conf->freebh;
+		bh->b_next = conf->freebh;
+		conf->freebh = bh;
+		conf->freebh_cnt++;
+		md_spin_unlock_irq(&conf->device_lock);
+
+		i++;
+	}
+	return i;
+}
+
+static int multipath_shrink_bh(multipath_conf_t *conf, int cnt)
+{
+	/* discard cnt buffer_heads, if we can find them */
+	int i = 0;
+
+	md_spin_lock_irq(&conf->device_lock);
+	while ((i < cnt) && conf->freebh) {
+		struct buffer_head *bh = conf->freebh;
+		conf->freebh = bh->b_next;
+		kfree(bh);
+		i++;
+		conf->freebh_cnt--;
+	}
+	md_spin_unlock_irq(&conf->device_lock);
+	return i;
+}
+		
+
+static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf)
+{
+	struct multipath_bh *r1_bh = NULL;
+
+	do {
+		md_spin_lock_irq(&conf->device_lock);
+		if (conf->freer1) {
+			r1_bh = conf->freer1;
+			conf->freer1 = r1_bh->next_r1;
+			r1_bh->next_r1 = NULL;
+			r1_bh->state = 0;
+			r1_bh->bh_req.b_state = 0;
+		}
+		md_spin_unlock_irq(&conf->device_lock);
+		if (r1_bh)
+			return r1_bh;
+		r1_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh),
+					GFP_NOIO);
+		if (r1_bh) {
+			memset(r1_bh, 0, sizeof(*r1_bh));
+			return r1_bh;
+		}
+		wait_event(conf->wait_buffer, conf->freer1);
+	} while (1);
+}
+
+static inline void multipath_free_mpbh(struct multipath_bh *r1_bh)
+{
+	struct buffer_head *bh = r1_bh->multipath_bh_list;
+	multipath_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+
+	r1_bh->multipath_bh_list = NULL;
+
+	if (test_bit(MPBH_PreAlloc, &r1_bh->state)) {
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		r1_bh->next_r1 = conf->freer1;
+		conf->freer1 = r1_bh;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+	} else {
+		kfree(r1_bh);
+	}
+	multipath_free_bh(conf, bh);
+}
+
+static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt)
+{
+	int i = 0;
+
+	while (i < cnt) {
+		struct multipath_bh *r1_bh;
+		r1_bh = (struct multipath_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+		if (!r1_bh)
+			break;
+		memset(r1_bh, 0, sizeof(*r1_bh));
+
+		md_spin_lock_irq(&conf->device_lock);
+		set_bit(MPBH_PreAlloc, &r1_bh->state);
+		r1_bh->next_r1 = conf->freer1;
+		conf->freer1 = r1_bh;
+		md_spin_unlock_irq(&conf->device_lock);
+
+		i++;
+	}
+	return i;
+}
+
+static void multipath_shrink_mpbh(multipath_conf_t *conf)
+{
+	md_spin_lock_irq(&conf->device_lock);
+	while (conf->freer1) {
+		struct multipath_bh *r1_bh = conf->freer1;
+		conf->freer1 = r1_bh->next_r1;
+		kfree(r1_bh);
+	}
+	md_spin_unlock_irq(&conf->device_lock);
+}
+
+
+
+static inline void multipath_free_buf(struct multipath_bh *r1_bh)
+{
+	unsigned long flags;
+	struct buffer_head *bh = r1_bh->multipath_bh_list;
+	multipath_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+	r1_bh->multipath_bh_list = NULL;
+	
+	spin_lock_irqsave(&conf->device_lock, flags);
+	r1_bh->next_r1 = conf->freebuf;
+	conf->freebuf = r1_bh;
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	multipath_free_bh(conf, bh);
+}
+
+static int multipath_map (mddev_t *mddev, kdev_t *rdev)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	int i, disks = MD_SB_DISKS;
+
+	/*
+	 * Later we do read balancing on the read side 
+	 * now we use the first available disk.
+	 */
+
+	for (i = 0; i < disks; i++) {
+		if (conf->multipaths[i].operational) {
+			*rdev = conf->multipaths[i].dev;
+			return (0);
+		}
+	}
+
+	printk (KERN_ERR "multipath_map(): no more operational IO paths?\n");
+	return (-1);
+}
+
+static void multipath_reschedule_retry (struct multipath_bh *r1_bh)
+{
+	unsigned long flags;
+	mddev_t *mddev = r1_bh->mddev;
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+
+	md_spin_lock_irqsave(&retry_list_lock, flags);
+	if (multipath_retry_list == NULL)
+		multipath_retry_tail = &multipath_retry_list;
+	*multipath_retry_tail = r1_bh;
+	multipath_retry_tail = &r1_bh->next_r1;
+	r1_bh->next_r1 = NULL;
+	md_spin_unlock_irqrestore(&retry_list_lock, flags);
+	md_wakeup_thread(conf->thread);
+}
+
+
+static void inline io_request_done(unsigned long sector, multipath_conf_t *conf, int phase)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->segment_lock, flags);
+	if (sector < conf->start_active)
+		conf->cnt_done--;
+	else if (sector >= conf->start_future && conf->phase == phase)
+		conf->cnt_future--;
+	else if (!--conf->cnt_pending)
+		wake_up(&conf->wait_ready);
+
+	spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+static void inline sync_request_done (unsigned long sector, multipath_conf_t *conf)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&conf->segment_lock, flags);
+	if (sector >= conf->start_ready)
+		--conf->cnt_ready;
+	else if (sector >= conf->start_active) {
+		if (!--conf->cnt_active) {
+			conf->start_active = conf->start_ready;
+			wake_up(&conf->wait_done);
+		}
+	}
+	spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+/*
+ * multipath_end_bh_io() is called when we have finished servicing a multipathed
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void multipath_end_bh_io (struct multipath_bh *r1_bh, int uptodate)
+{
+	struct buffer_head *bh = r1_bh->master_bh;
+
+	io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
+			test_bit(MPBH_SyncPhase, &r1_bh->state));
+
+	bh->b_end_io(bh, uptodate);
+	multipath_free_mpbh(r1_bh);
+}
+
+void multipath_end_request (struct buffer_head *bh, int uptodate)
+{
+	struct multipath_bh * r1_bh = (struct multipath_bh *)(bh->b_private);
+
+	/*
+	 * this branch is our 'one multipath IO has finished' event handler:
+	 */
+	if (!uptodate)
+		md_error (r1_bh->mddev, bh->b_dev);
+	else
+		/*
+		 * Set MPBH_Uptodate in our master buffer_head, so that
+		 * we will return a good error code to the higher
+		 * levels even if IO on some other multipathed buffer fails.
+		 *
+		 * The 'master' represents the complex operation to 
+		 * user-side. So if something waits for IO, then it will
+		 * wait for the 'master' buffer_head.
+		 */
+		set_bit (MPBH_Uptodate, &r1_bh->state);
+
+		
+	if (uptodate) {
+		multipath_end_bh_io(r1_bh, uptodate);
+		return;
+	}
+	/*
+	 * oops, IO error:
+	 */
+	printk(KERN_ERR "multipath: %s: rescheduling block %lu\n", 
+		 partition_name(bh->b_dev), bh->b_blocknr);
+	multipath_reschedule_retry(r1_bh);
+	return;
+}
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. It bookkeeps the last read position for every disk
+ * in array and when new read requests come, the disk which last
+ * position is nearest to the request, is chosen.
+ *
+ * TODO: now if there are 2 multipaths in the same 2 devices, performance
+ * degrades dramatically because position is multipath, not device based.
+ * This should be changed to be device based. Also atomic sequential
+ * reads should be somehow balanced.
+ */
+
+static int multipath_read_balance (multipath_conf_t *conf)
+{
+	int disk;
+
+	for (disk = 0; disk < conf->raid_disks; disk++)	
+		if (conf->multipaths[disk].operational)
+			return disk;
+	BUG();
+	return 0;
+}
+
+static int multipath_make_request (mddev_t *mddev, int rw,
+			       struct buffer_head * bh)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	struct buffer_head *bh_req;
+	struct multipath_bh * r1_bh;
+	struct multipath_info *multipath;
+
+	if (!buffer_locked(bh))
+		BUG();
+	
+/*
+ * make_request() can abort the operation when READA is being
+ * used and no empty request is available.
+ *
+ * Currently, just replace the command with READ/WRITE.
+ */
+	if (rw == READA)
+		rw = READ;
+
+	r1_bh = multipath_alloc_mpbh (conf);
+
+	spin_lock_irq(&conf->segment_lock);
+	wait_event_lock_irq(conf->wait_done,
+			bh->b_rsector < conf->start_active ||
+			bh->b_rsector >= conf->start_future,
+			conf->segment_lock);
+	if (bh->b_rsector < conf->start_active) 
+		conf->cnt_done++;
+	else {
+		conf->cnt_future++;
+		if (conf->phase)
+			set_bit(MPBH_SyncPhase, &r1_bh->state);
+	}
+	spin_unlock_irq(&conf->segment_lock);
+	
+	/*
+	 * i think the read and write branch should be separated completely,
+	 * since we want to do read balancing on the read side for example.
+	 * Alternative implementations? :) --mingo
+	 */
+
+	r1_bh->master_bh = bh;
+	r1_bh->mddev = mddev;
+	r1_bh->cmd = rw;
+
+	/*
+	 * read balancing logic:
+	 */
+	multipath = conf->multipaths + multipath_read_balance(conf);
+
+	bh_req = &r1_bh->bh_req;
+	memcpy(bh_req, bh, sizeof(*bh));
+	bh_req->b_blocknr = bh->b_rsector;
+	bh_req->b_dev = multipath->dev;
+	bh_req->b_rdev = multipath->dev;
+/*	bh_req->b_rsector = bh->n_rsector; */
+	bh_req->b_end_io = multipath_end_request;
+	bh_req->b_private = r1_bh;
+	generic_make_request (rw, bh_req);
+	return 0;
+}
+
+static int multipath_status (char *page, mddev_t *mddev)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	int sz = 0, i;
+	
+	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
+						 conf->working_disks);
+	for (i = 0; i < conf->raid_disks; i++)
+		sz += sprintf (page+sz, "%s",
+			conf->multipaths[i].operational ? "U" : "_");
+	sz += sprintf (page+sz, "]");
+	return sz;
+}
+
+#define LAST_DISK KERN_ALERT \
+"multipath: only one IO path left and IO error.\n"
+
+#define NO_SPARE_DISK KERN_ALERT \
+"multipath: no spare IO path left!\n"
+
+#define DISK_FAILED KERN_ALERT \
+"multipath: IO failure on %s, disabling IO path. \n" \
+"	Operation continuing on %d IO paths.\n"
+
+static void mark_disk_bad (mddev_t *mddev, int failed)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	struct multipath_info *multipath = conf->multipaths+failed;
+	mdp_super_t *sb = mddev->sb;
+
+	multipath->operational = 0;
+	mark_disk_faulty(sb->disks+multipath->number);
+	mark_disk_nonsync(sb->disks+multipath->number);
+	mark_disk_inactive(sb->disks+multipath->number);
+	sb->active_disks--;
+	sb->working_disks--;
+	sb->failed_disks++;
+	mddev->sb_dirty = 1;
+	md_wakeup_thread(conf->thread);
+	conf->working_disks--;
+	printk (DISK_FAILED, partition_name (multipath->dev),
+				 conf->working_disks);
+}
+
+/*
+ * Careful, this can execute in IRQ contexts as well!
+ */
+static int multipath_error (mddev_t *mddev, kdev_t dev)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	struct multipath_info * multipaths = conf->multipaths;
+	int disks = MD_SB_DISKS;
+	int other_paths = 1;
+	int i;
+
+	if (conf->working_disks == 1) {
+		other_paths = 0;
+		for (i = 0; i < disks; i++) {
+			if (multipaths[i].spare) {
+				other_paths = 1;
+				break;
+			}
+		}
+	}
+
+	if (!other_paths) {
+		/*
+		 * Uh oh, we can do nothing if this is our last path, but
+		 * first check if this is a queued request for a device
+		 * which has just failed.
+		 */
+		for (i = 0; i < disks; i++) {
+			if (multipaths[i].dev==dev && !multipaths[i].operational)
+				return 0;
+		}
+		printk (LAST_DISK);
+	} else {
+		/*
+		 * Mark disk as unusable
+		 */
+		for (i = 0; i < disks; i++) {
+			if (multipaths[i].dev==dev && multipaths[i].operational) {
+				mark_disk_bad(mddev, i);
+				break;
+			}
+		}
+		if (!conf->working_disks) {
+			int err = 1;
+			mdp_disk_t *spare;
+			mdp_super_t *sb = mddev->sb;
+
+//			MD_BUG();
+			spare = get_spare(mddev);
+			if (spare) {
+				err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE);
+				printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare));
+//				MD_BUG();
+			}
+			if (!err && !disk_faulty(spare)) {
+				multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+				mark_disk_sync(spare);
+				mark_disk_active(spare);
+				sb->active_disks++;
+				sb->spare_disks--;
+//				MD_BUG();
+			}
+		}
+	}
+	return 0;
+}
+
+#undef LAST_DISK
+#undef NO_SPARE_DISK
+#undef DISK_FAILED
+
+
+static void print_multipath_conf (multipath_conf_t *conf)
+{
+	int i;
+	struct multipath_info *tmp;
+
+	printk("MULTIPATH conf printout:\n");
+	if (!conf) {
+		printk("(conf==NULL)\n");
+		return;
+	}
+	printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
+			 conf->raid_disks, conf->nr_disks);
+
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		tmp = conf->multipaths + i;
+		if (tmp->spare || tmp->operational || tmp->number ||
+				tmp->raid_disk || tmp->used_slot)
+			printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
+				i, tmp->spare,tmp->operational,
+				tmp->number,tmp->raid_disk,tmp->used_slot,
+				partition_name(tmp->dev));
+	}
+}
+
+static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+{
+	int err = 0;
+	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
+	multipath_conf_t *conf = mddev->private;
+	struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
+	mdp_super_t *sb = mddev->sb;
+	mdp_disk_t *failed_desc, *spare_desc, *added_desc;
+	mdk_rdev_t *spare_rdev, *failed_rdev;
+
+	print_multipath_conf(conf);
+	md_spin_lock_irq(&conf->device_lock);
+	/*
+	 * find the disk ...
+	 */
+	switch (state) {
+
+	case DISKOP_SPARE_ACTIVE:
+
+		/*
+		 * Find the failed disk within the MULTIPATH configuration ...
+		 * (this can only be in the first conf->working_disks part)
+		 */
+		for (i = 0; i < conf->raid_disks; i++) {
+			tmp = conf->multipaths + i;
+			if ((!tmp->operational && !tmp->spare) ||
+					!tmp->used_slot) {
+				failed_disk = i;
+				break;
+			}
+		}
+		/*
+		 * When we activate a spare disk we _must_ have a disk in
+		 * the lower (active) part of the array to replace. 
+		 */
+		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		/* fall through */
+
+	case DISKOP_SPARE_WRITE:
+	case DISKOP_SPARE_INACTIVE:
+
+		/*
+		 * Find the spare disk ... (can only be in the 'high'
+		 * area of the array)
+		 */
+		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+			tmp = conf->multipaths + i;
+			if (tmp->spare && tmp->number == (*d)->number) {
+				spare_disk = i;
+				break;
+			}
+		}
+		if (spare_disk == -1) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		break;
+
+	case DISKOP_HOT_REMOVE_DISK:
+
+		for (i = 0; i < MD_SB_DISKS; i++) {
+			tmp = conf->multipaths + i;
+			if (tmp->used_slot && (tmp->number == (*d)->number)) {
+				if (tmp->operational) {
+					printk(KERN_ERR "hot-remove-disk, slot %d is identified to be the requested disk (number %d), but is still operational!\n", i, (*d)->number);
+					err = -EBUSY;
+					goto abort;
+				}
+				removed_disk = i;
+				break;
+			}
+		}
+		if (removed_disk == -1) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		break;
+
+	case DISKOP_HOT_ADD_DISK:
+
+		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+			tmp = conf->multipaths + i;
+			if (!tmp->used_slot) {
+				added_disk = i;
+				break;
+			}
+		}
+		if (added_disk == -1) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+		break;
+	}
+
+	switch (state) {
+	/*
+	 * Switch the spare disk to write-only mode:
+	 */
+	case DISKOP_SPARE_WRITE:
+		sdisk = conf->multipaths + spare_disk;
+		sdisk->operational = 1;
+		sdisk->write_only = 1;
+		break;
+	/*
+	 * Deactivate a spare disk:
+	 */
+	case DISKOP_SPARE_INACTIVE:
+		sdisk = conf->multipaths + spare_disk;
+		sdisk->operational = 0;
+		sdisk->write_only = 0;
+		break;
+	/*
+	 * Activate (mark read-write) the (now sync) spare disk,
+	 * which means we switch it's 'raid position' (->raid_disk)
+	 * with the failed disk. (only the first 'conf->nr_disks'
+	 * slots are used for 'real' disks and we must preserve this
+	 * property)
+	 */
+	case DISKOP_SPARE_ACTIVE:
+		sdisk = conf->multipaths + spare_disk;
+		fdisk = conf->multipaths + failed_disk;
+
+		spare_desc = &sb->disks[sdisk->number];
+		failed_desc = &sb->disks[fdisk->number];
+
+		if (spare_desc != *d) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		if (spare_desc->raid_disk != sdisk->raid_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+			
+		if (sdisk->raid_disk != spare_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		if (failed_desc->raid_disk != fdisk->raid_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		if (fdisk->raid_disk != failed_disk) {
+			MD_BUG();
+			err = 1;
+			goto abort;
+		}
+
+		/*
+		 * do the switch finally
+		 */
+		spare_rdev = find_rdev_nr(mddev, spare_desc->number);
+		failed_rdev = find_rdev_nr(mddev, failed_desc->number);
+		xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr);
+//		if (failed_rdev->alias_device)
+//			MD_BUG();
+//		if (!spare_rdev->alias_device)
+//			MD_BUG();
+		spare_rdev->alias_device = 0;
+		failed_rdev->alias_device = 1;
+
+		xchg_values(*spare_desc, *failed_desc);
+		xchg_values(*fdisk, *sdisk);
+
+		/*
+		 * (careful, 'failed' and 'spare' are switched from now on)
+		 *
+		 * we want to preserve linear numbering and we want to
+		 * give the proper raid_disk number to the now activated
+		 * disk. (this means we switch back these values)
+		 */
+	
+		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
+		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
+		xchg_values(spare_desc->number, failed_desc->number);
+		xchg_values(sdisk->number, fdisk->number);
+
+		*d = failed_desc;
+
+		if (sdisk->dev == MKDEV(0,0))
+			sdisk->used_slot = 0;
+		/*
+		 * this really activates the spare.
+		 */
+		fdisk->spare = 0;
+		fdisk->write_only = 0;
+
+		/*
+		 * if we activate a spare, we definitely replace a
+		 * non-operational disk slot in the 'low' area of
+		 * the disk array.
+		 */
+
+		conf->working_disks++;
+
+		break;
+
+	case DISKOP_HOT_REMOVE_DISK:
+		rdisk = conf->multipaths + removed_disk;
+
+		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
+			MD_BUG();	
+			err = 1;
+			goto abort;
+		}
+		rdisk->dev = MKDEV(0,0);
+		rdisk->used_slot = 0;
+		conf->nr_disks--;
+		break;
+
+	case DISKOP_HOT_ADD_DISK:
+		adisk = conf->multipaths + added_disk;
+		added_desc = *d;
+
+		if (added_disk != added_desc->number) {
+			MD_BUG();	
+			err = 1;
+			goto abort;
+		}
+
+		adisk->number = added_desc->number;
+		adisk->raid_disk = added_desc->raid_disk;
+		adisk->dev = MKDEV(added_desc->major,added_desc->minor);
+
+		adisk->operational = 0;
+		adisk->write_only = 0;
+		adisk->spare = 1;
+		adisk->used_slot = 1;
+		adisk->head_position = 0;
+		conf->nr_disks++;
+
+		break;
+
+	default:
+		MD_BUG();	
+		err = 1;
+		goto abort;
+	}
+abort:
+	md_spin_unlock_irq(&conf->device_lock);
+
+	print_multipath_conf(conf);
+	return err;
+}
+
+
+#define IO_ERROR KERN_ALERT \
+"multipath: %s: unrecoverable IO read error for block %lu\n"
+
+#define REDIRECT_SECTOR KERN_ERR \
+"multipath: %s: redirecting sector %lu to another IO path\n"
+
+/*
+ * This is a kernel thread which:
+ *
+ *	1.	Retries failed read operations on working multipaths.
+ *	2.	Updates the raid superblock when problems are encountered.
+ *	3.	Performs writes following reads for array synchronising.
+ */
+
+static void multipathd (void *data)
+{
+	struct multipath_bh *r1_bh;
+	struct buffer_head *bh;
+	unsigned long flags;
+	mddev_t *mddev;
+	kdev_t dev;
+
+
+	for (;;) {
+		md_spin_lock_irqsave(&retry_list_lock, flags);
+		r1_bh = multipath_retry_list;
+		if (!r1_bh)
+			break;
+		multipath_retry_list = r1_bh->next_r1;
+		md_spin_unlock_irqrestore(&retry_list_lock, flags);
+
+		mddev = r1_bh->mddev;
+		if (mddev->sb_dirty) {
+			printk(KERN_INFO "dirty sb detected, updating.\n");
+			mddev->sb_dirty = 0;
+			md_update_sb(mddev);
+		}
+		bh = &r1_bh->bh_req;
+		dev = bh->b_dev;
+		
+		multipath_map (mddev, &bh->b_dev);
+		if (bh->b_dev == dev) {
+			printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
+			multipath_end_bh_io(r1_bh, 0);
+		} else {
+			printk (REDIRECT_SECTOR,
+				partition_name(bh->b_dev), bh->b_blocknr);
+			bh->b_rdev = bh->b_dev;
+			bh->b_rsector = bh->b_blocknr;
+			generic_make_request (r1_bh->cmd, bh);
+		}
+	}
+	md_spin_unlock_irqrestore(&retry_list_lock, flags);
+}
+#undef IO_ERROR
+#undef REDIRECT_SECTOR
+
+/*
+ * This will catch the scenario in which one of the multipaths was
+ * mounted as a normal device rather than as a part of a raid set.
+ *
+ * check_consistency is very personality-dependent, eg. RAID5 cannot
+ * do this check, it uses another method.
+ */
+static int __check_consistency (mddev_t *mddev, int row)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	int disks = MD_SB_DISKS;
+	kdev_t dev;
+	struct buffer_head *bh = NULL;
+	int i, rc = 0;
+	char *buffer = NULL;
+
+	for (i = 0; i < disks; i++) {
+		if (!conf->multipaths[i].operational)
+			continue;
+		printk("(checking disk %d)\n",i);
+		dev = conf->multipaths[i].dev;
+		set_blocksize(dev, 4096);
+		if ((bh = bread(dev, row / 4, 4096)) == NULL)
+			break;
+		if (!buffer) {
+			buffer = (char *) __get_free_page(GFP_KERNEL);
+			if (!buffer)
+				break;
+			memcpy(buffer, bh->b_data, 4096);
+		} else if (memcmp(buffer, bh->b_data, 4096)) {
+			rc = 1;
+			break;
+		}
+		bforget(bh);
+		fsync_dev(dev);
+		invalidate_buffers(dev);
+		bh = NULL;
+	}
+	if (buffer)
+		free_page((unsigned long) buffer);
+	if (bh) {
+		dev = bh->b_dev;
+		bforget(bh);
+		fsync_dev(dev);
+		invalidate_buffers(dev);
+	}
+	return rc;
+}
+
+static int check_consistency (mddev_t *mddev)
+{
+	if (__check_consistency(mddev, 0))
+/*
+ * we do not do this currently, as it's perfectly possible to
+ * have an inconsistent array when it's freshly created. Only
+ * newly written data has to be consistent.
+ */
+		return 0;
+
+	return 0;
+}
+
+#define INVALID_LEVEL KERN_WARNING \
+"multipath: md%d: raid level not set to multipath IO (%d)\n"
+
+#define NO_SB KERN_ERR \
+"multipath: disabled IO path %s (couldn't access raid superblock)\n"
+
+#define ERRORS KERN_ERR \
+"multipath: disabled IO path %s (errors detected)\n"
+
+#define NOT_IN_SYNC KERN_ERR \
+"multipath: making IO path %s a spare path (not in sync)\n"
+
+#define INCONSISTENT KERN_ERR \
+"multipath: disabled IO path %s (inconsistent descriptor)\n"
+
+#define ALREADY_RUNNING KERN_ERR \
+"multipath: disabled IO path %s (multipath %d already operational)\n"
+
+#define OPERATIONAL KERN_INFO \
+"multipath: device %s operational as IO path %d\n"
+
+#define MEM_ERROR KERN_ERR \
+"multipath: couldn't allocate memory for md%d\n"
+
+#define SPARE KERN_INFO \
+"multipath: spare IO path %s\n"
+
+#define NONE_OPERATIONAL KERN_ERR \
+"multipath: no operational IO paths for md%d\n"
+
+#define SB_DIFFERENCES KERN_ERR \
+"multipath: detected IO path differences!\n"
+
+#define ARRAY_IS_ACTIVE KERN_INFO \
+"multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n"
+
+#define THREAD_ERROR KERN_ERR \
+"multipath: couldn't allocate thread for md%d\n"
+
+static int multipath_run (mddev_t *mddev)
+{
+	multipath_conf_t *conf;
+	int i, j, disk_idx;
+	struct multipath_info *disk, *disk2;
+	mdp_super_t *sb = mddev->sb;
+	mdp_disk_t *desc, *desc2;
+	mdk_rdev_t *rdev, *def_rdev = NULL;
+	struct md_list_head *tmp;
+	int start_recovery = 0, num_rdevs = 0;
+
+	MOD_INC_USE_COUNT;
+
+	if (sb->level != -4) {
+		printk(INVALID_LEVEL, mdidx(mddev), sb->level);
+		goto out;
+	}
+	/*
+	 * copy the already verified devices into our private MULTIPATH
+	 * bookkeeping area. [whatever we allocate in multipath_run(),
+	 * should be freed in multipath_stop()]
+	 */
+
+	conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
+	mddev->private = conf;
+	if (!conf) {
+		printk(MEM_ERROR, mdidx(mddev));
+		goto out;
+	}
+	memset(conf, 0, sizeof(*conf));
+
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (rdev->faulty) {
+			/* this is a "should never happen" case and if it */
+			/* ever does happen, a continue; won't help */
+			printk(ERRORS, partition_name(rdev->dev));
+			continue;
+		} else {
+			/* this is a "should never happen" case and if it */
+			/* ever does happen, a continue; won't help */
+			if (!rdev->sb) {
+				MD_BUG();
+				continue;
+			}
+		}
+		if (rdev->desc_nr == -1) {
+			MD_BUG();
+			continue;
+		}
+
+		desc = &sb->disks[rdev->desc_nr];
+		disk_idx = desc->raid_disk;
+		disk = conf->multipaths + disk_idx;
+
+		if (!disk_sync(desc))
+			printk(NOT_IN_SYNC, partition_name(rdev->dev));
+
+		/*
+		 * Mark all disks as spare to start with, then pick our
+		 * active disk.  If we have a disk that is marked active
+		 * in the sb, then use it, else use the first rdev.
+		 */
+		disk->number = desc->number;
+		disk->raid_disk = desc->raid_disk;
+		disk->dev = rdev->dev;
+		disk->sect_limit = MAX_WORK_PER_DISK;
+		disk->operational = 0;
+		disk->write_only = 0;
+		disk->spare = 1;
+		disk->used_slot = 1;
+		disk->head_position = 0;
+		mark_disk_sync(desc);
+
+		if (disk_active(desc)) {
+			if(!conf->working_disks) {
+				printk(OPERATIONAL, partition_name(rdev->dev),
+ 					desc->raid_disk);
+				disk->operational = 1;
+				disk->spare = 0;
+				conf->working_disks++;
+				def_rdev = rdev;
+			} else {
+				mark_disk_spare(desc);
+			}
+		} else
+			mark_disk_spare(desc);
+
+		if(!num_rdevs++) def_rdev = rdev;
+	}
+	if(!conf->working_disks && num_rdevs) {
+		desc = &sb->disks[def_rdev->desc_nr];
+		disk = conf->multipaths + desc->raid_disk;
+		printk(OPERATIONAL, partition_name(def_rdev->dev),
+			disk->raid_disk);
+		disk->operational = 1;
+		disk->spare = 0;
+		conf->working_disks++;
+		mark_disk_active(desc);
+	}
+	/*
+	 * Make sure our active path is in desc spot 0
+	 */
+	if(def_rdev->desc_nr != 0) {
+		rdev = find_rdev_nr(mddev, 0);
+		desc = &sb->disks[def_rdev->desc_nr];
+		desc2 = sb->disks;
+		disk = conf->multipaths + desc->raid_disk;
+		disk2 = conf->multipaths + desc2->raid_disk;
+		xchg_values(*desc2,*desc);
+		xchg_values(*disk2,*disk);
+		xchg_values(desc2->number, desc->number);
+		xchg_values(disk2->number, disk->number);
+		xchg_values(desc2->raid_disk, desc->raid_disk);
+		xchg_values(disk2->raid_disk, disk->raid_disk);
+		if(rdev) {
+			xchg_values(def_rdev->desc_nr,rdev->desc_nr);
+		} else {
+			def_rdev->desc_nr = 0;
+		}
+	}
+	conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
+	conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
+	sb->failed_disks = 0;
+	sb->spare_disks = num_rdevs - 1;
+	mddev->sb_dirty = 1;
+	conf->mddev = mddev;
+	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
+
+	conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
+	init_waitqueue_head(&conf->wait_buffer);
+	init_waitqueue_head(&conf->wait_done);
+	init_waitqueue_head(&conf->wait_ready);
+
+	if (!conf->working_disks) {
+		printk(NONE_OPERATIONAL, mdidx(mddev));
+		goto out_free_conf;
+	}
+
+
+	/* pre-allocate some buffer_head structures.
+	 * As a minimum, 1 mpbh and raid_disks buffer_heads
+	 * would probably get us by in tight memory situations,
+	 * but a few more is probably a good idea.
+	 * For now, try 16 mpbh and 16*raid_disks bufferheads
+	 * This will allow at least 16 concurrent reads or writes
+	 * even if kmalloc starts failing
+	 */
+	if (multipath_grow_mpbh(conf, 16) < 16 ||
+	    multipath_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) {
+		printk(MEM_ERROR, mdidx(mddev));
+		goto out_free_conf;
+	}
+
+	if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) {
+		/*
+		 * we do sanity checks even if the device says
+		 * it's clean ...
+		 */
+		if (check_consistency(mddev)) {
+			printk(SB_DIFFERENCES);
+			sb->state &= ~(1 << MD_SB_CLEAN);
+		}
+	}
+
+	{
+		const char * name = "multipathd";
+
+		conf->thread = md_register_thread(multipathd, conf, name);
+		if (!conf->thread) {
+			printk(THREAD_ERROR, mdidx(mddev));
+			goto out_free_conf;
+		}
+	}
+
+	/*
+	 * Regenerate the "device is in sync with the raid set" bit for
+	 * each device.
+	 */
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		mark_disk_nonsync(sb->disks+i);
+		for (j = 0; j < sb->raid_disks; j++) {
+			if (sb->disks[i].number == conf->multipaths[j].number)
+				mark_disk_sync(sb->disks+i);
+		}
+	}
+
+	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks,
+			sb->raid_disks, sb->spare_disks);
+	/*
+	 * Ok, everything is just fine now
+	 */
+	return 0;
+
+out_free_conf:
+	multipath_shrink_mpbh(conf);
+	multipath_shrink_bh(conf, conf->freebh_cnt);
+	kfree(conf);
+	mddev->private = NULL;
+out:
+	MOD_DEC_USE_COUNT;
+	return -EIO;
+}
+
+#undef INVALID_LEVEL
+#undef NO_SB
+#undef ERRORS
+#undef NOT_IN_SYNC
+#undef INCONSISTENT
+#undef ALREADY_RUNNING
+#undef OPERATIONAL
+#undef SPARE
+#undef NONE_OPERATIONAL
+#undef SB_DIFFERENCES
+#undef ARRAY_IS_ACTIVE
+
+static int multipath_stop (mddev_t *mddev)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+
+	md_unregister_thread(conf->thread);
+	multipath_shrink_mpbh(conf);
+	multipath_shrink_bh(conf, conf->freebh_cnt);
+	kfree(conf);
+	mddev->private = NULL;
+	MOD_DEC_USE_COUNT;
+	return 0;
+}
+
+static mdk_personality_t multipath_personality=
+{
+	name:		"multipath",
+	make_request:	multipath_make_request,
+	run:		multipath_run,
+	stop:		multipath_stop,
+	status:		multipath_status,
+	error_handler:	multipath_error,
+	diskop:		multipath_diskop,
+};
+
+static int md__init multipath_init (void)
+{
+	return register_md_personality (MULTIPATH, &multipath_personality);
+}
+
+static void multipath_exit (void)
+{
+	unregister_md_personality (MULTIPATH);
+}
+
+module_init(multipath_init);
+module_exit(multipath_exit);
+