File failfast of Package mdadm

From 38f27538b2c807e5d6d933474c1404cf78731d22 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 25 Nov 2011 12:19:19 +1100
Subject: [PATCH] Add failfast support.

Allow per-device failfast flag to be set when creating an
array or adding devices to an array.

Also print failfast status in --detail and --examine.

Signed-off-by: NeilBrown <neilb@suse.de>

---
 Create.c      |    2 ++
 Detail.c      |    1 +
 Incremental.c |    1 +
 Manage.c      |   20 +++++++++++++++++++-
 ReadMe.c      |    2 ++
 md_p.h        |    1 +
 mdadm.8.in    |   28 +++++++++++++++++++++++++++-
 mdadm.c       |   11 +++++++++++
 mdadm.h       |    5 +++++
 super0.c      |   11 +++++++----
 super1.c      |   13 +++++++++++++
 11 files changed, 89 insertions(+), 6 deletions(-)

--- mdadm-3.3.orig/Create.c
+++ mdadm-3.3/Create.c
@@ -870,6 +870,8 @@ int Create(struct supertype *st, char *m
 
 				if (dv->writemostly == 1)
 					inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+				if (dv->failfast == 1)
+					inf->disk.state |= (1<<MD_DISK_FAILFAST);
 
 				if (have_container)
 					fd = -1;
--- mdadm-3.3.orig/Detail.c
+++ mdadm-3.3/Detail.c
@@ -650,6 +650,7 @@ This is pretty boring
 			}
 			if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed");
 			if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly");
+			if (disk.state & (1<<MD_DISK_FAILFAST)) printf(" failfast");
 			if ((disk.state &
 			     ((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC)
 			      |(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY)))
--- mdadm-3.3.orig/Incremental.c
+++ mdadm-3.3/Incremental.c
@@ -1031,6 +1031,7 @@ static int array_try_spare(char *devname
 			devlist.next = NULL;
 			devlist.used = 0;
 			devlist.writemostly = 0;
+			devlist.failfast = 0;
 			devlist.devname = devname;
 			sprintf(devname, "%d:%d", major(stb.st_rdev),
 				minor(stb.st_rdev));
--- mdadm-3.3.orig/Manage.c
+++ mdadm-3.3/Manage.c
@@ -647,8 +647,13 @@ int attempt_re_add(int fd, int tfd, stru
 			disc.state |= 1 << MD_DISK_WRITEMOSTLY;
 		if (dv->writemostly == 2)
 			disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
+		if (dv->failfast == 1)
+			disc.state |= 1 << MD_DISK_FAILFAST;
+		if (dv->failfast == 2)
+			disc.state &= ~(1 << MD_DISK_FAILFAST);
 		remove_partitions(tfd);
-		if (update || dv->writemostly > 0) {
+		if (update || dv->writemostly > 0
+			|| dv->failfast > 0) {
 			int rv = -1;
 			tfd = dev_open(dv->devname, O_RDWR);
 			if (tfd < 0) {
@@ -665,6 +670,14 @@ int attempt_re_add(int fd, int tfd, stru
 				rv = dev_st->ss->update_super(
 					dev_st, NULL, "readwrite",
 					devname, verbose, 0, NULL);
+			if (dv->failfast == 1)
+				rv = dev_st->ss->update_super(
+					dev_st, NULL, "failfast",
+					devname, verbose, 0, NULL);
+			if (dv->failfast == 2)
+				rv = dev_st->ss->update_super(
+					dev_st, NULL, "nofailfast",
+					devname, verbose, 0, NULL);
 			if (update)
 				rv = dev_st->ss->update_super(
 					dev_st, NULL, update,
@@ -897,6 +910,8 @@ int Manage_add(int fd, int tfd, struct m
 		int dfd;
 		if (dv->writemostly == 1)
 			disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+		if (dv->failfast == 1)
+			disc.state |= 1 << MD_DISK_FAILFAST;
 		dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
 		if (tst->ss->add_to_super(tst, &disc, dfd,
 					  dv->devname, INVALID_SECTORS))
@@ -934,6 +949,8 @@ int Manage_add(int fd, int tfd, struct m
 	}
 	if (dv->writemostly == 1)
 		disc.state |= (1 << MD_DISK_WRITEMOSTLY);
+	if (dv->failfast == 1)
+		disc.state |= (1 << MD_DISK_FAILFAST);
 	if (tst->ss->external) {
 		/* add a disk
 		 * to an external metadata container */
@@ -1658,6 +1675,7 @@ int move_spare(char *from_devname, char
 	devlist.next = NULL;
 	devlist.used = 0;
 	devlist.writemostly = 0;
+	devlist.failfast = 0;
 	devlist.devname = devname;
 	sprintf(devname, "%d:%d", major(devid), minor(devid));
 
--- mdadm-3.3.orig/ReadMe.c
+++ mdadm-3.3/ReadMe.c
@@ -136,6 +136,8 @@ struct option long_options[] = {
     {"bitmap-chunk", 1, 0, BitmapChunk},
     {"write-behind", 2, 0, WriteBehind},
     {"write-mostly",0, 0, WriteMostly},
+    {"failfast",  0, 0,  FailFast},
+    {"nofailfast",0, 0,  NoFailFast},
     {"re-add",    0, 0,  ReAdd},
     {"homehost",  1, 0,  HomeHost},
     {"symlinks",  1, 0,  Symlinks},
--- mdadm-3.3.orig/md_p.h
+++ mdadm-3.3/md_p.h
@@ -83,6 +83,7 @@
 				   * read requests will only be sent here in
 				   * dire need
 				   */
+#define	MD_DISK_FAILFAST	10 /* Fewer retries, more failures */
 
 #define MD_DISK_REPLACEMENT	17
 
--- mdadm-3.3.orig/mdadm.8.in
+++ mdadm-3.3/mdadm.8.in
@@ -726,7 +726,7 @@ subsequent devices listed in a
 .BR \-\-create ,
 or
 .B \-\-add
-command will be flagged as 'write-mostly'.  This is valid for RAID1
+command will be flagged as 'write\-mostly'.  This is valid for RAID1
 only and means that the 'md' driver will avoid reading from these
 devices if at all possible.  This can be useful if mirroring over a
 slow link.
@@ -741,6 +741,21 @@ mode, and write-behind is only attempted
 .IR write-mostly .
 
 .TP
+.BR \-\-failfast
+subsequence devices listed in a
+.BR \-\-create ,
+or
+.B \-\-add
+command will be flagged as  'failfast'.  This is valid for RAID1 and
+RAID10 only.  IO requests to these devices will be encouraged to fail
+quickly rather than cause long delays due to error handling.  Also no
+attempt is made to repair a read error on these devices.
+
+If an array becomes degraded so that the 'failfast' device is the only
+usable device, the 'failfast' flag will then be ignored and extended
+delays will be preferred to complete failure.
+
+.TP
 .BR \-\-assume\-clean
 Tell
 .I mdadm
@@ -1378,6 +1393,17 @@ will avoid reading from these devices if
 Subsequent devices that are added or re\-added will have the 'write-mostly'
 flag cleared.
 
+.TP
+.BR \-\-failfast
+Subsequent devices that are added or re\-added will have
+the 'failfast' flag set.  This is only valid for RAID1 and RAID10 and
+means that the 'md' driver will avoid long timeouts on error handling
+where possible.
+.TP
+.BR \-\-nofailfast
+Subsequent devices that are re\-added will be re\-added without
+the 'failfast' flag set.
+
 .P
 Each of these options requires that the first device listed is the array
 to be acted upon, and the remainder are component devices to be added,
--- mdadm-3.3.orig/mdadm.c
+++ mdadm-3.3/mdadm.c
@@ -88,6 +88,7 @@ int main(int argc, char *argv[])
 	int spare_sharing = 1;
 	struct supertype *ss = NULL;
 	int writemostly = 0;
+	int failfast = 0;
 	char *shortopt = short_options;
 	int dosyslog = 0;
 	int rebuild_map = 0;
@@ -289,6 +290,7 @@ int main(int argc, char *argv[])
 					dv->devname = optarg;
 					dv->disposition = devmode;
 					dv->writemostly = writemostly;
+					dv->failfast = failfast;
 					dv->used = 0;
 					dv->next = NULL;
 					*devlistend = dv;
@@ -347,6 +349,7 @@ int main(int argc, char *argv[])
 			dv->devname = optarg;
 			dv->disposition = devmode;
 			dv->writemostly = writemostly;
+			dv->failfast = failfast;
 			dv->used = 0;
 			dv->next = NULL;
 			*devlistend = dv;
@@ -414,6 +417,14 @@ int main(int argc, char *argv[])
 			writemostly = 2;
 			continue;
 
+		case O(MANAGE,FailFast):
+		case O(CREATE,FailFast):
+			failfast = 1;
+			continue;
+		case O(MANAGE,NoFailFast):
+			failfast = 2;
+			continue;
+
 		case O(GROW,'z'):
 		case O(CREATE,'z'):
 		case O(BUILD,'z'): /* size */
--- mdadm-3.3.orig/mdadm.h
+++ mdadm-3.3/mdadm.h
@@ -318,6 +318,8 @@ enum special_options {
 	ConfigFile,
 	ChunkSize,
 	WriteMostly,
+	FailFast,
+	NoFailFast,
 	Layout,
 	Auto,
 	Force,
@@ -436,6 +438,7 @@ struct mddev_dev {
 				 * Not set for names read from .config
 				 */
 	char writemostly;	/* 1 for 'set writemostly', 2 for 'clear writemostly' */
+	char failfast;		/* Ditto but for 'failfast' flag */
 	int used;		/* set when used */
 	long long data_offset;
 	struct mddev_dev *next;
@@ -747,6 +750,8 @@ extern struct superswitch {
 	 *   linear-grow-update - now change the size of the array.
 	 *   writemostly - set the WriteMostly1 bit in the superblock devflags
 	 *   readwrite - clear the WriteMostly1 bit in the superblock devflags
+	 *   failfast - set the FailFast1 bit in the superblock
+	 *   nofailfast - clear the FailFast1 bit
 	 *   no-bitmap - clear any record that a bitmap is present.
 	 *   bbl       - add a bad-block-log if possible
 	 *   no-bbl    - remove and bad-block-log is it is empty.
--- mdadm-3.3.orig/super0.c
+++ mdadm-3.3/super0.c
@@ -216,19 +216,21 @@ static void examine_super0(struct supert
 		mdp_disk_t *dp;
 		char *dv;
 		char nb[5];
-		int wonly;
+		int wonly, failfast;
 		if (d>=0) dp = &sb->disks[d];
 		else dp = &sb->this_disk;
 		snprintf(nb, sizeof(nb), "%4d", d);
 		printf("%4s %5d   %5d    %5d    %5d     ", d < 0 ? "this" :  nb,
 		       dp->number, dp->major, dp->minor, dp->raid_disk);
 		wonly = dp->state & (1<<MD_DISK_WRITEMOSTLY);
-		dp->state &= ~(1<<MD_DISK_WRITEMOSTLY);
+		failfast = dp->state & (1<<MD_DISK_FAILFAST);
+		dp->state &= ~(wonly | failfast);
 		if (dp->state & (1<<MD_DISK_FAULTY)) printf(" faulty");
 		if (dp->state & (1<<MD_DISK_ACTIVE)) printf(" active");
 		if (dp->state & (1<<MD_DISK_SYNC)) printf(" sync");
 		if (dp->state & (1<<MD_DISK_REMOVED)) printf(" removed");
 		if (wonly) printf(" write-mostly");
+		if (failfast) printf(" failfast");
 		if (dp->state == 0) printf(" spare");
 		if ((dv=map_dev(dp->major, dp->minor, 0)))
 			printf("   %s", dv);
@@ -557,7 +559,8 @@ static int update_super0(struct supertyp
 	} else if (strcmp(update, "assemble")==0) {
 		int d = info->disk.number;
 		int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY);
-		int mask = (1<<MD_DISK_WRITEMOSTLY);
+		int failfast = sb->disks[d].state & (1<<MD_DISK_FAILFAST);
+		int mask = (1<<MD_DISK_WRITEMOSTLY)|(1<<MD_DISK_FAILFAST);
 		int add = 0;
 		if (sb->minor_version >= 91)
 			/* During reshape we don't insist on everything
@@ -566,7 +569,7 @@ static int update_super0(struct supertyp
 			add = (1<<MD_DISK_SYNC);
 		if (((sb->disks[d].state & ~mask) | add)
 		    != (unsigned)info->disk.state) {
-			sb->disks[d].state = info->disk.state | wonly;
+			sb->disks[d].state = info->disk.state | wonly |failfast;
 			rv = 1;
 		}
 		if (info->reshape_active &&
--- mdadm-3.3.orig/super1.c
+++ mdadm-3.3/super1.c
@@ -73,6 +73,7 @@ struct mdp_superblock_1 {
 	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
 	__u8    devflags;        /* per-device flags.  Only one defined...*/
 #define WriteMostly1    1        /* mask for writemostly flag in above */
+#define FailFast1	2        /* Device should get FailFast requests */
 	/* bad block log.  If there are any bad blocks the feature flag is set.
 	 * if offset and size are non-zero, that space is reserved and available.
 	 */
@@ -395,6 +396,8 @@ static void examine_super1(struct supert
 		printf("          Flags :");
 		if (sb->devflags & WriteMostly1)
 			printf(" write-mostly");
+		if (sb->devflags & FailFast1)
+			printf(" failfast");
 		printf("\n");
 	}
 
@@ -985,6 +988,8 @@ static void getinfo_super1(struct supert
 	}
 	if (sb->devflags & WriteMostly1)
 		info->disk.state |= (1 << MD_DISK_WRITEMOSTLY);
+	if (sb->devflags & FailFast1)
+		info->disk.state |= (1 << MD_DISK_FAILFAST);
 	info->events = __le64_to_cpu(sb->events);
 	sprintf(info->text_version, "1.%d", st->minor_version);
 	info->safe_mode_delay = 200;
@@ -1316,6 +1321,10 @@ static int update_super1(struct supertyp
 		sb->devflags |= WriteMostly1;
 	else if (strcmp(update, "readwrite")==0)
 		sb->devflags &= ~WriteMostly1;
+	else if (strcmp(update, "failfast") == 0)
+		sb->devflags |= FailFast1;
+	else if (strcmp(update, "nofailfast") == 0)
+		sb->devflags &= ~FailFast1;
 	else
 		rv = -1;
 
@@ -1572,6 +1581,10 @@ static int write_init_super1(struct supe
 			sb->devflags |= WriteMostly1;
 		else
 			sb->devflags &= ~WriteMostly1;
+		if (di->disk.state & (1<<MD_DISK_FAILFAST))
+			sb->devflags |= FailFast1;
+		else
+			sb->devflags &= ~FailFast1;
 
 		if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
 		    read(rfd, sb->device_uuid, 16) != 16) {
openSUSE Build Service is sponsored by