File failfast of Package mdadm
From 38f27538b2c807e5d6d933474c1404cf78731d22 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 25 Nov 2011 12:19:19 +1100
Subject: [PATCH] Add failfast support.
Allow per-device failfast flag to be set when creating an
array or adding devices to an array.
Also print failfast status in --detail and --examine.
Signed-off-by: NeilBrown <neilb@suse.de>
---
Create.c | 2 ++
Detail.c | 1 +
Incremental.c | 1 +
Manage.c | 20 +++++++++++++++++++-
ReadMe.c | 2 ++
md_p.h | 1 +
mdadm.8.in | 28 +++++++++++++++++++++++++++-
mdadm.c | 11 +++++++++++
mdadm.h | 5 +++++
super0.c | 11 +++++++----
super1.c | 13 +++++++++++++
11 files changed, 89 insertions(+), 6 deletions(-)
--- mdadm-3.3.orig/Create.c
+++ mdadm-3.3/Create.c
@@ -870,6 +870,8 @@ int Create(struct supertype *st, char *m
if (dv->writemostly == 1)
inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+ if (dv->failfast == 1)
+ inf->disk.state |= (1<<MD_DISK_FAILFAST);
if (have_container)
fd = -1;
--- mdadm-3.3.orig/Detail.c
+++ mdadm-3.3/Detail.c
@@ -650,6 +650,7 @@ This is pretty boring
}
if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed");
if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly");
+ if (disk.state & (1<<MD_DISK_FAILFAST)) printf(" failfast");
if ((disk.state &
((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC)
|(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY)))
--- mdadm-3.3.orig/Incremental.c
+++ mdadm-3.3/Incremental.c
@@ -1031,6 +1031,7 @@ static int array_try_spare(char *devname
devlist.next = NULL;
devlist.used = 0;
devlist.writemostly = 0;
+ devlist.failfast = 0;
devlist.devname = devname;
sprintf(devname, "%d:%d", major(stb.st_rdev),
minor(stb.st_rdev));
--- mdadm-3.3.orig/Manage.c
+++ mdadm-3.3/Manage.c
@@ -647,8 +647,13 @@ int attempt_re_add(int fd, int tfd, stru
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
if (dv->writemostly == 2)
disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
+ if (dv->failfast == 1)
+ disc.state |= 1 << MD_DISK_FAILFAST;
+ if (dv->failfast == 2)
+ disc.state &= ~(1 << MD_DISK_FAILFAST);
remove_partitions(tfd);
- if (update || dv->writemostly > 0) {
+ if (update || dv->writemostly > 0
+ || dv->failfast > 0) {
int rv = -1;
tfd = dev_open(dv->devname, O_RDWR);
if (tfd < 0) {
@@ -665,6 +670,14 @@ int attempt_re_add(int fd, int tfd, stru
rv = dev_st->ss->update_super(
dev_st, NULL, "readwrite",
devname, verbose, 0, NULL);
+ if (dv->failfast == 1)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "failfast",
+ devname, verbose, 0, NULL);
+ if (dv->failfast == 2)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "nofailfast",
+ devname, verbose, 0, NULL);
if (update)
rv = dev_st->ss->update_super(
dev_st, NULL, update,
@@ -897,6 +910,8 @@ int Manage_add(int fd, int tfd, struct m
int dfd;
if (dv->writemostly == 1)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+ if (dv->failfast == 1)
+ disc.state |= 1 << MD_DISK_FAILFAST;
dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
if (tst->ss->add_to_super(tst, &disc, dfd,
dv->devname, INVALID_SECTORS))
@@ -934,6 +949,8 @@ int Manage_add(int fd, int tfd, struct m
}
if (dv->writemostly == 1)
disc.state |= (1 << MD_DISK_WRITEMOSTLY);
+ if (dv->failfast == 1)
+ disc.state |= (1 << MD_DISK_FAILFAST);
if (tst->ss->external) {
/* add a disk
* to an external metadata container */
@@ -1658,6 +1675,7 @@ int move_spare(char *from_devname, char
devlist.next = NULL;
devlist.used = 0;
devlist.writemostly = 0;
+ devlist.failfast = 0;
devlist.devname = devname;
sprintf(devname, "%d:%d", major(devid), minor(devid));
--- mdadm-3.3.orig/ReadMe.c
+++ mdadm-3.3/ReadMe.c
@@ -136,6 +136,8 @@ struct option long_options[] = {
{"bitmap-chunk", 1, 0, BitmapChunk},
{"write-behind", 2, 0, WriteBehind},
{"write-mostly",0, 0, WriteMostly},
+ {"failfast", 0, 0, FailFast},
+ {"nofailfast",0, 0, NoFailFast},
{"re-add", 0, 0, ReAdd},
{"homehost", 1, 0, HomeHost},
{"symlinks", 1, 0, Symlinks},
--- mdadm-3.3.orig/md_p.h
+++ mdadm-3.3/md_p.h
@@ -83,6 +83,7 @@
* read requests will only be sent here in
* dire need
*/
+#define MD_DISK_FAILFAST 10 /* Fewer retries, more failures */
#define MD_DISK_REPLACEMENT 17
--- mdadm-3.3.orig/mdadm.8.in
+++ mdadm-3.3/mdadm.8.in
@@ -726,7 +726,7 @@ subsequent devices listed in a
.BR \-\-create ,
or
.B \-\-add
-command will be flagged as 'write-mostly'. This is valid for RAID1
+command will be flagged as 'write\-mostly'. This is valid for RAID1
only and means that the 'md' driver will avoid reading from these
devices if at all possible. This can be useful if mirroring over a
slow link.
@@ -741,6 +741,21 @@ mode, and write-behind is only attempted
.IR write-mostly .
.TP
+.BR \-\-failfast
+subsequent devices listed in a
+.BR \-\-create ,
+or
+.B \-\-add
+command will be flagged as 'failfast'. This is valid for RAID1 and
+RAID10 only. IO requests to these devices will be encouraged to fail
+quickly rather than cause long delays due to error handling. Also no
+attempt is made to repair a read error on these devices.
+
+If an array becomes degraded so that the 'failfast' device is the only
+usable device, the 'failfast' flag will then be ignored and extended
+delays will be preferred to complete failure.
+
+.TP
.BR \-\-assume\-clean
Tell
.I mdadm
@@ -1378,6 +1393,17 @@ will avoid reading from these devices if
Subsequent devices that are added or re\-added will have the 'write-mostly'
flag cleared.
+.TP
+.BR \-\-failfast
+Subsequent devices that are added or re\-added will have
+the 'failfast' flag set. This is only valid for RAID1 and RAID10 and
+means that the 'md' driver will avoid long timeouts on error handling
+where possible.
+.TP
+.BR \-\-nofailfast
+Subsequent devices that are re\-added will be re\-added without
+the 'failfast' flag set.
+
.P
Each of these options requires that the first device listed is the array
to be acted upon, and the remainder are component devices to be added,
--- mdadm-3.3.orig/mdadm.c
+++ mdadm-3.3/mdadm.c
@@ -88,6 +88,7 @@ int main(int argc, char *argv[])
int spare_sharing = 1;
struct supertype *ss = NULL;
int writemostly = 0;
+ int failfast = 0;
char *shortopt = short_options;
int dosyslog = 0;
int rebuild_map = 0;
@@ -289,6 +290,7 @@ int main(int argc, char *argv[])
dv->devname = optarg;
dv->disposition = devmode;
dv->writemostly = writemostly;
+ dv->failfast = failfast;
dv->used = 0;
dv->next = NULL;
*devlistend = dv;
@@ -347,6 +349,7 @@ int main(int argc, char *argv[])
dv->devname = optarg;
dv->disposition = devmode;
dv->writemostly = writemostly;
+ dv->failfast = failfast;
dv->used = 0;
dv->next = NULL;
*devlistend = dv;
@@ -414,6 +417,14 @@ int main(int argc, char *argv[])
writemostly = 2;
continue;
+ case O(MANAGE,FailFast):
+ case O(CREATE,FailFast):
+ failfast = 1;
+ continue;
+ case O(MANAGE,NoFailFast):
+ failfast = 2;
+ continue;
+
case O(GROW,'z'):
case O(CREATE,'z'):
case O(BUILD,'z'): /* size */
--- mdadm-3.3.orig/mdadm.h
+++ mdadm-3.3/mdadm.h
@@ -318,6 +318,8 @@ enum special_options {
ConfigFile,
ChunkSize,
WriteMostly,
+ FailFast,
+ NoFailFast,
Layout,
Auto,
Force,
@@ -436,6 +438,7 @@ struct mddev_dev {
* Not set for names read from .config
*/
char writemostly; /* 1 for 'set writemostly', 2 for 'clear writemostly' */
+ char failfast; /* Ditto but for 'failfast' flag */
int used; /* set when used */
long long data_offset;
struct mddev_dev *next;
@@ -747,6 +750,8 @@ extern struct superswitch {
* linear-grow-update - now change the size of the array.
* writemostly - set the WriteMostly1 bit in the superblock devflags
* readwrite - clear the WriteMostly1 bit in the superblock devflags
+ * failfast - set the FailFast1 bit in the superblock
+ * nofailfast - clear the FailFast1 bit
* no-bitmap - clear any record that a bitmap is present.
* bbl - add a bad-block-log if possible
* no-bbl - remove and bad-block-log is it is empty.
--- mdadm-3.3.orig/super0.c
+++ mdadm-3.3/super0.c
@@ -216,19 +216,21 @@ static void examine_super0(struct supert
mdp_disk_t *dp;
char *dv;
char nb[5];
- int wonly;
+ int wonly, failfast;
if (d>=0) dp = &sb->disks[d];
else dp = &sb->this_disk;
snprintf(nb, sizeof(nb), "%4d", d);
printf("%4s %5d %5d %5d %5d ", d < 0 ? "this" : nb,
dp->number, dp->major, dp->minor, dp->raid_disk);
wonly = dp->state & (1<<MD_DISK_WRITEMOSTLY);
- dp->state &= ~(1<<MD_DISK_WRITEMOSTLY);
+ failfast = dp->state & (1<<MD_DISK_FAILFAST);
+ dp->state &= ~(wonly | failfast);
if (dp->state & (1<<MD_DISK_FAULTY)) printf(" faulty");
if (dp->state & (1<<MD_DISK_ACTIVE)) printf(" active");
if (dp->state & (1<<MD_DISK_SYNC)) printf(" sync");
if (dp->state & (1<<MD_DISK_REMOVED)) printf(" removed");
if (wonly) printf(" write-mostly");
+ if (failfast) printf(" failfast");
if (dp->state == 0) printf(" spare");
if ((dv=map_dev(dp->major, dp->minor, 0)))
printf(" %s", dv);
@@ -557,7 +559,8 @@ static int update_super0(struct supertyp
} else if (strcmp(update, "assemble")==0) {
int d = info->disk.number;
int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY);
- int mask = (1<<MD_DISK_WRITEMOSTLY);
+ int failfast = sb->disks[d].state & (1<<MD_DISK_FAILFAST);
+ int mask = (1<<MD_DISK_WRITEMOSTLY)|(1<<MD_DISK_FAILFAST);
int add = 0;
if (sb->minor_version >= 91)
/* During reshape we don't insist on everything
@@ -566,7 +569,7 @@ static int update_super0(struct supertyp
add = (1<<MD_DISK_SYNC);
if (((sb->disks[d].state & ~mask) | add)
!= (unsigned)info->disk.state) {
- sb->disks[d].state = info->disk.state | wonly;
+ sb->disks[d].state = info->disk.state | wonly |failfast;
rv = 1;
}
if (info->reshape_active &&
--- mdadm-3.3.orig/super1.c
+++ mdadm-3.3/super1.c
@@ -73,6 +73,7 @@ struct mdp_superblock_1 {
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
__u8 devflags; /* per-device flags. Only one defined...*/
#define WriteMostly1 1 /* mask for writemostly flag in above */
+#define FailFast1 2 /* Device should get FailFast requests */
/* bad block log. If there are any bad blocks the feature flag is set.
* if offset and size are non-zero, that space is reserved and available.
*/
@@ -395,6 +396,8 @@ static void examine_super1(struct supert
printf(" Flags :");
if (sb->devflags & WriteMostly1)
printf(" write-mostly");
+ if (sb->devflags & FailFast1)
+ printf(" failfast");
printf("\n");
}
@@ -985,6 +988,8 @@ static void getinfo_super1(struct supert
}
if (sb->devflags & WriteMostly1)
info->disk.state |= (1 << MD_DISK_WRITEMOSTLY);
+ if (sb->devflags & FailFast1)
+ info->disk.state |= (1 << MD_DISK_FAILFAST);
info->events = __le64_to_cpu(sb->events);
sprintf(info->text_version, "1.%d", st->minor_version);
info->safe_mode_delay = 200;
@@ -1316,6 +1321,10 @@ static int update_super1(struct supertyp
sb->devflags |= WriteMostly1;
else if (strcmp(update, "readwrite")==0)
sb->devflags &= ~WriteMostly1;
+ else if (strcmp(update, "failfast") == 0)
+ sb->devflags |= FailFast1;
+ else if (strcmp(update, "nofailfast") == 0)
+ sb->devflags &= ~FailFast1;
else
rv = -1;
@@ -1572,6 +1581,10 @@ static int write_init_super1(struct supe
sb->devflags |= WriteMostly1;
else
sb->devflags &= ~WriteMostly1;
+ if (di->disk.state & (1<<MD_DISK_FAILFAST))
+ sb->devflags |= FailFast1;
+ else
+ sb->devflags &= ~FailFast1;
if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
read(rfd, sb->device_uuid, 16) != 16) {