From 4675719d0f47d18bc13db62bd21cffd4e4ec8001 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Tue, 2 Jul 2019 22:35:48 +0800
Subject: [PATCH 1/9] raid1: use an int as the return value of raise_barrier()

Using a sector_t as the return value is misleading, because
raise_barrier() only returns 0 or -EINTR. Also add comments for the
return values of raise_barrier().

Signed-off-by: Hou Tao
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 34e26834ad28..108c75006cef 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -872,8 +872,11 @@ static void flush_pending_writes(struct r1conf *conf)
  * backgroup IO calls must call raise_barrier. Once that returns
  *    there is no normal IO happeing. It must arrange to call
  *    lower_barrier when the particular background IO completes.
+ *
+ * If resync/recovery is interrupted, returns -EINTR;
+ * Otherwise, returns 0.
  */
-static sector_t raise_barrier(struct r1conf *conf, sector_t sector_nr)
+static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
 {
 	int idx = sector_to_idx(sector_nr);
 

From 143f6e733b73051cd22dcb80951c6c929da413ce Mon Sep 17 00:00:00 2001
From: Xiao Ni
Date: Mon, 8 Jul 2019 10:14:32 +0800
Subject: [PATCH 2/9] md/raid6: Set R5_ReadError when there is read failure on parity disk

7471fb77ce4d ("md/raid6: Fix anomily when recovering a single device in
RAID6.") avoids rereading P when it can be computed from other members.
However, this misses the chance to re-write the right data to P. This
patch sets R5_ReadError if the re-read fails.

Also, when the re-read is skipped, we miss the chance to reset
rdev->read_errors to 0, which can fail the disk when there are many read
errors on the P member disk (while the other disks have no read errors).

V2: upper-layer read requests don't read the parity/Q data, so there is
no need to consider that situation here.

Reported-by: kbuild test robot
Fixes: 7471fb77ce4d ("md/raid6: Fix anomily when recovering a single device in RAID6.")
Cc: #4.4+
Signed-off-by: Xiao Ni
Signed-off-by: Song Liu
---
 drivers/md/raid5.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3de4e13bde98..59cafafd5a5d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2558,7 +2558,9 @@ static void raid5_end_read_request(struct bio * bi)
 		    && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 			retry = 1;
 		if (retry)
-			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+			if (sh->qd_idx >= 0 && sh->pd_idx == i)
+				set_bit(R5_ReadError, &sh->dev[i].flags);
+			else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
 				set_bit(R5_ReadError, &sh->dev[i].flags);
 				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
 			} else
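For illustration, a minimal userspace sketch of the stripe-head check the patch
above adds (the struct, indices and values here are hypothetical stand-ins, not
the kernel's definitions): in a RAID6 stripe, qd_idx is non-negative because a
Q block exists, and pd_idx names the member that holds P, so a failed re-read on
that member is now flagged with R5_ReadError.

#include <stdbool.h>
#include <stdio.h>

struct stripe { int pd_idx; int qd_idx; };

/* Return true when member 'i' is the P disk of a RAID6 stripe. */
static bool is_raid6_parity_member(const struct stripe *sh, int i)
{
	return sh->qd_idx >= 0 && sh->pd_idx == i;
}

int main(void)
{
	struct stripe raid6 = { .pd_idx = 4, .qd_idx = 5 };  /* 6-device RAID6 */
	struct stripe raid5 = { .pd_idx = 3, .qd_idx = -1 }; /* 4-device RAID5, no Q */

	printf("RAID6, dev 4: %d\n", is_raid6_parity_member(&raid6, 4)); /* 1 */
	printf("RAID5, dev 3: %d\n", is_raid6_parity_member(&raid5, 3)); /* 0 */
	return 0;
}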
From eeba6809d8d58908b5ed1b5ceb5fcb09a98a7cad Mon Sep 17 00:00:00 2001
From: Yufen Yu
Date: Fri, 19 Jul 2019 13:48:46 +0800
Subject: [PATCH 3/9] md/raid1: end bio when the device faulty

When a write bio returns an error, it is added to conf->retry_list and
waits for the raid1d thread to retry the write and acknowledge badblocks.
In narrow_write_error(), the error bio is split into units of the badblock
shift (such as one sector) and the raid1d thread issues them one by one.
Only after all of the split bios have finished can the raid1d thread go on
to process other things, which is time consuming.

But there is one error-handling case that is not necessary. When the
device has been set faulty, flush_bio_list() may end bios in
pending_bio_list with error status. Since these bios have never actually
been issued to the device, error handling that retries the write and
acknowledges badblocks makes no sense. Even apart from that case, when the
device is faulty, badblocks info cannot be written out to the device, so
there is no need to handle the error IO either.

Signed-off-by: Yufen Yu
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 108c75006cef..7ffbd8112400 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -447,19 +447,21 @@ static void raid1_end_write_request(struct bio *bio)
 			/* We never try FailFast to WriteMostly devices */
 			 !test_bit(WriteMostly, &rdev->flags)) {
 			md_error(r1_bio->mddev, rdev);
-			if (!test_bit(Faulty, &rdev->flags))
-				/* This is the only remaining device,
-				 * We need to retry the write without
-				 * FailFast
-				 */
-				set_bit(R1BIO_WriteError, &r1_bio->state);
-			else {
-				/* Finished with this branch */
-				r1_bio->bios[mirror] = NULL;
-				to_put = bio;
-			}
-		} else
+		}
+
+		/*
+		 * When the device is faulty, it is not necessary to
+		 * handle write error.
+		 * For failfast, this is the only remaining device,
+		 * We need to retry the write without FailFast.
+		 */
+		if (!test_bit(Faulty, &rdev->flags))
 			set_bit(R1BIO_WriteError, &r1_bio->state);
+		else {
+			/* Finished with this branch */
+			r1_bio->bios[mirror] = NULL;
+			to_put = bio;
+		}
 	} else {
 		/*
 		 * Set R1BIO_Uptodate in our master bio, so that we

From 7cee6d4e6035603d42acd56d591e624921aa1b14 Mon Sep 17 00:00:00 2001
From: Yufen Yu
Date: Fri, 19 Jul 2019 13:48:47 +0800
Subject: [PATCH 4/9] md/raid10: end bio when the device faulty

Just like raid1, do not queue the write-error bio to retry the write and
acknowledge badblocks when the device is faulty.

Signed-off-by: Yufen Yu
Signed-off-by: Song Liu
---
 drivers/md/raid10.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8a1354a08a1a..a982e040b609 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -465,19 +465,21 @@ static void raid10_end_write_request(struct bio *bio)
 		if (test_bit(FailFast, &rdev->flags) &&
 		    (bio->bi_opf & MD_FAILFAST)) {
 			md_error(rdev->mddev, rdev);
-			if (!test_bit(Faulty, &rdev->flags))
-				/* This is the only remaining device,
-				 * We need to retry the write without
-				 * FailFast
-				 */
-				set_bit(R10BIO_WriteError, &r10_bio->state);
-			else {
-				r10_bio->devs[slot].bio = NULL;
-				to_put = bio;
-				dec_rdev = 1;
-			}
-		} else
+		}
+
+		/*
+		 * When the device is faulty, it is not necessary to
+		 * handle write error.
+		 * For failfast, this is the only remaining device,
+		 * We need to retry the write without FailFast.
+		 */
+		if (!test_bit(Faulty, &rdev->flags))
 			set_bit(R10BIO_WriteError, &r10_bio->state);
+		else {
+			r10_bio->devs[slot].bio = NULL;
+			to_put = bio;
+			dec_rdev = 1;
+		}
 		}
 	} else {
 		/*
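To see why skipping this path matters, here is a much-simplified userspace model
(not the kernel implementation; the badblock unit and the device state are
assumptions made only for the demo) of how a narrow_write_error()-style retry
re-issues a failed write one badblock-sized chunk at a time, and how a faulty
device now short-circuits that work entirely.

#include <stdbool.h>
#include <stdio.h>

static const long BB_SECTORS = 1;  /* assumed badblock unit: one sector */
static bool device_faulty = true;  /* assumed device state for the demo */

static void issue_one_chunk(long sector, long len)
{
	printf("retry write: sectors %ld..%ld\n", sector, sector + len - 1);
}

static void handle_write_error(long start, long nr_sectors)
{
	if (device_faulty) {
		/* Nothing was really issued and nothing can be fixed: just end the bio. */
		printf("device faulty: end bio, skip %ld per-sector retries\n", nr_sectors);
		return;
	}
	/* Otherwise re-issue the failed range one badblock-sized chunk at a time. */
	for (long s = start; s < start + nr_sectors; s += BB_SECTORS)
		issue_one_chunk(s, BB_SECTORS);
}

int main(void)
{
	handle_write_error(1024, 8);   /* faulty: one message, no retries */
	device_faulty = false;
	handle_write_error(1024, 8);   /* healthy: eight single-sector retries */
	return 0;
}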
From cf89160793c439dca00e2563d0b7f153c274027b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Tue, 23 Jul 2019 23:41:55 +0300
Subject: [PATCH 5/9] md: Convert to use int_pow()

Instead of a linear approach to calculating a power of 10, use the
generic int_pow(), which does it better.

Signed-off-by: Andy Shevchenko
Signed-off-by: Song Liu
---
 drivers/md/md.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 24638ccedce4..3f1252440ad0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3664,11 +3664,7 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
 		return -EINVAL;
 	if (decimals < 0)
 		decimals = 0;
-	while (decimals < scale) {
-		result *= 10;
-		decimals ++;
-	}
-	*res = result;
+	*res = result * int_pow(10, scale - decimals);
 	return 0;
 }
 
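A userspace approximation of the scaling step above. The int_pow() defined here
is only a stand-in with the same meaning as the kernel helper (u64 int_pow(u64
base, unsigned int exp)); the string parsing that produces result and decimals
is omitted, and the sample values are made up for the demo.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's int_pow(): exponentiation by squaring. */
static uint64_t int_pow(uint64_t base, unsigned int exp)
{
	uint64_t result = 1;

	while (exp) {
		if (exp & 1)
			result *= base;
		exp >>= 1;
		base *= base;
	}
	return result;
}

int main(void)
{
	unsigned long result = 31;  /* digits parsed from "3.1" */
	int decimals = 1;           /* one digit after the decimal point */
	int scale = 3;              /* caller wants the value scaled by 10^3 */

	/* "3.1" scaled by 10^3 => 31 * 10^(3-1) = 3100 */
	unsigned long res = result * int_pow(10, scale - decimals);

	printf("%lu\n", res); /* 3100 */
	return 0;
}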
From 9a567843f7ce0037bfd4d5fdc58a09d0a527b28b Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Wed, 24 Jul 2019 11:09:19 +0200
Subject: [PATCH 6/9] md: allow last device to be forcibly removed from RAID1/RAID10.

When the 'last' device in a RAID1 or RAID10 reports an error, we do not
mark it as failed. This would serve little purpose as there is no risk of
losing data beyond that which is obviously lost (as there is with RAID5),
and there could be other sectors on the device which are readable, and
only readable from this device. This, in general, maximises access to the
data.

However, the current implementation also stops an admin from removing the
last device by direct action. This is rarely useful, but in many cases it
is not harmful and can make automation easier by removing special cases.

Also, if an attempt to write metadata fails the device must be marked as
faulty, else an infinite loop will result, attempting to update the
metadata on all non-faulty devices.

So add a 'fail_last_dev' member to 'struct mddev'; with it we can bypass
the 'last disk' checks for RAID1 and RAID10, and control the behaviour per
array by changing the sysfs node.

Signed-off-by: NeilBrown
[add sysfs node for fail_last_dev by Guoqing]
Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c     | 29 +++++++++++++++++++++++++++++
 drivers/md/md.h     |  1 +
 drivers/md/raid1.c  |  6 +++---
 drivers/md/raid10.c |  6 +++---
 4 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3f1252440ad0..67b90d547f8b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5178,6 +5178,34 @@ static struct md_sysfs_entry md_consistency_policy =
 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
        consistency_policy_store);
 
+static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
+{
+	return sprintf(page, "%d\n", mddev->fail_last_dev);
+}
+
+/*
+ * Setting fail_last_dev to true to allow last device to be forcibly removed
+ * from RAID1/RAID10.
+ */
+static ssize_t
+fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	int ret;
+	bool value;
+
+	ret = kstrtobool(buf, &value);
+	if (ret)
+		return ret;
+
+	if (value != mddev->fail_last_dev)
+		mddev->fail_last_dev = value;
+
+	return len;
+}
+static struct md_sysfs_entry md_fail_last_dev =
+__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
+       fail_last_dev_store);
+
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_layout.attr,
@@ -5194,6 +5222,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_array_size.attr,
 	&max_corr_read_errors.attr,
 	&md_consistency_policy.attr,
+	&md_fail_last_dev.attr,
 	NULL,
 };
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 10f98200e2f8..b742659150a2 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -487,6 +487,7 @@ struct mddev {
 	unsigned int			good_device_nr;	/* good device num within cluster raid */
 
 	bool	has_superblocks:1;
+	bool	fail_last_dev:1;
 };
 
 enum recovery_flags {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7ffbd8112400..cd80f281b95d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1617,12 +1617,12 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
-	 * else if it is the last working disks, ignore the error, let the
-	 * next level up know.
+	 * else if it is the last working disks with "fail_last_dev == false",
+	 * ignore the error, let the next level up know.
 	 * else mark the drive as failed
 	 */
 	spin_lock_irqsave(&conf->device_lock, flags);
-	if (test_bit(In_sync, &rdev->flags)
+	if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
 	    && (conf->raid_disks - mddev->degraded) == 1) {
 		/*
 		 * Don't fail the drive, act as though we were just a
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a982e040b609..299c7b1c9718 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1640,12 +1640,12 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
-	 * else if it is the last working disks, ignore the error, let the
-	 * next level up know.
+	 * else if it is the last working disks with "fail_last_dev == false",
+	 * ignore the error, let the next level up know.
 	 * else mark the drive as failed
 	 */
 	spin_lock_irqsave(&conf->device_lock, flags);
-	if (test_bit(In_sync, &rdev->flags)
+	if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
 	    && !enough(conf, rdev->raid_disk)) {
 		/*
 		 * Don't fail the drive, just return an IO error.
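A sketch of flipping the new attribute from userspace. The array name md0 is
only an example; the file sits alongside the other per-array md attributes in
sysfs, and the store handler above parses the value with kstrtobool(), which
accepts strings such as "1"/"0" or "y"/"n".

#include <stdio.h>

int main(void)
{
	/* Example path; substitute the md device actually in use. */
	const char *path = "/sys/block/md0/md/fail_last_dev";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);  /* allow the last device to be failed/removed */
	fclose(f);
	return 0;
}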
From 062f5b2ae12a153644c765e7ba3b0f825427be1d Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Wed, 24 Jul 2019 11:09:20 +0200
Subject: [PATCH 7/9] md: don't set In_sync if array is frozen

When a disk is added to an array, the following path is called in mdadm:

  Manage_subdevs -> sysfs_freeze_array -> Manage_add -> sysfs_set_str(&info, NULL, "sync_action","idle")

Then, from the kernel side, Manage_add invokes the path (add_new_disk ->
validate_super = super_1_validate) to set the In_sync flag. But In_sync
means "device is in_sync with rest of array", and the newly added disk
still needs the resync thread to synchronize its data; md_reap_sync_thread
will call spare_active to set In_sync for the newly added disk once that
has happened. So don't set In_sync if the array is frozen.

Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 67b90d547f8b..1cdaa6ff9bd7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1826,8 +1826,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 				if (!(le32_to_cpu(sb->feature_map) &
 				      MD_FEATURE_RECOVERY_BITMAP))
 					rdev->saved_raid_disk = -1;
-			} else
-				set_bit(In_sync, &rdev->flags);
+			} else {
+				/*
+				 * If the array is FROZEN, then the device can't
+				 * be in_sync with rest of array.
+				 */
+				if (!test_bit(MD_RECOVERY_FROZEN,
+					      &mddev->recovery))
+					set_bit(In_sync, &rdev->flags);
+			}
 			rdev->raid_disk = role;
 			break;
 		}

From 0d8ed0e9bf9643f27f4816dca61081784dedb38d Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Wed, 24 Jul 2019 11:09:21 +0200
Subject: [PATCH 8/9] md: don't call spare_active in md_reap_sync_thread if all member devices can't work

When a disk is added to an array, md_reap_sync_thread is responsible for
activating the spare and setting the In_sync flag for the new member in
spare_active(). But suppose raid1 has one member disk A, disk B is added
to the array, and then A is taken offline before all of the data has been
synchronized from A to B. Obviously B does not have the latest data that A
had, yet B is still marked with the In_sync flag.

So don't call spare_active under that condition; otherwise B is still
shown with the 'U' state, which is not correct.

Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1cdaa6ff9bd7..daa885ee4d60 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9075,7 +9075,8 @@ void md_reap_sync_thread(struct mddev *mddev)
 	/* resync has finished, collect result */
 	md_unregister_thread(&mddev->sync_thread);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
-	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+	    mddev->degraded != mddev->raid_disks) {
 		/* success...*/
 		/* activate any spares */
 		if (mddev->pers->spare_active(mddev)) {
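As a plain illustration of the new guard (a userspace model with made-up field
names mirroring mddev, not kernel code): spare activation is skipped when every
slot in the array is degraded, i.e. when the degraded count equals the total
number of member slots.

#include <stdbool.h>
#include <stdio.h>

struct array_state { int raid_disks; int degraded; };

/* Spares may only be activated if at least one member still works. */
static bool may_activate_spares(const struct array_state *st)
{
	return st->degraded != st->raid_disks;
}

int main(void)
{
	struct array_state half_gone = { .raid_disks = 2, .degraded = 1 };
	struct array_state all_gone  = { .raid_disks = 2, .degraded = 2 };

	printf("%d %d\n", may_activate_spares(&half_gone),   /* 1 */
			  may_activate_spares(&all_gone));    /* 0 */
	return 0;
}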
From 449808a254fd567d3dbeb11595a0af238c687c82 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Sat, 27 Jul 2019 14:02:58 +0800
Subject: [PATCH 9/9] raid1: factor out a common routine to handle the completion of sync write

It's just code clean-up.

Signed-off-by: Hou Tao
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cd80f281b95d..6ea4f2679b78 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1906,6 +1906,22 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
 	} while (sectors_to_go > 0);
 }
 
+static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+{
+	if (atomic_dec_and_test(&r1_bio->remaining)) {
+		struct mddev *mddev = r1_bio->mddev;
+		int s = r1_bio->sectors;
+
+		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+		    test_bit(R1BIO_WriteError, &r1_bio->state))
+			reschedule_retry(r1_bio);
+		else {
+			put_buf(r1_bio);
+			md_done_sync(mddev, s, uptodate);
+		}
+	}
+}
+
 static void end_sync_write(struct bio *bio)
 {
 	int uptodate = !bio->bi_status;
@@ -1932,16 +1948,7 @@ static void end_sync_write(struct bio *bio)
 		)
 		set_bit(R1BIO_MadeGood, &r1_bio->state);
 
-	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		int s = r1_bio->sectors;
-		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-		    test_bit(R1BIO_WriteError, &r1_bio->state))
-			reschedule_retry(r1_bio);
-		else {
-			put_buf(r1_bio);
-			md_done_sync(mddev, s, uptodate);
-		}
-	}
+	put_sync_write_buf(r1_bio, uptodate);
 }
 
 static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -2224,17 +2231,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 		generic_make_request(wbio);
 	}
 
-	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		/* if we're here, all write(s) have completed, so clean up */
-		int s = r1_bio->sectors;
-		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-		    test_bit(R1BIO_WriteError, &r1_bio->state))
-			reschedule_retry(r1_bio);
-		else {
-			put_buf(r1_bio);
-			md_done_sync(mddev, s, 1);
-		}
-	}
+	put_sync_write_buf(r1_bio, 1);
 }
 
 /*
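The helper factored out above follows a "last completion cleans up" pattern:
several sync writes share one r1bio, and whichever caller drops the final
reference performs the cleanup. A self-contained userspace model of that
pattern, using C11 atomics as a stand-in for the kernel's
atomic_dec_and_test(); the struct and values here are illustrative only.

#include <stdatomic.h>
#include <stdio.h>

struct r1bio_model {
	atomic_int remaining;   /* sync writes still in flight */
	int sectors;
};

static void put_sync_write_buf(struct r1bio_model *r1_bio, int uptodate)
{
	/* atomic_fetch_sub() returns the old value; 1 means we were the last user. */
	if (atomic_fetch_sub(&r1_bio->remaining, 1) == 1)
		printf("last completion: done %d sectors, uptodate=%d\n",
		       r1_bio->sectors, uptodate);
}

int main(void)
{
	struct r1bio_model r1 = { .remaining = 2, .sectors = 8 };

	put_sync_write_buf(&r1, 1);  /* first completion: nothing happens yet */
	put_sync_write_buf(&r1, 1);  /* second (last) completion: cleanup runs */
	return 0;
}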