From 480523feae581ab714ba6610388a3b4619a2f695 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 20 Aug 2019 10:21:09 +1000 Subject: [PATCH 1/3] md: only call set_in_sync() when it is expected to succeed. Since commit 4ad23a976413 ("MD: use per-cpu counter for writes_pending"), set_in_sync() is substantially more expensive: it can wait for a full RCU grace period which can be 10s of milliseconds. So we should only call it when the cost is justified. md_check_recovery() currently calls set_in_sync() every time it finds anything to do (on non-external active arrays). For an array performing resync or recovery, this will be quite often. Each call will introduce a delay to the md thread, which can noticeable affect IO submission latency. In md_check_recovery() we only need to call set_in_sync() if 'safemode' was non-zero at entry, meaning that there has been not recent IO. So we save this "safemode was nonzero" state, and only call set_in_sync() if it was non-zero. This measurably reduces mean and maximum IO submission latency during resync/recovery. Reported-and-tested-by: Jack Wang Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending") Cc: stable@vger.kernel.org (v4.12+) Signed-off-by: NeilBrown Signed-off-by: Song Liu --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index daa885ee4d60..21efb0b949a6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8932,6 +8932,7 @@ void md_check_recovery(struct mddev *mddev) if (mddev_trylock(mddev)) { int spares = 0; + bool try_set_sync = mddev->safemode != 0; if (!mddev->external && mddev->safemode == 1) mddev->safemode = 0; @@ -8977,7 +8978,7 @@ void md_check_recovery(struct mddev *mddev) } } - if (!mddev->external && !mddev->in_sync) { + if (try_set_sync && !mddev->external && !mddev->in_sync) { spin_lock(&mddev->lock); set_in_sync(mddev); spin_unlock(&mddev->lock); From 9d4b45d6af442237560d0bb5502a012baa5234b7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 20 Aug 2019 10:21:09 +1000 Subject: [PATCH 2/3] md: don't report active array_state until after revalidate_disk() completes. Until revalidate_disk() has completed, the size of a new md array will appear to be zero. So we shouldn't report, through array_state, that the array is active until that time. udev rules check array_state to see if the array is ready. As soon as it appear to be zero, fsck can be run. If it find the size to be zero, it will fail. So add a new flag to provide an interlock between do_md_run() and array_state_show(). This flag is set while do_md_run() is active and it prevents array_state_show() from reporting that the array is active. Before do_md_run() is called, ->pers will be NULL so array is definitely not active. After do_md_run() is called, revalidate_disk() will have run and the array will be completely ready. We also move various sysfs_notify*() calls out of md_run() into do_md_run() after MD_NOT_READY is cleared. This ensure the information is ready before the notification is sent. Prior to v4.12, array_state_show() was called with the mddev->reconfig_mutex held, which provided exclusion with do_md_run(). Note that MD_NOT_READY cleared twice. This is deliberate to cover both success and error paths with minimal noise. Fixes: b7b17c9b67e5 ("md: remove mddev_lock() from md_attr_show()") Cc: stable@vger.kernel.org (v4.12++) Signed-off-by: NeilBrown Signed-off-by: Song Liu --- drivers/md/md.c | 11 +++++++---- drivers/md/md.h | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 21efb0b949a6..b46bb143e3c5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4179,7 +4179,7 @@ array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; - if (mddev->pers) + if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) switch(mddev->ro) { case 1: st = readonly; @@ -5776,9 +5776,6 @@ int md_run(struct mddev *mddev) md_update_sb(mddev, 0); md_new_event(mddev); - sysfs_notify_dirent_safe(mddev->sysfs_state); - sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; bitmap_abort: @@ -5799,6 +5796,7 @@ static int do_md_run(struct mddev *mddev) { int err; + set_bit(MD_NOT_READY, &mddev->flags); err = md_run(mddev); if (err) goto out; @@ -5819,9 +5817,14 @@ static int do_md_run(struct mddev *mddev) set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); + clear_bit(MD_NOT_READY, &mddev->flags); mddev->changed = 1; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); + sysfs_notify(&mddev->kobj, NULL, "degraded"); out: + clear_bit(MD_NOT_READY, &mddev->flags); return err; } diff --git a/drivers/md/md.h b/drivers/md/md.h index b742659150a2..1edcd967eb8e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -248,6 +248,9 @@ enum mddev_flags { MD_UPDATING_SB, /* md_check_recovery is updating the metadata * without explicitly holding reconfig_mutex. */ + MD_NOT_READY, /* do_md_run() is active, so 'array_state' + * must not report that array is ready yet + */ }; enum mddev_sb_flags { From 0009fad033370802de75e4cedab54f4d86450e22 Mon Sep 17 00:00:00 2001 From: Nigel Croxon Date: Wed, 21 Aug 2019 09:27:08 -0400 Subject: [PATCH 3/3] raid5 improve too many read errors msg by adding limits Often limits can be changed by admin. When discussing such things it helps if you can provide "self-sustained" facts. Also sometimes the admin thinks he changed a limit, but it did not take effect for some reason or he changed the wrong thing. V3: Only pr_warn when Faulty is 0. V2: Add read_errors value to pr_warn. Signed-off-by: Nigel Croxon Signed-off-by: Song Liu --- drivers/md/raid5.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 59cafafd5a5d..88e56ee98976 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2549,10 +2549,16 @@ static void raid5_end_read_request(struct bio * bi) (unsigned long long)s, bdn); } else if (atomic_read(&rdev->read_errors) - > conf->max_nr_stripes) - pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", - mdname(conf->mddev), bdn); - else + > conf->max_nr_stripes) { + if (!test_bit(Faulty, &rdev->flags)) { + pr_warn("md/raid:%s: %d read_errors > %d stripes\n", + mdname(conf->mddev), + atomic_read(&rdev->read_errors), + conf->max_nr_stripes); + pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", + mdname(conf->mddev), bdn); + } + } else retry = 1; if (set_bad && test_bit(In_sync, &rdev->flags) && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))