From 480523feae581ab714ba6610388a3b4619a2f695 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 20 Aug 2019 10:21:09 +1000
Subject: [PATCH 1/3] md: only call set_in_sync() when it is expected to
 succeed.

Since commit 4ad23a976413 ("MD: use per-cpu counter for
writes_pending"), set_in_sync() is substantially more expensive: it
can wait for a full RCU grace period which can be 10s of milliseconds.

So we should only call it when the cost is justified.

md_check_recovery() currently calls set_in_sync() every time it finds
anything to do (on non-external active arrays).  For an array
performing resync or recovery, this will be quite often.
Each call will introduce a delay to the md thread, which can noticeable
affect IO submission latency.

In md_check_recovery() we only need to call set_in_sync() if
'safemode' was non-zero at entry, meaning that there has been not
recent IO.  So we save this "safemode was nonzero" state, and only
call set_in_sync() if it was non-zero.

This measurably reduces mean and maximum IO submission latency during
resync/recovery.

Reported-and-tested-by: Jack Wang <jinpu.wang@cloud.ionos.com>
Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending")
Cc: stable@vger.kernel.org (v4.12+)
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index daa885ee4d60..21efb0b949a6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8932,6 +8932,7 @@ void md_check_recovery(struct mddev *mddev)
 
 	if (mddev_trylock(mddev)) {
 		int spares = 0;
+		bool try_set_sync = mddev->safemode != 0;
 
 		if (!mddev->external && mddev->safemode == 1)
 			mddev->safemode = 0;
@@ -8977,7 +8978,7 @@ void md_check_recovery(struct mddev *mddev)
 			}
 		}
 
-		if (!mddev->external && !mddev->in_sync) {
+		if (try_set_sync && !mddev->external && !mddev->in_sync) {
 			spin_lock(&mddev->lock);
 			set_in_sync(mddev);
 			spin_unlock(&mddev->lock);

From 9d4b45d6af442237560d0bb5502a012baa5234b7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 20 Aug 2019 10:21:09 +1000
Subject: [PATCH 2/3] md: don't report active array_state until after
 revalidate_disk() completes.

Until revalidate_disk() has completed, the size of a new md array will
appear to be zero.
So we shouldn't report, through array_state, that the array is active
until that time.
udev rules check array_state to see if the array is ready.  As soon as
it appear to be zero, fsck can be run.  If it find the size to be
zero, it will fail.

So add a new flag to provide an interlock between do_md_run() and
array_state_show().  This flag is set while do_md_run() is active and
it prevents array_state_show() from reporting that the array is
active.

Before do_md_run() is called, ->pers will be NULL so array is
definitely not active.
After do_md_run() is called, revalidate_disk() will have run and the
array will be completely ready.

We also move various sysfs_notify*() calls out of md_run() into
do_md_run() after MD_NOT_READY is cleared.  This ensure the
information is ready before the notification is sent.

Prior to v4.12, array_state_show() was called with the
mddev->reconfig_mutex held, which provided exclusion with do_md_run().

Note that MD_NOT_READY cleared twice.  This is deliberate to cover
both success and error paths with minimal noise.

Fixes: b7b17c9b67e5 ("md: remove mddev_lock() from md_attr_show()")
Cc: stable@vger.kernel.org (v4.12++)
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c | 11 +++++++----
 drivers/md/md.h |  3 +++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 21efb0b949a6..b46bb143e3c5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4179,7 +4179,7 @@ array_state_show(struct mddev *mddev, char *page)
 {
 	enum array_state st = inactive;
 
-	if (mddev->pers)
+	if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags))
 		switch(mddev->ro) {
 		case 1:
 			st = readonly;
@@ -5776,9 +5776,6 @@ int md_run(struct mddev *mddev)
 		md_update_sb(mddev, 0);
 
 	md_new_event(mddev);
-	sysfs_notify_dirent_safe(mddev->sysfs_state);
-	sysfs_notify_dirent_safe(mddev->sysfs_action);
-	sysfs_notify(&mddev->kobj, NULL, "degraded");
 	return 0;
 
 bitmap_abort:
@@ -5799,6 +5796,7 @@ static int do_md_run(struct mddev *mddev)
 {
 	int err;
 
+	set_bit(MD_NOT_READY, &mddev->flags);
 	err = md_run(mddev);
 	if (err)
 		goto out;
@@ -5819,9 +5817,14 @@ static int do_md_run(struct mddev *mddev)
 
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
+	clear_bit(MD_NOT_READY, &mddev->flags);
 	mddev->changed = 1;
 	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+	sysfs_notify_dirent_safe(mddev->sysfs_state);
+	sysfs_notify_dirent_safe(mddev->sysfs_action);
+	sysfs_notify(&mddev->kobj, NULL, "degraded");
 out:
+	clear_bit(MD_NOT_READY, &mddev->flags);
 	return err;
 }
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b742659150a2..1edcd967eb8e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -248,6 +248,9 @@ enum mddev_flags {
 	MD_UPDATING_SB,		/* md_check_recovery is updating the metadata
 				 * without explicitly holding reconfig_mutex.
 				 */
+	MD_NOT_READY,		/* do_md_run() is active, so 'array_state'
+				 * must not report that array is ready yet
+				 */
 };
 
 enum mddev_sb_flags {

From 0009fad033370802de75e4cedab54f4d86450e22 Mon Sep 17 00:00:00 2001
From: Nigel Croxon <ncroxon@redhat.com>
Date: Wed, 21 Aug 2019 09:27:08 -0400
Subject: [PATCH 3/3] raid5 improve too many read errors msg by adding limits

Often limits can be changed by admin. When discussing such things
it helps if you can provide "self-sustained" facts. Also
sometimes the admin thinks he changed a limit, but it did not
take effect for some reason or he changed the wrong thing.

V3: Only pr_warn when Faulty is 0.
V2: Add read_errors value to pr_warn.

Signed-off-by: Nigel Croxon <ncroxon@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59cafafd5a5d..88e56ee98976 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2549,10 +2549,16 @@ static void raid5_end_read_request(struct bio * bi)
 				(unsigned long long)s,
 				bdn);
 		} else if (atomic_read(&rdev->read_errors)
-			 > conf->max_nr_stripes)
-			pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
-			       mdname(conf->mddev), bdn);
-		else
+			 > conf->max_nr_stripes) {
+			if (!test_bit(Faulty, &rdev->flags)) {
+				pr_warn("md/raid:%s: %d read_errors > %d stripes\n",
+				    mdname(conf->mddev),
+				    atomic_read(&rdev->read_errors),
+				    conf->max_nr_stripes);
+				pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
+				    mdname(conf->mddev), bdn);
+			}
+		} else
 			retry = 1;
 		if (set_bad && test_bit(In_sync, &rdev->flags)
 		    && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))