From 0ba959774e93911caff596de6391f085fb640ac4 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 1 Mar 2017 17:30:29 +0800 Subject: [PATCH 01/84] md-cluster: use sync way to handle METADATA_UPDATED msg Previously, when a node received a METADATA_UPDATED msg, it only needed to wake up mddev->thread, and md_reload_sb would be called eventually. We took the asynchronous way to avoid a deadlock issue; the deadlock could happen when one node is receiving the METADATA_UPDATED msg (wants reconfig_mutex) and trying to run the path: md_check_recovery -> mddev_trylock (hold reconfig_mutex) -> md_update_sb -> metadata_update_start (wants EX on token, however the token is held by the sending node) Since we will support resizing for clustered raid, and we need the metadata update handling to be synchronous so that the initiating node can detect failure, we need to change the way the METADATA_UPDATED msg is handled. But we obviously need to avoid the above deadlock with the sync way. To make this happen, we considered not holding reconfig_mutex to call md_reload_sb: if some other thread has already taken reconfig_mutex and is waiting for the 'token', then process_recvd_msg() can safely call md_reload_sb() without taking the mutex. This is because we can be certain that no other thread will take the mutex, and we are also certain that the actions performed by md_reload_sb() won't interfere with anything that the other thread is in the middle of. To make this more concrete, we added a new cinfo->state bit MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD which is set in lock_token() just before dlm_lock_sync() is called, and cleared just after. As lock_token() is always called with reconfig_mutex held (the specific case is resync_info_update, which is distinguished well in a previous patch), if process_recvd_msg() finds that the new bit is set, then the mutex must be held by some other thread, and that thread will keep waiting for the token. So process_metadata_update() can call md_reload_sb() if either mddev_trylock() succeeds, or if MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is set. The tricky bit is what to do if neither of these applies: we need to wait. Fortunately mddev_unlock() always calls wake_up() on mddev->thread->wqueue, so we can get lock_token() to call wake_up() on that when it sets the bit. There are also some related changes inside this commit: 1. remove the RELOAD_SB related code since it is not valid anymore. 2. mddev is added into md_cluster_info so we can get mddev inside lock_token. 3. add a new parameter for lock_token to distinguish whether reconfig_mutex is held or not. And we need to set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD in the cases below: 1. set it before unregistering the thread, otherwise a deadlock could appear when stopping a resyncing array. This is because md_unregister_thread(&cinfo->recv_thread) is blocked by recv_daemon -> process_recvd_msg -> process_metadata_update. To resolve the issue, MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD also needs to be set before unregistering the thread. 2. set it in metadata_update_start to fix another deadlock: a. Node A sends a METADATA_UPDATED msg (holding the Token lock). b. Node B wants to do resync, and is blocked since it can't get the Token lock, but MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD is not set since the call chain (md_do_sync -> sync_request -> resync_info_update -> sendmsg -> lock_comm -> lock_token) doesn't hold reconfig_mutex. c. Node B tries to update the sb (holding reconfig_mutex), but is stopped at wait_event() in metadata_update_start since we have set the MD_CLUSTER_SEND_LOCK flag in lock_comm (step b). d. 
Then Node B receives METADATA_UPDATED msg from A, of course recv_daemon is blocked forever. Since metadata_update_start always calls lock_token with reconfig_mutex, we need to set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD here as well, and lock_token don't need to set it twice unless lock_token is invoked from lock_comm. Finally, thanks to Neil for his great idea and help! Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 82 +++++++++++++++++++++++++++++++++-------- drivers/md/md.c | 4 -- drivers/md/md.h | 3 -- 3 files changed, 66 insertions(+), 23 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 321ecac23027..5cf0a9d29bf0 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -67,9 +67,10 @@ struct resync_info { * set up all the related infos such as bitmap and personality */ #define MD_CLUSTER_ALREADY_IN_CLUSTER 6 #define MD_CLUSTER_PENDING_RECV_EVENT 7 - +#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8 struct md_cluster_info { + struct mddev *mddev; /* the md device which md_cluster_info belongs to */ /* dlm lock space and resources for clustered raid. */ dlm_lockspace_t *lockspace; int slot_number; @@ -523,11 +524,17 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { + int got_lock = 0; struct md_cluster_info *cinfo = mddev->cluster_info; mddev->good_device_nr = le32_to_cpu(msg->raid_slot); - set_bit(MD_RELOAD_SB, &mddev->flags); + dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); - md_wakeup_thread(mddev->thread); + wait_event(mddev->thread->wqueue, + (got_lock = mddev_trylock(mddev)) || + test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); + md_reload_sb(mddev, mddev->good_device_nr); + if (got_lock) + mddev_unlock(mddev); } static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) @@ -646,11 +653,29 @@ out: * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. */ -static int lock_token(struct md_cluster_info *cinfo) +static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) { - int error; + int error, set_bit = 0; + struct mddev *mddev = cinfo->mddev; + /* + * If resync thread run after raid1d thread, then process_metadata_update + * could not continue if raid1d held reconfig_mutex (and raid1d is blocked + * since another node already got EX on Token and waitting the EX of Ack), + * so let resync wake up thread in case flag is set. + */ + if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + &cinfo->state)) { + error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + &cinfo->state); + WARN_ON_ONCE(error); + md_wakeup_thread(mddev->thread); + set_bit = 1; + } error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (set_bit) + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + if (error) pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", __func__, __LINE__, error); @@ -663,12 +688,12 @@ static int lock_token(struct md_cluster_info *cinfo) /* lock_comm() * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. 
*/ -static int lock_comm(struct md_cluster_info *cinfo) +static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) { wait_event(cinfo->wait, !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); - return lock_token(cinfo); + return lock_token(cinfo, mddev_locked); } static void unlock_comm(struct md_cluster_info *cinfo) @@ -743,11 +768,12 @@ failed_message: return error; } -static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) +static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg, + bool mddev_locked) { int ret; - lock_comm(cinfo); + lock_comm(cinfo, mddev_locked); ret = __sendmsg(cinfo, cmsg); unlock_comm(cinfo); return ret; @@ -834,6 +860,7 @@ static int join(struct mddev *mddev, int nodes) mutex_init(&cinfo->recv_mutex); mddev->cluster_info = cinfo; + cinfo->mddev = mddev; memset(str, 0, 64); sprintf(str, "%pU", mddev->uuid); @@ -908,6 +935,7 @@ static int join(struct mddev *mddev, int nodes) return 0; err: + set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); md_unregister_thread(&cinfo->recovery_thread); md_unregister_thread(&cinfo->recv_thread); lockres_free(cinfo->message_lockres); @@ -943,7 +971,7 @@ static void resync_bitmap(struct mddev *mddev) int err; cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); - err = sendmsg(cinfo, &cmsg); + err = sendmsg(cinfo, &cmsg, 1); if (err) pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n", __func__, __LINE__, err); @@ -963,6 +991,7 @@ static int leave(struct mddev *mddev) if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) resync_bitmap(mddev); + set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); md_unregister_thread(&cinfo->recovery_thread); md_unregister_thread(&cinfo->recv_thread); lockres_free(cinfo->message_lockres); @@ -997,16 +1026,30 @@ static int slot_number(struct mddev *mddev) static int metadata_update_start(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + int ret; + + /* + * metadata_update_start is always called with the protection of + * reconfig_mutex, so set WAITING_FOR_TOKEN here. 
+ */ + ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + &cinfo->state); + WARN_ON_ONCE(ret); + md_wakeup_thread(mddev->thread); wait_event(cinfo->wait, !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) || test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state)); /* If token is already locked, return 0 */ - if (cinfo->token_lockres->mode == DLM_LOCK_EX) + if (cinfo->token_lockres->mode == DLM_LOCK_EX) { + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); return 0; + } - return lock_token(cinfo); + ret = lock_token(cinfo, 1); + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + return ret; } static int metadata_update_finish(struct mddev *mddev) @@ -1069,7 +1112,14 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) cmsg.low = cpu_to_le64(lo); cmsg.high = cpu_to_le64(hi); - return sendmsg(cinfo, &cmsg); + /* + * mddev_lock is held if resync_info_update is called from + * resync_finish (md_reap_sync_thread -> resync_finish) + */ + if (lo == 0 && hi == 0) + return sendmsg(cinfo, &cmsg, 1); + else + return sendmsg(cinfo, &cmsg, 0); } static int resync_finish(struct mddev *mddev) @@ -1119,7 +1169,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - lock_comm(cinfo); + lock_comm(cinfo, 1); ret = __sendmsg(cinfo, &cmsg); if (ret) return ret; @@ -1179,7 +1229,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct md_cluster_info *cinfo = mddev->cluster_info; cmsg.type = cpu_to_le32(REMOVE); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - return sendmsg(cinfo, &cmsg); + return sendmsg(cinfo, &cmsg, 1); } static int lock_all_bitmaps(struct mddev *mddev) @@ -1243,7 +1293,7 @@ static int gather_bitmaps(struct md_rdev *rdev) cmsg.type = cpu_to_le32(RE_ADD); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - err = sendmsg(cinfo, &cmsg); + err = sendmsg(cinfo, &cmsg, 1); if (err) goto out; diff --git a/drivers/md/md.c b/drivers/md/md.c index f6ae1d67bcd0..83f325077dc8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8385,7 +8385,6 @@ void md_check_recovery(struct mddev *mddev) (mddev->sb_flags & ~ (1<recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || - test_bit(MD_RELOAD_SB, &mddev->flags) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) @@ -8434,9 +8433,6 @@ void md_check_recovery(struct mddev *mddev) rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); } - - if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags)) - md_reload_sb(mddev, mddev->good_device_nr); } if (!mddev->external) { diff --git a/drivers/md/md.h b/drivers/md/md.h index dde8ecb760c8..1c00160b09f9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -219,9 +219,6 @@ enum mddev_flags { * it then */ MD_JOURNAL_CLEAN, /* A raid with journal is already clean */ MD_HAS_JOURNAL, /* The raid array has journal feature set */ - MD_RELOAD_SB, /* Reload the superblock because another node - * updated it. 
- */ MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node * already took resync lock, need to * release the lock */ From 7da3d203330305695f38302c076ee0d32c005ede Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 1 Mar 2017 16:42:38 +0800 Subject: [PATCH 02/84] md-cluster: add CHANGE_CAPACITY message type The msg type CHANGE_CAPACITY is introduced to support resizing clustered raid in a later patch. It is sent after all the nodes have the same sync_size; a receiver node just needs to set the new capacity once it receives this msg. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 5cf0a9d29bf0..8b7d55bf5aa2 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -104,6 +104,7 @@ enum msg_type { REMOVE, RE_ADD, BITMAP_NEEDS_SYNC, + CHANGE_CAPACITY, }; struct cluster_msg { @@ -579,6 +580,10 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) case METADATA_UPDATED: process_metadata_update(mddev, msg); break; + case CHANGE_CAPACITY: + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + break; case RESYNCING: process_suspend_info(mddev, le32_to_cpu(msg->slot), le64_to_cpu(msg->low), From b98938d16a108d0201d54b8db1337dac71aebfdd Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 1 Mar 2017 16:42:39 +0800 Subject: [PATCH 03/84] md-cluster: introduce cluster_check_sync_size Supporting resize is a little complex for clustered raid, since we need to ensure all the nodes share the same knowledge about the size of the raid. We achieve the goal by checking the sync_size which is in each node's bitmap; we can only change the capacity after cluster_check_sync_size returns 0. Also, get_bitmap_from_slot is added to get a slot's bitmap, and we exported some funcs since they are used in cluster_check_sync_size(). We can also reuse get_bitmap_from_slot to remove the redundant code in bitmap_copy_from_slot. 
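The check described above boils down to reading every slot's sync_size and comparing it with the local one. Below is a minimal userspace sketch of that loop; the sync_size[] array is a hypothetical stand-in for reading sb->sync_size out of each slot's bitmap superblock, and the per-slot DLM locking and bitmap_update_sb() side effects of the real cluster_check_sync_size() are omitted.

#include <stdio.h>

/*
 * Return 0 only if every remote slot reports the same sync_size as the
 * local slot, mirroring the comparison loop in cluster_check_sync_size().
 */
static int check_sync_size(const unsigned long *sync_size, int node_num,
			   int current_slot)
{
	unsigned long my_sync_size = sync_size[current_slot];
	unsigned long remote = 0;
	int i;

	for (i = 0; i < node_num; i++) {
		if (i == current_slot)
			continue;
		if (remote == 0)
			remote = sync_size[i];	/* first remote value seen */
		else if (remote != sync_size[i])
			return -1;		/* remote nodes disagree */
	}
	return (my_sync_size == remote) ? 0 : -1;
}

int main(void)
{
	unsigned long agree[]  = { 2097152, 2097152, 2097152 };
	unsigned long differ[] = { 2097152, 2097152, 1048576 };

	printf("all agree: %d\n", check_sync_size(agree, 3, 0));   /* 0 */
	printf("one lags:  %d\n", check_sync_size(differ, 3, 0));  /* -1 */
	return 0;
}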
Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 41 +++++++++++++++++++++------- drivers/md/bitmap.h | 2 ++ drivers/md/md-cluster.c | 60 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 10 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 9fb2ccac958a..b6fa55a3cff8 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -471,6 +471,7 @@ void bitmap_update_sb(struct bitmap *bitmap) kunmap_atomic(sb); write_page(bitmap, bitmap->storage.sb_page, 1); } +EXPORT_SYMBOL(bitmap_update_sb); /* print out the bitmap file superblock */ void bitmap_print_sb(struct bitmap *bitmap) @@ -1727,7 +1728,7 @@ void bitmap_flush(struct mddev *mddev) /* * free memory that was allocated */ -static void bitmap_free(struct bitmap *bitmap) +void bitmap_free(struct bitmap *bitmap) { unsigned long k, pages; struct bitmap_page *bp; @@ -1761,6 +1762,7 @@ static void bitmap_free(struct bitmap *bitmap) kfree(bp); kfree(bitmap); } +EXPORT_SYMBOL(bitmap_free); void bitmap_destroy(struct mddev *mddev) { @@ -1920,6 +1922,27 @@ out: } EXPORT_SYMBOL_GPL(bitmap_load); +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) +{ + int rv = 0; + struct bitmap *bitmap; + + bitmap = bitmap_create(mddev, slot); + if (IS_ERR(bitmap)) { + rv = PTR_ERR(bitmap); + return ERR_PTR(rv); + } + + rv = bitmap_init_from_disk(bitmap, 0); + if (rv) { + bitmap_free(bitmap); + return ERR_PTR(rv); + } + + return bitmap; +} +EXPORT_SYMBOL(get_bitmap_from_slot); + /* Loads the bitmap associated with slot and copies the resync information * to our bitmap */ @@ -1929,14 +1952,13 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, int rv = 0, i, j; sector_t block, lo = 0, hi = 0; struct bitmap_counts *counts; - struct bitmap *bitmap = bitmap_create(mddev, slot); + struct bitmap *bitmap; - if (IS_ERR(bitmap)) - return PTR_ERR(bitmap); - - rv = bitmap_init_from_disk(bitmap, 0); - if (rv) - goto err; + bitmap = get_bitmap_from_slot(mddev, slot); + if (IS_ERR(bitmap)) { + pr_err("%s can't get bitmap from slot %d\n", __func__, slot); + return -1; + } counts = &bitmap->counts; for (j = 0; j < counts->chunks; j++) { @@ -1963,8 +1985,7 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, bitmap_unplug(mddev->bitmap); *low = lo; *high = hi; -err: - bitmap_free(bitmap); + return rv; } EXPORT_SYMBOL_GPL(bitmap_copy_from_slot); diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 5b6dd63dda91..9f761097aab2 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -267,8 +267,10 @@ void bitmap_daemon_work(struct mddev *mddev); int bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, int init); +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); +void bitmap_free(struct bitmap *bitmap); #endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 8b7d55bf5aa2..92c3c51ede4d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1091,6 +1091,66 @@ static void metadata_update_cancel(struct mddev *mddev) unlock_comm(cinfo); } +/* + * return 0 if all the bitmaps have the same sync_size + */ +int cluster_check_sync_size(struct mddev *mddev) +{ + int i, rv; + bitmap_super_t *sb; + unsigned long my_sync_size, sync_size = 0; + int node_num = mddev->bitmap_info.nodes; + int current_slot = md_cluster_ops->slot_number(mddev); + struct bitmap *bitmap = 
mddev->bitmap; + char str[64]; + struct dlm_lock_resource *bm_lockres; + + sb = kmap_atomic(bitmap->storage.sb_page); + my_sync_size = sb->sync_size; + kunmap_atomic(sb); + + for (i = 0; i < node_num; i++) { + if (i == current_slot) + continue; + + bitmap = get_bitmap_from_slot(mddev, i); + if (IS_ERR(bitmap)) { + pr_err("can't get bitmap from slot %d\n", i); + return -1; + } + + /* + * If we can hold the bitmap lock of one node then + * the slot is not occupied, update the sb. + */ + snprintf(str, 64, "bitmap%04d", i); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) { + pr_err("md-cluster: Cannot initialize %s\n", str); + lockres_free(bm_lockres); + return -1; + } + bm_lockres->flags |= DLM_LKF_NOQUEUE; + rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + if (!rv) + bitmap_update_sb(bitmap); + lockres_free(bm_lockres); + + sb = kmap_atomic(bitmap->storage.sb_page); + if (sync_size == 0) + sync_size = sb->sync_size; + else if (sync_size != sb->sync_size) { + kunmap_atomic(sb); + bitmap_free(bitmap); + return -1; + } + kunmap_atomic(sb); + bitmap_free(bitmap); + } + + return (my_sync_size == sync_size) ? 0 : -1; +} + static int resync_start(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; From 818da59f97d6764edb91fa9d1f63d333d54eb999 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 1 Mar 2017 16:42:40 +0800 Subject: [PATCH 04/84] md-cluster: add the support for resize To update size for cluster raid, we need to make sure all nodes can perform the change successfully. However, it is possible that some of them can't do it due to failure (bitmap_resize could fail). So we need to consider the issue before we set the capacity unconditionally, and we use below steps to perform sanity check. 1. A change the size, then broadcast METADATA_UPDATED msg. 2. B and C receive METADATA_UPDATED change the size excepts call set_capacity, sync_size is not update if the change failed. Also call bitmap_update_sb to sync sb to disk. 3. A checks other node's sync_size, if sync_size has been updated in all nodes, then send CHANGE_CAPACITY msg otherwise send msg to revert previous change. 4. B and C call set_capacity if receive CHANGE_CAPACITY msg, otherwise pers->resize will be called to restore the old value. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- Documentation/md/md-cluster.txt | 2 +- drivers/md/md-cluster.c | 76 +++++++++++++++++++++++++++++++++ drivers/md/md-cluster.h | 1 + drivers/md/md.c | 21 ++++++--- 4 files changed, 94 insertions(+), 6 deletions(-) diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt index 38883276d31c..2663d49dd8a0 100644 --- a/Documentation/md/md-cluster.txt +++ b/Documentation/md/md-cluster.txt @@ -321,4 +321,4 @@ The algorithm is: There are somethings which are not supported by cluster MD yet. -- update size and change array_sectors. +- change array_sectors. diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 92c3c51ede4d..b21ef58819f6 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1151,6 +1151,81 @@ int cluster_check_sync_size(struct mddev *mddev) return (my_sync_size == sync_size) ? 0 : -1; } +/* + * Update the size for cluster raid is a little more complex, we perform it + * by the steps: + * 1. hold token lock and update superblock in initiator node. + * 2. send METADATA_UPDATED msg to other nodes. + * 3. 
The initiator node continues to check each bitmap's sync_size, if all + * bitmaps have the same value of sync_size, then we can set capacity and + * let other nodes to perform it. If one node can't update sync_size + * accordingly, we need to revert to previous value. + */ +static void update_size(struct mddev *mddev, sector_t old_dev_sectors) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg; + struct md_rdev *rdev; + int ret = 0; + int raid_slot = -1; + + md_update_sb(mddev, 1); + lock_comm(cinfo, 1); + + memset(&cmsg, 0, sizeof(cmsg)); + cmsg.type = cpu_to_le32(METADATA_UPDATED); + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { + raid_slot = rdev->desc_nr; + break; + } + if (raid_slot >= 0) { + cmsg.raid_slot = cpu_to_le32(raid_slot); + /* + * We can only change capiticy after all the nodes can do it, + * so need to wait after other nodes already received the msg + * and handled the change + */ + ret = __sendmsg(cinfo, &cmsg); + if (ret) { + pr_err("%s:%d: failed to send METADATA_UPDATED msg\n", + __func__, __LINE__); + unlock_comm(cinfo); + return; + } + } else { + pr_err("md-cluster: No good device id found to send\n"); + unlock_comm(cinfo); + return; + } + + /* + * check the sync_size from other node's bitmap, if sync_size + * have already updated in other nodes as expected, send an + * empty metadata msg to permit the change of capacity + */ + if (cluster_check_sync_size(mddev) == 0) { + memset(&cmsg, 0, sizeof(cmsg)); + cmsg.type = cpu_to_le32(CHANGE_CAPACITY); + ret = __sendmsg(cinfo, &cmsg); + if (ret) + pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n", + __func__, __LINE__); + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } else { + /* revert to previous sectors */ + ret = mddev->pers->resize(mddev, old_dev_sectors); + if (!ret) + revalidate_disk(mddev->gendisk); + ret = __sendmsg(cinfo, &cmsg); + if (ret) + pr_err("%s:%d: failed to send METADATA_UPDATED msg\n", + __func__, __LINE__); + } + unlock_comm(cinfo); +} + static int resync_start(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -1396,6 +1471,7 @@ static struct md_cluster_operations cluster_ops = { .gather_bitmaps = gather_bitmaps, .lock_all_bitmaps = lock_all_bitmaps, .unlock_all_bitmaps = unlock_all_bitmaps, + .update_size = update_size, }; static int __init cluster_init(void) diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index e765499ba591..274016177983 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -27,6 +27,7 @@ struct md_cluster_operations { int (*gather_bitmaps)(struct md_rdev *rdev); int (*lock_all_bitmaps)(struct mddev *mddev); void (*unlock_all_bitmaps)(struct mddev *mddev); + void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors); }; #endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 83f325077dc8..72ef3f18ac9a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6493,10 +6493,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) struct md_rdev *rdev; int rv; int fit = (num_sectors == 0); - - /* cluster raid doesn't support update size */ - if (mddev_is_clustered(mddev)) - return -EINVAL; + sector_t old_dev_sectors = mddev->dev_sectors; if (mddev->pers->resize == NULL) return -EINVAL; @@ -6525,7 +6522,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) } rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { - if (mddev->queue) { + if 
(mddev_is_clustered(mddev)) + md_cluster_ops->update_size(mddev, old_dev_sectors); + else if (mddev->queue) { set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); } @@ -8743,6 +8742,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) int role, ret; char b[BDEVNAME_SIZE]; + /* + * If size is changed in another node then we need to + * do resize as well. + */ + if (mddev->dev_sectors != le64_to_cpu(sb->size)) { + ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); + if (ret) + pr_info("md-cluster: resize failed\n"); + else + bitmap_update_sb(mddev->bitmap); + } + /* Check for change of roles in the active devices */ rdev_for_each(rdev2, mddev) { if (test_bit(Faulty, &rdev2->flags)) From 535ae4eb1225f19e1d1848c65eafea8b7e9112f4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 15 Feb 2017 19:37:32 -0800 Subject: [PATCH 05/84] md/raid5: prioritize stripes for writeback In raid5-cache writeback mode, we have two types of stripes to handle. - stripes which aren't cached yet - stripes which are cached and flushing out to raid disks Upperlayer is more sensistive to latency of the first type of stripes generally. But we only one handle list for all these stripes, where the two types of stripes are mixed together. When reclaim flushes a lot of stripes, the first type of stripes could be noticeably delayed. On the other hand, if the log space is tight, we'd like to handle the second type of stripes faster and free log space. This patch destinguishes the two types stripes. They are added into different handle list. When we try to get a stripe to handl, we prefer the first type of stripes unless log space is tight. This should have no impact for !writeback case. Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 48 +++++++++++++++++++++++++++++++++++++--------- drivers/md/raid5.h | 2 ++ 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ed5cd705b985..5a28bd9b5b5f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -176,6 +176,13 @@ static int stripe_operations_active(struct stripe_head *sh) test_bit(STRIPE_COMPUTE_RUN, &sh->state); } +static bool stripe_is_lowprio(struct stripe_head *sh) +{ + return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || + test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && + !test_bit(STRIPE_R5C_CACHING, &sh->state); +} + static void raid5_wakeup_stripe_thread(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; @@ -191,7 +198,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) if (list_empty(&sh->lru)) { struct r5worker_group *group; group = conf->worker_groups + cpu_to_group(cpu); - list_add_tail(&sh->lru, &group->handle_list); + if (stripe_is_lowprio(sh)) + list_add_tail(&sh->lru, &group->loprio_list); + else + list_add_tail(&sh->lru, &group->handle_list); group->stripes_cnt++; sh->group = group; } @@ -254,7 +264,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_BIT_DELAY, &sh->state); if (conf->worker_cnt_per_group == 0) { - list_add_tail(&sh->lru, &conf->handle_list); + if (stripe_is_lowprio(sh)) + list_add_tail(&sh->lru, + &conf->loprio_list); + else + list_add_tail(&sh->lru, + &conf->handle_list); } else { raid5_wakeup_stripe_thread(sh); return; @@ -5172,19 +5187,27 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) */ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int 
group) { - struct stripe_head *sh = NULL, *tmp; + struct stripe_head *sh, *tmp; struct list_head *handle_list = NULL; - struct r5worker_group *wg = NULL; + struct r5worker_group *wg; + bool second_try = !r5c_is_writeback(conf->log); + bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); +again: + wg = NULL; + sh = NULL; if (conf->worker_cnt_per_group == 0) { - handle_list = &conf->handle_list; + handle_list = try_loprio ? &conf->loprio_list : + &conf->handle_list; } else if (group != ANY_GROUP) { - handle_list = &conf->worker_groups[group].handle_list; + handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : + &conf->worker_groups[group].handle_list; wg = &conf->worker_groups[group]; } else { int i; for (i = 0; i < conf->group_cnt; i++) { - handle_list = &conf->worker_groups[i].handle_list; + handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : + &conf->worker_groups[i].handle_list; wg = &conf->worker_groups[i]; if (!list_empty(handle_list)) break; @@ -5235,8 +5258,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) wg = NULL; } - if (!sh) - return NULL; + if (!sh) { + if (second_try) + return NULL; + second_try = true; + try_loprio = !try_loprio; + goto again; + } if (wg) { wg->stripes_cnt--; @@ -6546,6 +6574,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, group = &(*worker_groups)[i]; INIT_LIST_HEAD(&group->handle_list); + INIT_LIST_HEAD(&group->loprio_list); group->conf = conf; group->workers = workers + i * cnt; @@ -6773,6 +6802,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->loprio_list); INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 4bb27b97bf6b..6b9d2e839e6d 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -542,6 +542,7 @@ struct r5worker { struct r5worker_group { struct list_head handle_list; + struct list_head loprio_list; struct r5conf *conf; struct r5worker *workers; int stripes_cnt; @@ -608,6 +609,7 @@ struct r5conf { */ struct list_head handle_list; /* stripes needing handling */ + struct list_head loprio_list; /* low priority stripes */ struct list_head hold_list; /* preread ready stripes */ struct list_head delayed_list; /* stripes that have plugged requests */ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ From 84890c03b6c5d7e8d76ea5e20b6aaf7e7ad410f0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 15 Feb 2017 19:58:05 -0800 Subject: [PATCH 06/84] md/raid5-cache: bump flush stripe batch size Bump the flush stripe batch size to 2048. For my 12 disks raid array, the stripes takes: 12 * 4k * 2048 = 96MB This is still quite small. A hardware raid card generally has 1GB size, which we suggest the raid5-cache has similar cache size. The advantage of a big batch size is we can dispatch a lot of IO in the same time, then we can do some scheduling to make better IO pattern. Last patch prioritizes stripes, so we don't worry about a big flush stripe batch will starve normal stripes. 
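As a worked version of the arithmetic above, the sketch below recomputes the batch and its in-flight footprint with the new per-array formula; the max_nr_stripes value is an assumed stripe cache size chosen so the numbers match the 2048-stripe / 96MB example in the message.

#include <stdio.h>

int main(void)
{
	unsigned long max_nr_stripes = 8192;		/* assumed stripe cache size */
	unsigned long batch = max_nr_stripes / 4;	/* R5C_FULL_STRIPE_FLUSH_BATCH -> 2048 */
	unsigned long disks = 12;			/* array size quoted in the message */
	unsigned long bytes = disks * 4096UL * batch;	/* 12 * 4k * 2048 */

	printf("flush batch: %lu stripes, ~%lu MiB in flight\n",
	       batch, bytes >> 20);			/* 2048 stripes, ~96 MiB */
	return 0;
}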
Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 3f307be01b10..738e180d3740 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -43,7 +43,7 @@ /* wake up reclaim thread periodically */ #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) /* start flush with these full stripes */ -#define R5C_FULL_STRIPE_FLUSH_BATCH 256 +#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4) /* reclaim stripes in groups */ #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) @@ -381,7 +381,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) * or a full stripe (chunk size / 4k stripes). */ if (atomic_read(&conf->r5c_cached_full_stripes) >= - min(R5C_FULL_STRIPE_FLUSH_BATCH, + min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), conf->chunk_sectors >> STRIPE_SHIFT)) r5l_wake_reclaim(conf->log, 0); } @@ -1393,7 +1393,7 @@ static void r5c_do_reclaim(struct r5conf *conf) stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; else if (total_cached > conf->min_nr_stripes * 1 / 2 || atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > - R5C_FULL_STRIPE_FLUSH_BATCH) + R5C_FULL_STRIPE_FLUSH_BATCH(conf)) /* * if stripe cache pressure moderate, or if there is many full * stripes,flush all full stripes From aaf9f12ebfafd1ea603d61ead6dbcf456a86e0f3 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 3 Mar 2017 22:06:12 -0800 Subject: [PATCH 07/84] md/raid5: sort bios A previous patch (raid5: only dispatch IO from raid5d for harddisk raid) defers IO dispatching. The goal is to create a better IO pattern. At that time, we didn't sort the deferred IO and hoped the block layer could do the IO merge and sort. Now the raid5-cache writeback could create a large amount of bios. And if we enable multi-threading for stripe handling, we can't control when to dispatch IO to the raid disks. A lot of the time, we are dispatching IO which the block layer can't merge effectively. This patch moves the IO dispatching defer further. We accumulate bios, but we don't dispatch all of the bios once a threshold is met. This 'dispatch a partial portion of the bios' strategy allows bios arriving over a large time window to be sent to disks together. At dispatch time, there is a large chance the block layer can merge the bios. To make this more effective, we dispatch IO in ascending sector order. This increases the request merge chance and reduces disk seeks. 
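The sketch below models the ascending-sector dispatch in plain userspace C; the pending[] array and cmp_pending() stand in for conf->pending_list and the cmp_stripe()/list_sort() step in dispatch_defer_bios(), and no real bios or block devices are involved.

#include <stdio.h>
#include <stdlib.h>

struct pending_data {
	unsigned long long sector;	/* stripe start sector */
};

/* same ordering rule as cmp_stripe(): sort by start sector, ascending */
static int cmp_pending(const void *a, const void *b)
{
	const struct pending_data *da = a, *db = b;

	if (da->sector > db->sector)
		return 1;
	if (da->sector < db->sector)
		return -1;
	return 0;
}

int main(void)
{
	struct pending_data pending[] = { { 4096 }, { 128 }, { 65536 }, { 2048 } };
	size_t i, n = sizeof(pending) / sizeof(pending[0]);

	/* sort once per flush so adjacent writes reach the disk together */
	qsort(pending, n, sizeof(pending[0]), cmp_pending);

	for (i = 0; i < n; i++)
		printf("dispatch stripe at sector %llu\n", pending[i].sector);
	return 0;
}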
Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 142 ++++++++++++++++++++++++++++++++++++--------- drivers/md/raid5.h | 14 ++++- 2 files changed, 128 insertions(+), 28 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5a28bd9b5b5f..013398ce2080 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -58,6 +58,7 @@ #include #include +#include #include "md.h" #include "raid5.h" @@ -878,41 +879,107 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) return 1; } -static void flush_deferred_bios(struct r5conf *conf) +static void dispatch_bio_list(struct bio_list *tmp) { - struct bio_list tmp; struct bio *bio; - if (!conf->batch_bio_dispatch || !conf->group_cnt) - return; - - bio_list_init(&tmp); - spin_lock(&conf->pending_bios_lock); - bio_list_merge(&tmp, &conf->pending_bios); - bio_list_init(&conf->pending_bios); - spin_unlock(&conf->pending_bios_lock); - - while ((bio = bio_list_pop(&tmp))) + while ((bio = bio_list_pop(tmp))) generic_make_request(bio); } -static void defer_bio_issue(struct r5conf *conf, struct bio *bio) +static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) { - /* - * change group_cnt will drain all bios, so this is safe - * - * A read generally means a read-modify-write, which usually means a - * randwrite, so we don't delay it - */ - if (!conf->batch_bio_dispatch || !conf->group_cnt || - bio_op(bio) == REQ_OP_READ) { - generic_make_request(bio); + const struct r5pending_data *da = list_entry(a, + struct r5pending_data, sibling); + const struct r5pending_data *db = list_entry(b, + struct r5pending_data, sibling); + if (da->sector > db->sector) + return 1; + if (da->sector < db->sector) + return -1; + return 0; +} + +static void dispatch_defer_bios(struct r5conf *conf, int target, + struct bio_list *list) +{ + struct r5pending_data *data; + struct list_head *first, *next = NULL; + int cnt = 0; + + if (conf->pending_data_cnt == 0) return; + + list_sort(NULL, &conf->pending_list, cmp_stripe); + + first = conf->pending_list.next; + + /* temporarily move the head */ + if (conf->next_pending_data) + list_move_tail(&conf->pending_list, + &conf->next_pending_data->sibling); + + while (!list_empty(&conf->pending_list)) { + data = list_first_entry(&conf->pending_list, + struct r5pending_data, sibling); + if (&data->sibling == first) + first = data->sibling.next; + next = data->sibling.next; + + bio_list_merge(list, &data->bios); + list_move(&data->sibling, &conf->free_list); + cnt++; + if (cnt >= target) + break; } + conf->pending_data_cnt -= cnt; + BUG_ON(conf->pending_data_cnt < 0 || cnt < target); + + if (next != &conf->pending_list) + conf->next_pending_data = list_entry(next, + struct r5pending_data, sibling); + else + conf->next_pending_data = NULL; + /* list isn't empty */ + if (first != &conf->pending_list) + list_move_tail(&conf->pending_list, first); +} + +static void flush_deferred_bios(struct r5conf *conf) +{ + struct bio_list tmp = BIO_EMPTY_LIST; + + if (conf->pending_data_cnt == 0) + return; + spin_lock(&conf->pending_bios_lock); - bio_list_add(&conf->pending_bios, bio); + dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); + BUG_ON(conf->pending_data_cnt != 0); spin_unlock(&conf->pending_bios_lock); - md_wakeup_thread(conf->mddev->thread); + + dispatch_bio_list(&tmp); +} + +static void defer_issue_bios(struct r5conf *conf, sector_t sector, + struct bio_list *bios) +{ + struct bio_list tmp = BIO_EMPTY_LIST; + struct r5pending_data *ent; + + spin_lock(&conf->pending_bios_lock); + ent = 
list_first_entry(&conf->free_list, struct r5pending_data, + sibling); + list_move_tail(&ent->sibling, &conf->pending_list); + ent->sector = sector; + bio_list_init(&ent->bios); + bio_list_merge(&ent->bios, bios); + conf->pending_data_cnt++; + if (conf->pending_data_cnt >= PENDING_IO_MAX) + dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); + + spin_unlock(&conf->pending_bios_lock); + + dispatch_bio_list(&tmp); } static void @@ -925,6 +992,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) struct r5conf *conf = sh->raid_conf; int i, disks = sh->disks; struct stripe_head *head_sh = sh; + struct bio_list pending_bios = BIO_EMPTY_LIST; + bool should_defer; might_sleep(); @@ -941,6 +1010,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) } } + should_defer = conf->batch_bio_dispatch && conf->group_cnt; + for (i = disks; i--; ) { int op, op_flags = 0; int replace_only = 0; @@ -1095,7 +1166,10 @@ again: trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - defer_bio_issue(conf, bi); + if (should_defer && op_is_write(op)) + bio_list_add(&pending_bios, bi); + else + generic_make_request(bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded @@ -1140,7 +1214,10 @@ again: trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - defer_bio_issue(conf, rbi); + if (should_defer && op_is_write(op)) + bio_list_add(&pending_bios, rbi); + else + generic_make_request(rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -1158,6 +1235,9 @@ again: if (sh != head_sh) goto again; } + + if (should_defer && !bio_list_empty(&pending_bios)) + defer_issue_bios(conf, head_sh->sector, &pending_bios); } static struct dma_async_tx_descriptor * @@ -6678,6 +6758,7 @@ static void free_conf(struct r5conf *conf) put_page(conf->disks[i].extra_page); kfree(conf->disks); kfree(conf->stripe_hashtbl); + kfree(conf->pending_data); kfree(conf); } @@ -6787,6 +6868,14 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); if (conf == NULL) goto abort; + INIT_LIST_HEAD(&conf->free_list); + INIT_LIST_HEAD(&conf->pending_list); + conf->pending_data = kzalloc(sizeof(struct r5pending_data) * + PENDING_IO_MAX, GFP_KERNEL); + if (!conf->pending_data) + goto abort; + for (i = 0; i < PENDING_IO_MAX; i++) + list_add(&conf->pending_data[i].sibling, &conf->free_list); /* Don't enable multi-threading by default*/ if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, &new_group)) { @@ -6811,7 +6900,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); - bio_list_init(&conf->pending_bios); spin_lock_init(&conf->pending_bios_lock); conf->batch_bio_dispatch = true; rdev_for_each(rdev, mddev) { diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 6b9d2e839e6d..985cdc4850c2 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -572,6 +572,14 @@ enum r5_cache_state { */ }; +#define PENDING_IO_MAX 512 +#define PENDING_IO_ONE_FLUSH 128 +struct r5pending_data { + struct list_head sibling; + sector_t sector; /* stripe sector */ + struct bio_list bios; +}; + struct r5conf { struct hlist_head *stripe_hashtbl; /* only protect corresponding hash list and inactive_list */ @@ -689,9 +697,13 @@ struct r5conf { int worker_cnt_per_group; struct r5l_log *log; - struct bio_list 
pending_bios; spinlock_t pending_bios_lock; bool batch_bio_dispatch; + struct r5pending_data *pending_data; + struct list_head free_list; + struct list_head pending_list; + int pending_data_cnt; + struct r5pending_data *next_pending_data; }; From effe6ee7523aa50d0517bd7da141e112b44d89fa Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 7 Mar 2017 16:49:17 -0800 Subject: [PATCH 08/84] md/r5cache: improve recovery with read ahead page pool In r5cache recovery, the journal device is scanned page by page. Currently, we use sync_page_io() to read journal device. This is not efficient when we have to recovery many stripes from the journal. To improve the speed of recovery, this patch introduces a read ahead page pool (ra_pool) to recovery_ctx. With ra_pool, multiple consecutive pages are read in one IO. Then the recovery code read the journal from ra_pool. With ra_pool, r5l_recovery_ctx has become much bigger. Therefore, r5l_recovery_log() is refactored so r5l_recovery_ctx is not using stack space. Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 219 +++++++++++++++++++++++++++++++-------- 1 file changed, 174 insertions(+), 45 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 738e180d3740..5c8640c86b90 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -30,6 +30,7 @@ * underneath hardware sector size. only works with PAGE_SIZE == 4096 */ #define BLOCK_SECTORS (8) +#define BLOCK_SECTOR_SHIFT (3) /* * log->max_free_space is min(1/4 disk size, 10G reclaimable space). @@ -1552,6 +1553,8 @@ bool r5l_log_disk_error(struct r5conf *conf) return ret; } +#define R5L_RECOVERY_PAGE_POOL_SIZE 256 + struct r5l_recovery_ctx { struct page *meta_page; /* current meta */ sector_t meta_total_blocks; /* total size of current meta and data */ @@ -1560,18 +1563,131 @@ struct r5l_recovery_ctx { int data_parity_stripes; /* number of data_parity stripes */ int data_only_stripes; /* number of data_only stripes */ struct list_head cached_list; + + /* + * read ahead page pool (ra_pool) + * in recovery, log is read sequentially. It is not efficient to + * read every page with sync_page_io(). The read ahead page pool + * reads multiple pages with one IO, so further log read can + * just copy data from the pool. + */ + struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE]; + sector_t pool_offset; /* offset of first page in the pool */ + int total_pages; /* total allocated pages */ + int valid_pages; /* pages with valid data */ + struct bio *ra_bio; /* bio to do the read ahead */ }; +static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct page *page; + + ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs); + if (!ctx->ra_bio) + return -ENOMEM; + + ctx->valid_pages = 0; + ctx->total_pages = 0; + while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) { + page = alloc_page(GFP_KERNEL); + + if (!page) + break; + ctx->ra_pool[ctx->total_pages] = page; + ctx->total_pages += 1; + } + + if (ctx->total_pages == 0) { + bio_put(ctx->ra_bio); + return -ENOMEM; + } + + ctx->pool_offset = 0; + return 0; +} + +static void r5l_recovery_free_ra_pool(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + int i; + + for (i = 0; i < ctx->total_pages; ++i) + put_page(ctx->ra_pool[i]); + bio_put(ctx->ra_bio); +} + +/* + * fetch ctx->valid_pages pages from offset + * In normal cases, ctx->valid_pages == ctx->total_pages after the call. 
+ * However, if the offset is close to the end of the journal device, + * ctx->valid_pages could be smaller than ctx->total_pages + */ +static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + sector_t offset) +{ + bio_reset(ctx->ra_bio); + ctx->ra_bio->bi_bdev = log->rdev->bdev; + bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0); + ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset; + + ctx->valid_pages = 0; + ctx->pool_offset = offset; + + while (ctx->valid_pages < ctx->total_pages) { + bio_add_page(ctx->ra_bio, + ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0); + ctx->valid_pages += 1; + + offset = r5l_ring_add(log, offset, BLOCK_SECTORS); + + if (offset == 0) /* reached end of the device */ + break; + } + + return submit_bio_wait(ctx->ra_bio); +} + +/* + * try read a page from the read ahead page pool, if the page is not in the + * pool, call r5l_recovery_fetch_ra_pool + */ +static int r5l_recovery_read_page(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + struct page *page, + sector_t offset) +{ + int ret; + + if (offset < ctx->pool_offset || + offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) { + ret = r5l_recovery_fetch_ra_pool(log, ctx, offset); + if (ret) + return ret; + } + + BUG_ON(offset < ctx->pool_offset || + offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS); + + memcpy(page_address(page), + page_address(ctx->ra_pool[(offset - ctx->pool_offset) >> + BLOCK_SECTOR_SHIFT]), + PAGE_SIZE); + return 0; +} + static int r5l_recovery_read_meta_block(struct r5l_log *log, struct r5l_recovery_ctx *ctx) { struct page *page = ctx->meta_page; struct r5l_meta_block *mb; u32 crc, stored_crc; + int ret; - if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, - false)) - return -EIO; + ret = r5l_recovery_read_page(log, ctx, page, ctx->pos); + if (ret != 0) + return ret; mb = page_address(page); stored_crc = le32_to_cpu(mb->checksum); @@ -1653,8 +1769,7 @@ static void r5l_recovery_load_data(struct r5l_log *log, raid5_compute_sector(conf, le64_to_cpu(payload->location), 0, &dd_idx, sh); - sync_page_io(log->rdev, log_offset, PAGE_SIZE, - sh->dev[dd_idx].page, REQ_OP_READ, 0, false); + r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset); sh->dev[dd_idx].log_checksum = le32_to_cpu(payload->checksum[0]); ctx->meta_total_blocks += BLOCK_SECTORS; @@ -1673,17 +1788,15 @@ static void r5l_recovery_load_parity(struct r5l_log *log, struct r5conf *conf = mddev->private; ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; - sync_page_io(log->rdev, log_offset, PAGE_SIZE, - sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); + r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset); sh->dev[sh->pd_idx].log_checksum = le32_to_cpu(payload->checksum[0]); set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); if (sh->qd_idx >= 0) { - sync_page_io(log->rdev, - r5l_ring_add(log, log_offset, BLOCK_SECTORS), - PAGE_SIZE, sh->dev[sh->qd_idx].page, - REQ_OP_READ, 0, false); + r5l_recovery_read_page( + log, ctx, sh->dev[sh->qd_idx].page, + r5l_ring_add(log, log_offset, BLOCK_SECTORS)); sh->dev[sh->qd_idx].log_checksum = le32_to_cpu(payload->checksum[1]); set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); @@ -1814,14 +1927,15 @@ r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, /* if matches return 0; otherwise return -EINVAL */ static int -r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, +r5l_recovery_verify_data_checksum(struct r5l_log *log, + struct 
r5l_recovery_ctx *ctx, + struct page *page, sector_t log_offset, __le32 log_checksum) { void *addr; u32 checksum; - sync_page_io(log->rdev, log_offset, PAGE_SIZE, - page, REQ_OP_READ, 0, false); + r5l_recovery_read_page(log, ctx, page, log_offset); addr = kmap_atomic(page); checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); kunmap_atomic(addr); @@ -1853,17 +1967,17 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, if (payload->header.type == R5LOG_PAYLOAD_DATA) { if (r5l_recovery_verify_data_checksum( - log, page, log_offset, + log, ctx, page, log_offset, payload->checksum[0]) < 0) goto mismatch; } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { if (r5l_recovery_verify_data_checksum( - log, page, log_offset, + log, ctx, page, log_offset, payload->checksum[0]) < 0) goto mismatch; if (conf->max_degraded == 2 && /* q for RAID 6 */ r5l_recovery_verify_data_checksum( - log, page, + log, ctx, page, r5l_ring_add(log, log_offset, BLOCK_SECTORS), payload->checksum[1]) < 0) @@ -2241,55 +2355,70 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, static int r5l_recovery_log(struct r5l_log *log) { struct mddev *mddev = log->rdev->mddev; - struct r5l_recovery_ctx ctx; + struct r5l_recovery_ctx *ctx; int ret; sector_t pos; - ctx.pos = log->last_checkpoint; - ctx.seq = log->last_cp_seq; - ctx.meta_page = alloc_page(GFP_KERNEL); - ctx.data_only_stripes = 0; - ctx.data_parity_stripes = 0; - INIT_LIST_HEAD(&ctx.cached_list); - - if (!ctx.meta_page) + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) return -ENOMEM; - ret = r5c_recovery_flush_log(log, &ctx); - __free_page(ctx.meta_page); + ctx->pos = log->last_checkpoint; + ctx->seq = log->last_cp_seq; + INIT_LIST_HEAD(&ctx->cached_list); + ctx->meta_page = alloc_page(GFP_KERNEL); + + if (!ctx->meta_page) { + ret = -ENOMEM; + goto meta_page; + } + + if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) { + ret = -ENOMEM; + goto ra_pool; + } + + ret = r5c_recovery_flush_log(log, ctx); if (ret) - return ret; + goto error; - pos = ctx.pos; - ctx.seq += 10000; + pos = ctx->pos; + ctx->seq += 10000; - - if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) + if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0)) pr_debug("md/raid:%s: starting from clean shutdown\n", mdname(mddev)); else pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", - mdname(mddev), ctx.data_only_stripes, - ctx.data_parity_stripes); + mdname(mddev), ctx->data_only_stripes, + ctx->data_parity_stripes); - if (ctx.data_only_stripes == 0) { - log->next_checkpoint = ctx.pos; - r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); - ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); - } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { + if (ctx->data_only_stripes == 0) { + log->next_checkpoint = ctx->pos; + r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++); + ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) { pr_err("md/raid:%s: failed to rewrite stripes to journal\n", mdname(mddev)); - return -EIO; + ret = -EIO; + goto error; } - log->log_start = ctx.pos; - log->seq = ctx.seq; + log->log_start = ctx->pos; + log->seq = ctx->seq; log->last_checkpoint = pos; r5l_write_super(log, pos); - r5c_recovery_flush_data_only_stripes(log, &ctx); - return 0; + r5c_recovery_flush_data_only_stripes(log, ctx); + ret = 0; +error: + r5l_recovery_free_ra_pool(log, ctx); +ra_pool: + __free_page(ctx->meta_page); 
+meta_page: + kfree(ctx); + return ret; } static void r5l_write_super(struct r5l_log *log, sector_t cp) From ea0213e0c7cc1c1b52badf27bd7db4f50a67baaa Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 09:59:57 +0100 Subject: [PATCH 09/84] md: superblock changes for PPL Include information about PPL location and size into mdp_superblock_1 and copy it to/from rdev. Because PPL is mutually exclusive with bitmap, put it in place of 'bitmap_offset'. Add a new flag MD_FEATURE_PPL for 'feature_map', analogically to MD_FEATURE_BITMAP_OFFSET. Add MD_HAS_PPL to mddev->flags to indicate that PPL is enabled on an array. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 19 +++++++++++++++++++ drivers/md/md.h | 8 ++++++++ drivers/md/raid0.c | 3 ++- drivers/md/raid1.c | 3 ++- include/uapi/linux/raid/md_p.h | 18 ++++++++++++++---- 5 files changed, 45 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 72ef3f18ac9a..d57045996d35 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1507,6 +1507,12 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); + rdev->ppl.size = le16_to_cpu(sb->ppl.size); + rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; + } + if (!refdev) { ret = 1; } else { @@ -1619,6 +1625,13 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); + + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if (le32_to_cpu(sb->feature_map) & + (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) + return -EINVAL; + set_bit(MD_HAS_PPL, &mddev->flags); + } } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling, except for * spares (which don't need an event count) */ @@ -1832,6 +1845,12 @@ retry: if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); + if (test_bit(MD_HAS_PPL, &mddev->flags)) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); + sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); + sb->ppl.size = cpu_to_le16(rdev->ppl.size); + } + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) diff --git a/drivers/md/md.h b/drivers/md/md.h index 1c00160b09f9..a7b2f16452c4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -122,6 +122,13 @@ struct md_rdev { * sysfs entry */ struct badblocks badblocks; + + struct { + short offset; /* Offset from superblock to start of PPL. + * Not used by external metadata. */ + unsigned int size; /* Size in sectors of the PPL space */ + sector_t sector; /* First sector of the PPL space */ + } ppl; }; enum flag_bits { Faulty, /* device is known to have a fault */ @@ -226,6 +233,7 @@ enum mddev_flags { * supported as calls to md_error() will * never cause the array to become failed. 
*/ + MD_HAS_PPL, /* The raid array has PPL feature set */ }; enum mddev_sb_flags { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 93347ca7c7a6..56f70c3ad37c 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -29,7 +29,8 @@ #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ - (1L << MD_FAILFAST_SUPPORTED)) + (1L << MD_FAILFAST_SUPPORTED) |\ + (1L << MD_HAS_PPL)) static int raid0_congested(struct mddev *mddev, int bits) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a34f58772022..730e57259af9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -47,7 +47,8 @@ #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ - (1L << MD_JOURNAL_CLEAN)) + (1L << MD_JOURNAL_CLEAN) | \ + (1L << MD_HAS_PPL)) /* * Number of guaranteed r1bios in case of extreme VM load: diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 9930f3e9040f..fe2112810c43 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -242,10 +242,18 @@ struct mdp_superblock_1 { __le32 chunksize; /* in 512byte sectors */ __le32 raid_disks; - __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts - * NOTE: signed, so bitmap can be before superblock - * only meaningful of feature_map[0] is set. - */ + union { + __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts + * NOTE: signed, so bitmap can be before superblock + * only meaningful of feature_map[0] is set. + */ + + /* only meaningful when feature_map[MD_FEATURE_PPL] is set */ + struct { + __le16 offset; /* sectors from start of superblock that ppl starts (signed) */ + __le16 size; /* ppl size in sectors */ + } ppl; + }; /* These are only valid with feature bit '4' */ __le32 new_level; /* new level we are reshaping to */ @@ -318,6 +326,7 @@ struct mdp_superblock_1 { */ #define MD_FEATURE_CLUSTERED 256 /* clustered MD */ #define MD_FEATURE_JOURNAL 512 /* support write cache */ +#define MD_FEATURE_PPL 1024 /* support PPL */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -328,6 +337,7 @@ struct mdp_superblock_1 { |MD_FEATURE_RECOVERY_BITMAP \ |MD_FEATURE_CLUSTERED \ |MD_FEATURE_JOURNAL \ + |MD_FEATURE_PPL \ ) struct r5l_payload_header { From ff875738edd44e3bc892d378deacc50bccc9d70c Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 09:59:58 +0100 Subject: [PATCH 10/84] raid5: separate header for log functions Move raid5-cache declarations from raid5.h to raid5-log.h, add inline wrappers for functions which will be shared with ppl and use them in raid5 core instead of direct calls to raid5-cache. Remove unused parameter from r5c_cache_data(), move two duplicated pr_debug() calls to r5l_init_log(). 
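For readers unfamiliar with the pattern, here is a standalone illustration (not the kernel API) of the null-guarded inline wrappers that raid5-log.h adds, which let the raid5 core call the log helpers unconditionally whether or not a journal is attached; all names below are simplified stand-ins.

#include <stdio.h>

struct log { int id; };			/* opaque backend, e.g. raid5-cache */
struct conf { struct log *log; };	/* NULL when no journal is configured */

static void backend_write(struct log *log)
{
	printf("journaled write via log %d\n", log->id);
}

/* inline wrapper: callers never test conf->log themselves */
static inline void log_write(struct conf *conf)
{
	if (conf->log)
		backend_write(conf->log);
}

int main(void)
{
	struct log l = { .id = 0 };
	struct conf with_log = { .log = &l };
	struct conf without_log = { .log = NULL };

	log_write(&with_log);		/* dispatches to the backend */
	log_write(&without_log);	/* silently skipped */
	return 0;
}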
Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 17 +++++++-- drivers/md/raid5-log.h | 81 ++++++++++++++++++++++++++++++++++++++++ drivers/md/raid5.c | 48 +++++++----------------- drivers/md/raid5.h | 30 --------------- 4 files changed, 107 insertions(+), 69 deletions(-) create mode 100644 drivers/md/raid5-log.h diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 5c8640c86b90..5f82dabdda6f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -344,6 +344,8 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, } } +void r5l_wake_reclaim(struct r5l_log *log, sector_t space); + /* Check whether we should flush some stripes to free up stripe cache */ void r5c_check_stripe_cache_usage(struct r5conf *conf) { @@ -2749,9 +2751,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, } } -int -r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, - struct stripe_head_state *s) +int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; int pages = 0; @@ -2914,6 +2914,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log *log; + char b[BDEVNAME_SIZE]; + + pr_debug("md/raid:%s: using device %s as journal\n", + mdname(conf->mddev), bdevname(rdev->bdev, b)); if (PAGE_SIZE != 4096) return -EINVAL; @@ -3016,8 +3020,13 @@ io_kc: return -EINVAL; } -void r5l_exit_log(struct r5l_log *log) +void r5l_exit_log(struct r5conf *conf) { + struct r5l_log *log = conf->log; + + conf->log = NULL; + synchronize_rcu(); + flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h new file mode 100644 index 000000000000..2da4bd3bbd79 --- /dev/null +++ b/drivers/md/raid5-log.h @@ -0,0 +1,81 @@ +#ifndef _RAID5_LOG_H +#define _RAID5_LOG_H + +extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); +extern void r5l_exit_log(struct r5conf *conf); +extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); +extern void r5l_write_stripe_run(struct r5l_log *log); +extern void r5l_flush_stripe_to_raid(struct r5l_log *log); +extern void r5l_stripe_write_finished(struct stripe_head *sh); +extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); +extern void r5l_quiesce(struct r5l_log *log, int state); +extern bool r5l_log_disk_error(struct r5conf *conf); +extern bool r5c_is_writeback(struct r5l_log *log); +extern int +r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks); +extern void +r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s); +extern void r5c_release_extra_page(struct stripe_head *sh); +extern void r5c_use_extra_page(struct stripe_head *sh); +extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +extern void r5c_handle_cached_data_endio(struct r5conf *conf, + struct stripe_head *sh, int disks, struct bio_list *return_bi); +extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); +extern void r5c_make_stripe_write_out(struct stripe_head *sh); +extern void r5c_flush_cache(struct r5conf *conf, int num); +extern void r5c_check_stripe_cache_usage(struct r5conf *conf); +extern void r5c_check_cached_full_stripe(struct r5conf *conf); +extern struct md_sysfs_entry r5c_journal_mode; +extern void 
r5c_update_on_rdev_error(struct mddev *mddev); +extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); + +static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s) +{ + struct r5conf *conf = sh->raid_conf; + + if (conf->log) { + if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { + /* writing out phase */ + if (s->waiting_extra_page) + return 0; + return r5l_write_stripe(conf->log, sh); + } else if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { + /* caching phase */ + return r5c_cache_data(conf->log, sh); + } + } + + return -EAGAIN; +} + +static inline void log_stripe_write_finished(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + + if (conf->log) + r5l_stripe_write_finished(sh); +} + +static inline void log_write_stripe_run(struct r5conf *conf) +{ + if (conf->log) + r5l_write_stripe_run(conf->log); +} + +static inline void log_exit(struct r5conf *conf) +{ + if (conf->log) + r5l_exit_log(conf); +} + +static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev) +{ + if (journal_dev) + return r5l_init_log(conf, journal_dev); + + return 0; +} + +#endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 013398ce2080..f575f40d2acb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -64,6 +64,7 @@ #include "raid5.h" #include "raid0.h" #include "bitmap.h" +#include "raid5-log.h" #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) @@ -997,18 +998,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) might_sleep(); - if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { - /* writing out phase */ - if (s->waiting_extra_page) - return; - if (r5l_write_stripe(conf->log, sh) == 0) - return; - } else { /* caching phase */ - if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { - r5c_cache_data(conf->log, sh, s); - return; - } - } + if (log_stripe(sh, s) == 0) + return; should_defer = conf->batch_bio_dispatch && conf->group_cnt; @@ -3345,7 +3336,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) bitmap_end = 1; - r5l_stripe_write_finished(sh); + log_stripe_write_finished(sh); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -3764,7 +3755,7 @@ returnbi: discard_pending = 1; } - r5l_stripe_write_finished(sh); + log_stripe_write_finished(sh); if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { @@ -4754,7 +4745,7 @@ static void handle_stripe(struct stripe_head *sh) if (s.just_cached) r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); - r5l_stripe_write_finished(sh); + log_stripe_write_finished(sh); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@ -6168,7 +6159,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); - r5l_write_stripe_run(conf->log); + log_write_stripe_run(conf); cond_resched(); @@ -6745,8 +6736,8 @@ static void free_conf(struct r5conf *conf) { int i; - if (conf->log) - r5l_exit_log(conf->log); + log_exit(conf); + if (conf->shrinker.nr_deferred) unregister_shrinker(&conf->shrinker); @@ -7436,14 +7427,8 @@ static int raid5_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); } - if (journal_dev) { - char b[BDEVNAME_SIZE]; - - pr_debug("md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(journal_dev->bdev, b)); - if (r5l_init_log(conf, journal_dev)) - goto abort; - } + if (log_init(conf, journal_dev)) + goto abort; 
return 0; abort: @@ -7557,17 +7542,13 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) print_raid5_conf(conf); if (test_bit(Journal, &rdev->flags) && conf->log) { - struct r5l_log *log; /* * we can't wait pending write here, as this is called in * raid5d, wait will deadlock. */ if (atomic_read(&mddev->writes_pending)) return -EBUSY; - log = conf->log; - conf->log = NULL; - synchronize_rcu(); - r5l_exit_log(log); + log_exit(conf); return 0; } if (rdev == p->rdev) @@ -7636,7 +7617,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int last = conf->raid_disks - 1; if (test_bit(Journal, &rdev->flags)) { - char b[BDEVNAME_SIZE]; if (conf->log) return -EBUSY; @@ -7645,9 +7625,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * The array is in readonly mode if journal is missing, so no * write requests running. We should be safe */ - r5l_init_log(conf, rdev); - pr_debug("md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(rdev->bdev, b)); + log_init(conf, rdev); return 0; } if (mddev->recovery_disabled == conf->recovery_disabled) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 985cdc4850c2..6dd295a80ee1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -779,34 +779,4 @@ extern struct stripe_head * raid5_get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce); extern int raid5_calc_degraded(struct r5conf *conf); -extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); -extern void r5l_exit_log(struct r5l_log *log); -extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); -extern void r5l_write_stripe_run(struct r5l_log *log); -extern void r5l_flush_stripe_to_raid(struct r5l_log *log); -extern void r5l_stripe_write_finished(struct stripe_head *sh); -extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int state); -extern bool r5l_log_disk_error(struct r5conf *conf); -extern bool r5c_is_writeback(struct r5l_log *log); -extern int -r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, int disks); -extern void -r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s); -extern void r5c_release_extra_page(struct stripe_head *sh); -extern void r5c_use_extra_page(struct stripe_head *sh); -extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); -extern void r5c_handle_cached_data_endio(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio_list *return_bi); -extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, - struct stripe_head_state *s); -extern void r5c_make_stripe_write_out(struct stripe_head *sh); -extern void r5c_flush_cache(struct r5conf *conf, int num); -extern void r5c_check_stripe_cache_usage(struct r5conf *conf); -extern void r5c_check_cached_full_stripe(struct r5conf *conf); -extern struct md_sysfs_entry r5c_journal_mode; -extern void r5c_update_on_rdev_error(struct mddev *mddev); -extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); #endif From 3418d036c81dcb604b7c7c71b209d5890a8418aa Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 09:59:59 +0100 Subject: [PATCH 11/84] raid5-ppl: Partial Parity Log write logging implementation Implement the calculation of partial parity for a stripe and PPL write logging functionality. 
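Before the details that follow, a stand-alone sketch of the arithmetic behind partial parity (not from the patch: the byte-sized chunks and all names are illustrative), covering the reconstruct-write and read-modify-write cases described further down in this message:

/*
 * Toy illustration of partial parity for a 4-chunk RAID5 stripe,
 * with each data chunk reduced to a single byte.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DISKS 4

int main(void)
{
	uint8_t data[DISKS] = { 0x11, 0x22, 0x33, 0x44 };
	uint8_t new_val[DISKS] = { 0, 0xaa, 0xbb, 0 };
	int updated[DISKS] = { 0, 1, 1, 0 };	/* this write touches chunks 1 and 2 */
	uint8_t parity = 0, pp_rcw = 0, pp_rmw, new_parity = 0, check;
	int i;

	for (i = 0; i < DISKS; i++)		/* parity before the write */
		parity ^= data[i];

	/* reconstruct-write: xor data from all chunks NOT updated by this write */
	for (i = 0; i < DISKS; i++)
		if (!updated[i])
			pp_rcw ^= data[i];

	/* read-modify-write: xor old parity with old data of the updated chunks */
	pp_rmw = parity;
	for (i = 0; i < DISKS; i++)
		if (updated[i])
			pp_rmw ^= data[i];

	assert(pp_rcw == pp_rmw);	/* both cases give the same partial parity */

	for (i = 0; i < DISKS; i++)	/* apply the write */
		if (updated[i])
			data[i] = new_val[i];

	for (i = 0; i < DISKS; i++)	/* full parity of the updated stripe */
		new_parity ^= data[i];

	check = pp_rcw;			/* PP xor written chunks == stripe parity */
	for (i = 0; i < DISKS; i++)
		if (updated[i])
			check ^= data[i];
	assert(check == new_parity);

	printf("pp=0x%02x parity=0x%02x\n", pp_rcw, new_parity);
	return 0;
}

Both cases produce the same value, the xor of the chunks the write leaves untouched, which is exactly the piece of information needed to rebuild the stripe's parity from whatever subset of the chunk writes completed.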
The description of PPL is added to the documentation. More details can be found in the comments in raid5-ppl.c. Attach a page for holding the partial parity data to stripe_head. Allocate it only if mddev has the MD_HAS_PPL flag set. Partial parity is the xor of not modified data chunks of a stripe and is calculated as follows: - reconstruct-write case: xor data from all not updated disks in a stripe - read-modify-write case: xor old data and parity from all updated disks in a stripe Implement it using the async_tx API and integrate into raid_run_ops(). It must be called when we still have access to old data, so do it when STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is stored into sh->ppl_page. Partial parity is not meaningful for full stripe write and is not stored in the log or used for recovery, so don't attempt to calculate it when stripe has STRIPE_FULL_WRITE. Put the PPL metadata structures to md_p.h because userspace tools (mdadm) will also need to read/write PPL. Warn about using PPL with enabled disk volatile write-back cache for now. It can be removed once disk cache flushing before writing PPL is implemented. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- Documentation/md/raid5-ppl.txt | 44 +++ drivers/md/Makefile | 2 +- drivers/md/raid5-log.h | 24 ++ drivers/md/raid5-ppl.c | 703 +++++++++++++++++++++++++++++++++ drivers/md/raid5.c | 64 ++- drivers/md/raid5.h | 10 +- include/uapi/linux/raid/md_p.h | 27 ++ 7 files changed, 869 insertions(+), 5 deletions(-) create mode 100644 Documentation/md/raid5-ppl.txt create mode 100644 drivers/md/raid5-ppl.c diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt new file mode 100644 index 000000000000..127072b09363 --- /dev/null +++ b/Documentation/md/raid5-ppl.txt @@ -0,0 +1,44 @@ +Partial Parity Log + +Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue +addressed by PPL is that after a dirty shutdown, parity of a particular stripe +may become inconsistent with data on other member disks. If the array is also +in degraded state, there is no way to recalculate parity, because one of the +disks is missing. This can lead to silent data corruption when rebuilding the +array or using it is as degraded - data calculated from parity for array blocks +that have not been touched by a write request during the unclean shutdown can +be incorrect. Such condition is known as the RAID5 Write Hole. Because of +this, md by default does not allow starting a dirty degraded array. + +Partial parity for a write operation is the XOR of stripe data chunks not +modified by this write. It is just enough data needed for recovering from the +write hole. XORing partial parity with the modified chunks produces parity for +the stripe, consistent with its state before the write operation, regardless of +which chunk writes have completed. If one of the not modified data disks of +this stripe is missing, this updated parity can be used to recover its +contents. PPL recovery is also performed when starting an array after an +unclean shutdown and all disks are available, eliminating the need to resync +the array. Because of this, using write-intent bitmap and PPL together is not +supported. + +When handling a write request PPL writes partial parity before new data and +parity are dispatched to disks. PPL is a distributed log - it is stored on +array member drives in the metadata area, on the parity drive of a particular +stripe. It does not require a dedicated journaling drive. 
Write performance is +reduced by up to 30%-40% but it scales with the number of drives in the array +and the journaling drive does not become a bottleneck or a single point of +failure. + +Unlike raid5-cache, the other solution in md for closing the write hole, PPL is +not a true journal. It does not protect from losing in-flight data, only from +silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is +performed for this stripe (parity is not updated). So it is possible to have +arbitrary data in the written part of a stripe if that disk is lost. In such +case the behavior is the same as in plain raid5. + +PPL is available for md version-1 metadata and external (specifically IMSM) +metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl. + +Currently, volatile write-back cache should be disabled on all member drives +when using PPL. Otherwise it cannot guarantee consistency in case of power +failure. diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 3cbda1af87a0..4d48714ccc6b 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -18,7 +18,7 @@ dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o -raid456-y += raid5.o raid5-cache.o +raid456-y += raid5.o raid5-cache.o raid5-ppl.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 2da4bd3bbd79..a67fb58513b9 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -31,6 +31,20 @@ extern struct md_sysfs_entry r5c_journal_mode; extern void r5c_update_on_rdev_error(struct mddev *mddev); extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); +extern struct dma_async_tx_descriptor * +ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx); +extern int ppl_init_log(struct r5conf *conf); +extern void ppl_exit_log(struct r5conf *conf); +extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); +extern void ppl_write_stripe_run(struct r5conf *conf); +extern void ppl_stripe_write_finished(struct stripe_head *sh); + +static inline bool raid5_has_ppl(struct r5conf *conf) +{ + return test_bit(MD_HAS_PPL, &conf->mddev->flags); +} + static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s) { struct r5conf *conf = sh->raid_conf; @@ -45,6 +59,8 @@ static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s /* caching phase */ return r5c_cache_data(conf->log, sh); } + } else if (raid5_has_ppl(conf)) { + return ppl_write_stripe(conf, sh); } return -EAGAIN; @@ -56,24 +72,32 @@ static inline void log_stripe_write_finished(struct stripe_head *sh) if (conf->log) r5l_stripe_write_finished(sh); + else if (raid5_has_ppl(conf)) + ppl_stripe_write_finished(sh); } static inline void log_write_stripe_run(struct r5conf *conf) { if (conf->log) r5l_write_stripe_run(conf->log); + else if (raid5_has_ppl(conf)) + ppl_write_stripe_run(conf); } static inline void log_exit(struct r5conf *conf) { if (conf->log) r5l_exit_log(conf); + else if (raid5_has_ppl(conf)) + ppl_exit_log(conf); } static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev) { if (journal_dev) return r5l_init_log(conf, journal_dev); + else if (raid5_has_ppl(conf)) + return ppl_init_log(conf); return 0; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c new file mode 100644 
index 000000000000..db5b72b11594 --- /dev/null +++ b/drivers/md/raid5-ppl.c @@ -0,0 +1,703 @@ +/* + * Partial Parity Log for closing the RAID5 write hole + * Copyright (c) 2017, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "md.h" +#include "raid5.h" + +/* + * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for + * partial parity data. The header contains an array of entries + * (struct ppl_header_entry) which describe the logged write requests. + * Partial parity for the entries comes after the header, written in the same + * sequence as the entries: + * + * Header + * entry0 + * ... + * entryN + * PP data + * PP for entry0 + * ... + * PP for entryN + * + * An entry describes one or more consecutive stripe_heads, up to a full + * stripe. The modifed raid data chunks form an m-by-n matrix, where m is the + * number of stripe_heads in the entry and n is the number of modified data + * disks. Every stripe_head in the entry must write to the same data disks. + * An example of a valid case described by a single entry (writes to the first + * stripe of a 4 disk array, 16k chunk size): + * + * sh->sector dd0 dd1 dd2 ppl + * +-----+-----+-----+ + * 0 | --- | --- | --- | +----+ + * 8 | -W- | -W- | --- | | pp | data_sector = 8 + * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k + * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k + * +-----+-----+-----+ +----+ + * + * data_sector is the first raid sector of the modified data, data_size is the + * total size of modified data and pp_size is the size of partial parity for + * this entry. Entries for full stripe writes contain no partial parity + * (pp_size = 0), they only mark the stripes for which parity should be + * recalculated after an unclean shutdown. Every entry holds a checksum of its + * partial parity, the header also has a checksum of the header itself. + * + * A write request is always logged to the PPL instance stored on the parity + * disk of the corresponding stripe. For each member disk there is one ppl_log + * used to handle logging for this disk, independently from others. They are + * grouped in child_logs array in struct ppl_conf, which is assigned to + * r5conf->log_private. + * + * ppl_io_unit represents a full PPL write, header_page contains the ppl_header. + * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head + * can be appended to the last entry if it meets the conditions for a valid + * entry described above, otherwise a new entry is added. Checksums of entries + * are calculated incrementally as stripes containing partial parity are being + * added. ppl_submit_iounit() calculates the checksum of the header and submits + * a bio containing the header page and partial parity pages (sh->ppl_page) for + * all stripes of the io_unit. When the PPL write completes, the stripes + * associated with the io_unit are released and raid5d starts writing their data + * and parity. When all stripes are written, the io_unit is freed and the next + * can be submitted. 
+ * + * An io_unit is used to gather stripes until it is submitted or becomes full + * (if the maximum number of entries or size of PPL is reached). Another io_unit + * can't be submitted until the previous has completed (PPL and stripe + * data+parity is written). The log->io_list tracks all io_units of a log + * (for a single member disk). New io_units are added to the end of the list + * and the first io_unit is submitted, if it is not submitted already. + * The current io_unit accepting new stripes is always at the end of the list. + */ + +struct ppl_conf { + struct mddev *mddev; + + /* array of child logs, one for each raid disk */ + struct ppl_log *child_logs; + int count; + + int block_size; /* the logical block size used for data_sector + * in ppl_header_entry */ + u32 signature; /* raid array identifier */ + atomic64_t seq; /* current log write sequence number */ + + struct kmem_cache *io_kc; + mempool_t *io_pool; + struct bio_set *bs; + mempool_t *meta_pool; +}; + +struct ppl_log { + struct ppl_conf *ppl_conf; /* shared between all log instances */ + + struct md_rdev *rdev; /* array member disk associated with + * this log instance */ + struct mutex io_mutex; + struct ppl_io_unit *current_io; /* current io_unit accepting new data + * always at the end of io_list */ + spinlock_t io_list_lock; + struct list_head io_list; /* all io_units of this log */ + struct list_head no_mem_stripes;/* stripes to retry if failed to + * allocate io_unit */ +}; + +#define PPL_IO_INLINE_BVECS 32 + +struct ppl_io_unit { + struct ppl_log *log; + + struct page *header_page; /* for ppl_header */ + + unsigned int entries_count; /* number of entries in ppl_header */ + unsigned int pp_size; /* total size current of partial parity */ + + u64 seq; /* sequence number of this log write */ + struct list_head log_sibling; /* log->io_list */ + + struct list_head stripe_list; /* stripes added to the io_unit */ + atomic_t pending_stripes; /* how many stripes not written to raid */ + + bool submitted; /* true if write to log started */ + + /* inline bio and its biovec for submitting the iounit */ + struct bio bio; + struct bio_vec biovec[PPL_IO_INLINE_BVECS]; +}; + +struct dma_async_tx_descriptor * +ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + int disks = sh->disks; + struct page **xor_srcs = flex_array_get(percpu->scribble, 0); + int count = 0, pd_idx = sh->pd_idx, i; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + + /* + * Partial parity is the XOR of stripe data chunks that are not changed + * during the write request. Depending on available data + * (read-modify-write vs. reconstruct-write case) we calculate it + * differently. 
+ */ + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + /* rmw: xor old data and parity from updated disks */ + for (i = disks; i--;) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx) + xor_srcs[count++] = dev->page; + } + } else if (sh->reconstruct_state == reconstruct_state_drain_run) { + /* rcw: xor data from all not updated disks */ + for (i = disks; i--;) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_UPTODATE, &dev->flags)) + xor_srcs[count++] = dev->page; + } + } else { + return tx; + } + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, + NULL, sh, flex_array_get(percpu->scribble, 0) + + sizeof(struct page *) * (sh->disks + 2)); + + if (count == 1) + tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE, + &submit); + else + tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE, + &submit); + + return tx; +} + +static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, + struct stripe_head *sh) +{ + struct ppl_conf *ppl_conf = log->ppl_conf; + struct ppl_io_unit *io; + struct ppl_header *pplhdr; + + io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC); + if (!io) + return NULL; + + memset(io, 0, sizeof(*io)); + io->log = log; + INIT_LIST_HEAD(&io->log_sibling); + INIT_LIST_HEAD(&io->stripe_list); + atomic_set(&io->pending_stripes, 0); + bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS); + + io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO); + pplhdr = page_address(io->header_page); + clear_page(pplhdr); + memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); + pplhdr->signature = cpu_to_le32(ppl_conf->signature); + + io->seq = atomic64_add_return(1, &ppl_conf->seq); + pplhdr->generation = cpu_to_le64(io->seq); + + return io; +} + +static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) +{ + struct ppl_io_unit *io = log->current_io; + struct ppl_header_entry *e = NULL; + struct ppl_header *pplhdr; + int i; + sector_t data_sector = 0; + int data_disks = 0; + unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE; + struct r5conf *conf = sh->raid_conf; + + pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); + + /* check if current io_unit is full */ + if (io && (io->pp_size == entry_space || + io->entries_count == PPL_HDR_MAX_ENTRIES)) { + pr_debug("%s: add io_unit blocked by seq: %llu\n", + __func__, io->seq); + io = NULL; + } + + /* add a new unit if there is none or the current is full */ + if (!io) { + io = ppl_new_iounit(log, sh); + if (!io) + return -ENOMEM; + spin_lock_irq(&log->io_list_lock); + list_add_tail(&io->log_sibling, &log->io_list); + spin_unlock_irq(&log->io_list_lock); + + log->current_io = io; + } + + for (i = 0; i < sh->disks; i++) { + struct r5dev *dev = &sh->dev[i]; + + if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) { + if (!data_disks || dev->sector < data_sector) + data_sector = dev->sector; + data_disks++; + } + } + BUG_ON(!data_disks); + + pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__, + io->seq, (unsigned long long)data_sector, data_disks); + + pplhdr = page_address(io->header_page); + + if (io->entries_count > 0) { + struct ppl_header_entry *last = + &pplhdr->entries[io->entries_count - 1]; + struct stripe_head *sh_last = list_last_entry( + &io->stripe_list, struct stripe_head, log_list); + u64 data_sector_last = le64_to_cpu(last->data_sector); + u32 data_size_last = le32_to_cpu(last->data_size); + + /* + * Check if we can append the stripe to the 
last entry. It must + * be just after the last logged stripe and write to the same + * disks. Use bit shift and logarithm to avoid 64-bit division. + */ + if ((sh->sector == sh_last->sector + STRIPE_SECTORS) && + (data_sector >> ilog2(conf->chunk_sectors) == + data_sector_last >> ilog2(conf->chunk_sectors)) && + ((data_sector - data_sector_last) * data_disks == + data_size_last >> 9)) + e = last; + } + + if (!e) { + e = &pplhdr->entries[io->entries_count++]; + e->data_sector = cpu_to_le64(data_sector); + e->parity_disk = cpu_to_le32(sh->pd_idx); + e->checksum = cpu_to_le32(~0); + } + + le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT); + + /* don't write any PP if full stripe write */ + if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) { + le32_add_cpu(&e->pp_size, PAGE_SIZE); + io->pp_size += PAGE_SIZE; + e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum), + page_address(sh->ppl_page), + PAGE_SIZE)); + } + + list_add_tail(&sh->log_list, &io->stripe_list); + atomic_inc(&io->pending_stripes); + sh->ppl_io = io; + + return 0; +} + +int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh) +{ + struct ppl_conf *ppl_conf = conf->log_private; + struct ppl_io_unit *io = sh->ppl_io; + struct ppl_log *log; + + if (io || test_bit(STRIPE_SYNCING, &sh->state) || + !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || + !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) { + clear_bit(STRIPE_LOG_TRAPPED, &sh->state); + return -EAGAIN; + } + + log = &ppl_conf->child_logs[sh->pd_idx]; + + mutex_lock(&log->io_mutex); + + if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) { + mutex_unlock(&log->io_mutex); + return -EAGAIN; + } + + set_bit(STRIPE_LOG_TRAPPED, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + atomic_inc(&sh->count); + + if (ppl_log_stripe(log, sh)) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } + + mutex_unlock(&log->io_mutex); + + return 0; +} + +static void ppl_log_endio(struct bio *bio) +{ + struct ppl_io_unit *io = bio->bi_private; + struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; + struct stripe_head *sh, *next; + + pr_debug("%s: seq: %llu\n", __func__, io->seq); + + if (bio->bi_error) + md_error(ppl_conf->mddev, log->rdev); + + mempool_free(io->header_page, ppl_conf->meta_pool); + + list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { + list_del_init(&sh->log_list); + + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + +static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) +{ + char b[BDEVNAME_SIZE]; + + pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n", + __func__, io->seq, bio->bi_iter.bi_size, + (unsigned long long)bio->bi_iter.bi_sector, + bdevname(bio->bi_bdev, b)); + + submit_bio(bio); +} + +static void ppl_submit_iounit(struct ppl_io_unit *io) +{ + struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; + struct ppl_header *pplhdr = page_address(io->header_page); + struct bio *bio = &io->bio; + struct stripe_head *sh; + int i; + + for (i = 0; i < io->entries_count; i++) { + struct ppl_header_entry *e = &pplhdr->entries[i]; + + pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n", + __func__, io->seq, i, le64_to_cpu(e->data_sector), + le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size)); + + e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >> + ilog2(ppl_conf->block_size >> 9)); + e->checksum = 
cpu_to_le32(~le32_to_cpu(e->checksum)); + } + + pplhdr->entries_count = cpu_to_le32(io->entries_count); + pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); + + bio->bi_private = io; + bio->bi_end_io = ppl_log_endio; + bio->bi_opf = REQ_OP_WRITE | REQ_FUA; + bio->bi_bdev = log->rdev->bdev; + bio->bi_iter.bi_sector = log->rdev->ppl.sector; + bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + + list_for_each_entry(sh, &io->stripe_list, log_list) { + /* entries for full stripe writes have no partial parity */ + if (test_bit(STRIPE_FULL_WRITE, &sh->state)) + continue; + + if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) { + struct bio *prev = bio; + + bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, + ppl_conf->bs); + bio->bi_opf = prev->bi_opf; + bio->bi_bdev = prev->bi_bdev; + bio->bi_iter.bi_sector = bio_end_sector(prev); + bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); + + bio_chain(bio, prev); + ppl_submit_iounit_bio(io, prev); + } + } + + ppl_submit_iounit_bio(io, bio); +} + +static void ppl_submit_current_io(struct ppl_log *log) +{ + struct ppl_io_unit *io; + + spin_lock_irq(&log->io_list_lock); + + io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit, + log_sibling); + if (io && io->submitted) + io = NULL; + + spin_unlock_irq(&log->io_list_lock); + + if (io) { + io->submitted = true; + + if (io == log->current_io) + log->current_io = NULL; + + ppl_submit_iounit(io); + } +} + +void ppl_write_stripe_run(struct r5conf *conf) +{ + struct ppl_conf *ppl_conf = conf->log_private; + struct ppl_log *log; + int i; + + for (i = 0; i < ppl_conf->count; i++) { + log = &ppl_conf->child_logs[i]; + + mutex_lock(&log->io_mutex); + ppl_submit_current_io(log); + mutex_unlock(&log->io_mutex); + } +} + +static void ppl_io_unit_finished(struct ppl_io_unit *io) +{ + struct ppl_log *log = io->log; + unsigned long flags; + + pr_debug("%s: seq: %llu\n", __func__, io->seq); + + spin_lock_irqsave(&log->io_list_lock, flags); + + list_del(&io->log_sibling); + mempool_free(io, log->ppl_conf->io_pool); + + if (!list_empty(&log->no_mem_stripes)) { + struct stripe_head *sh = list_first_entry(&log->no_mem_stripes, + struct stripe_head, + log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } + + spin_unlock_irqrestore(&log->io_list_lock, flags); +} + +void ppl_stripe_write_finished(struct stripe_head *sh) +{ + struct ppl_io_unit *io; + + io = sh->ppl_io; + sh->ppl_io = NULL; + + if (io && atomic_dec_and_test(&io->pending_stripes)) + ppl_io_unit_finished(io); +} + +static void __ppl_exit_log(struct ppl_conf *ppl_conf) +{ + clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); + + kfree(ppl_conf->child_logs); + + mempool_destroy(ppl_conf->meta_pool); + if (ppl_conf->bs) + bioset_free(ppl_conf->bs); + mempool_destroy(ppl_conf->io_pool); + kmem_cache_destroy(ppl_conf->io_kc); + + kfree(ppl_conf); +} + +void ppl_exit_log(struct r5conf *conf) +{ + struct ppl_conf *ppl_conf = conf->log_private; + + if (ppl_conf) { + __ppl_exit_log(ppl_conf); + conf->log_private = NULL; + } +} + +static int ppl_validate_rdev(struct md_rdev *rdev) +{ + char b[BDEVNAME_SIZE]; + int ppl_data_sectors; + int ppl_size_new; + + /* + * The configured PPL size must be enough to store + * the header and (at the very least) partial parity + * for one stripe. Round it down to ensure the data + * space is cleanly divisible by stripe size. 
+ */ + ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9); + + if (ppl_data_sectors > 0) + ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS); + + if (ppl_data_sectors <= 0) { + pr_warn("md/raid:%s: PPL space too small on %s\n", + mdname(rdev->mddev), bdevname(rdev->bdev, b)); + return -ENOSPC; + } + + ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9); + + if ((rdev->ppl.sector < rdev->data_offset && + rdev->ppl.sector + ppl_size_new > rdev->data_offset) || + (rdev->ppl.sector >= rdev->data_offset && + rdev->data_offset + rdev->sectors > rdev->ppl.sector)) { + pr_warn("md/raid:%s: PPL space overlaps with data on %s\n", + mdname(rdev->mddev), bdevname(rdev->bdev, b)); + return -EINVAL; + } + + if (!rdev->mddev->external && + ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) || + (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) { + pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n", + mdname(rdev->mddev), bdevname(rdev->bdev, b)); + return -EINVAL; + } + + rdev->ppl.size = ppl_size_new; + + return 0; +} + +int ppl_init_log(struct r5conf *conf) +{ + struct ppl_conf *ppl_conf; + struct mddev *mddev = conf->mddev; + int ret = 0; + int i; + bool need_cache_flush; + + pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n", + mdname(conf->mddev)); + + if (PAGE_SIZE != 4096) + return -EINVAL; + + if (mddev->level != 5) { + pr_warn("md/raid:%s PPL is not compatible with raid level %d\n", + mdname(mddev), mddev->level); + return -EINVAL; + } + + if (mddev->bitmap_info.file || mddev->bitmap_info.offset) { + pr_warn("md/raid:%s PPL is not compatible with bitmap\n", + mdname(mddev)); + return -EINVAL; + } + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_warn("md/raid:%s PPL is not compatible with journal\n", + mdname(mddev)); + return -EINVAL; + } + + ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL); + if (!ppl_conf) + return -ENOMEM; + + ppl_conf->mddev = mddev; + + ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0); + if (!ppl_conf->io_kc) { + ret = -EINVAL; + goto err; + } + + ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc); + if (!ppl_conf->io_pool) { + ret = -EINVAL; + goto err; + } + + ppl_conf->bs = bioset_create(conf->raid_disks, 0); + if (!ppl_conf->bs) { + ret = -EINVAL; + goto err; + } + + ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0); + if (!ppl_conf->meta_pool) { + ret = -EINVAL; + goto err; + } + + ppl_conf->count = conf->raid_disks; + ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log), + GFP_KERNEL); + if (!ppl_conf->child_logs) { + ret = -ENOMEM; + goto err; + } + + atomic64_set(&ppl_conf->seq, 0); + + if (!mddev->external) { + ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); + ppl_conf->block_size = 512; + } else { + ppl_conf->block_size = queue_logical_block_size(mddev->queue); + } + + for (i = 0; i < ppl_conf->count; i++) { + struct ppl_log *log = &ppl_conf->child_logs[i]; + struct md_rdev *rdev = conf->disks[i].rdev; + + mutex_init(&log->io_mutex); + spin_lock_init(&log->io_list_lock); + INIT_LIST_HEAD(&log->io_list); + INIT_LIST_HEAD(&log->no_mem_stripes); + + log->ppl_conf = ppl_conf; + log->rdev = rdev; + + if (rdev) { + struct request_queue *q; + + ret = ppl_validate_rdev(rdev); + if (ret) + goto err; + + q = bdev_get_queue(rdev->bdev); + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + need_cache_flush = true; + } + } + + if (need_cache_flush) + pr_warn("md/raid:%s: Volatile write-back cache 
should be disabled on all member drives when using PPL!\n", + mdname(mddev)); + + conf->log_private = ppl_conf; + + return 0; +err: + __ppl_exit_log(ppl_conf); + return ret; +} diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f575f40d2acb..6b86e0826afe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -482,6 +482,11 @@ static void shrink_buffers(struct stripe_head *sh) sh->dev[i].page = NULL; put_page(p); } + + if (sh->ppl_page) { + put_page(sh->ppl_page); + sh->ppl_page = NULL; + } } static int grow_buffers(struct stripe_head *sh, gfp_t gfp) @@ -498,6 +503,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) sh->dev[i].page = page; sh->dev[i].orig_page = page; } + + if (raid5_has_ppl(sh->raid_conf)) { + sh->ppl_page = alloc_page(gfp); + if (!sh->ppl_page) + return 1; + } + return 0; } @@ -746,7 +758,7 @@ static bool stripe_can_batch(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; - if (conf->log) + if (conf->log || raid5_has_ppl(conf)) return false; return test_bit(STRIPE_BATCH_READY, &sh->state) && !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && @@ -2093,6 +2105,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) async_tx_ack(tx); } + if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) + tx = ops_run_partial_parity(sh, percpu, tx); + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { if (level < 6) tx = ops_run_prexor5(sh, percpu, tx); @@ -3168,6 +3183,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } + if (raid5_has_ppl(sh->raid_conf) && + test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && + !test_bit(STRIPE_FULL_WRITE, &sh->state) && + test_bit(R5_Insync, &sh->dev[pd_idx].flags)) + set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, s->locked, s->ops_request); @@ -3215,6 +3236,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) goto overlap; + if (forwrite && raid5_has_ppl(conf)) { + /* + * With PPL only writes to consecutive data chunks within a + * stripe are allowed because for a single stripe_head we can + * only have one PPL entry at a time, which describes one data + * range. Not really an overlap, but wait_for_overlap can be + * used to handle this. 
+ */ + sector_t sector; + sector_t first = 0; + sector_t last = 0; + int count = 0; + int i; + + for (i = 0; i < sh->disks; i++) { + if (i != sh->pd_idx && + (i == dd_idx || sh->dev[i].towrite)) { + sector = sh->dev[i].sector; + if (count == 0 || sector < first) + first = sector; + if (sector > last) + last = sector; + count++; + } + } + + if (first + conf->chunk_sectors * (count - 1) != last) + goto overlap; + } + if (!forwrite || previous) clear_bit(STRIPE_BATCH_READY, &sh->state); @@ -7208,6 +7259,13 @@ static int raid5_run(struct mddev *mddev) BUG_ON(mddev->delta_disks != 0); } + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && + test_bit(MD_HAS_PPL, &mddev->flags)) { + pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", + mdname(mddev)); + clear_bit(MD_HAS_PPL, &mddev->flags); + } + if (mddev->private == NULL) conf = setup_conf(mddev); else @@ -7689,7 +7747,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) sector_t newsize; struct r5conf *conf = mddev->private; - if (conf->log) + if (conf->log || raid5_has_ppl(conf)) return -EINVAL; sectors &= ~((sector_t)conf->chunk_sectors - 1); newsize = raid5_size(mddev, sectors, mddev->raid_disks); @@ -7740,7 +7798,7 @@ static int check_reshape(struct mddev *mddev) { struct r5conf *conf = mddev->private; - if (conf->log) + if (conf->log || raid5_has_ppl(conf)) return -EINVAL; if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 6dd295a80ee1..ba5b7a3790af 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -224,10 +224,16 @@ struct stripe_head { spinlock_t batch_lock; /* only header's lock is useful */ struct list_head batch_list; /* protected by head's batch lock*/ - struct r5l_io_unit *log_io; + union { + struct r5l_io_unit *log_io; + struct ppl_io_unit *ppl_io; + }; + struct list_head log_list; sector_t log_start; /* first meta block on the journal */ struct list_head r5c; /* for r5c_cache->stripe_in_journal */ + + struct page *ppl_page; /* partial parity of this stripe */ /** * struct stripe_operations * @target - STRIPE_OP_COMPUTE_BLK target @@ -400,6 +406,7 @@ enum { STRIPE_OP_BIODRAIN, STRIPE_OP_RECONSTRUCT, STRIPE_OP_CHECK, + STRIPE_OP_PARTIAL_PARITY, }; /* @@ -696,6 +703,7 @@ struct r5conf { int group_cnt; int worker_cnt_per_group; struct r5l_log *log; + void *log_private; spinlock_t pending_bios_lock; bool batch_bio_dispatch; diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index fe2112810c43..d9a1ead867b9 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -398,4 +398,31 @@ struct r5l_meta_block { #define R5LOG_VERSION 0x1 #define R5LOG_MAGIC 0x6433c509 + +struct ppl_header_entry { + __le64 data_sector; /* raid sector of the new data */ + __le32 pp_size; /* length of partial parity */ + __le32 data_size; /* length of data */ + __le32 parity_disk; /* member disk containing parity */ + __le32 checksum; /* checksum of partial parity data for this + * entry (~crc32c) */ +} __attribute__ ((__packed__)); + +#define PPL_HEADER_SIZE 4096 +#define PPL_HDR_RESERVED 512 +#define PPL_HDR_ENTRY_SPACE \ + (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(u32) - sizeof(u64)) +#define PPL_HDR_MAX_ENTRIES \ + (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry)) + +struct ppl_header { + __u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */ + __le32 signature; /* signature (family number of volume) */ + __le32 padding; /* zero pad */ + __le64 
generation; /* generation number of the header */ + __le32 entries_count; /* number of entries in entry array */ + __le32 checksum; /* checksum of the header (~crc32c) */ + struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES]; +} __attribute__ ((__packed__)); + #endif From 664aed04446c7f653d8acbe2cdf7989f28238524 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 10:00:00 +0100 Subject: [PATCH 12/84] md: add sysfs entries for PPL Add 'consistency_policy' attribute for array. It indicates how the array maintains consistency in case of unexpected shutdown. Add 'ppl_sector' and 'ppl_size' for rdev, which describe the location and size of the PPL space on the device. They can't be changed for active members if the array is started and PPL is enabled, so in the setter functions only basic checks are performed. More checks are done in ppl_validate_rdev() when starting the log. These attributes are writable to allow enabling PPL for external metadata arrays and (later) to enable/disable PPL for a running array. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- Documentation/admin-guide/md.rst | 32 ++++++++- drivers/md/md.c | 115 +++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index 1e61bf50595c..84de718f24a4 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -276,14 +276,14 @@ All md devices contain: array creation it will default to 0, though starting the array as ``clean`` will set it much larger. - new_dev + new_dev This file can be written but not read. The value written should be a block device number as major:minor. e.g. 8:0 This will cause that device to be attached to the array, if it is available. It will then appear at md/dev-XXX (depending on the name of the device) and further configuration is then possible. - safe_mode_delay + safe_mode_delay When an md array has seen no write requests for a certain period of time, it will be marked as ``clean``. When another write request arrives, the array is marked as ``dirty`` before the write @@ -292,7 +292,7 @@ All md devices contain: period as a number of seconds. The default is 200msec (0.200). Writing a value of 0 disables safemode. - array_state + array_state This file contains a single word which describes the current state of the array. In many cases, the state can be set by writing the word for the desired state, however some states @@ -401,7 +401,30 @@ All md devices contain: once the array becomes non-degraded, and this fact has been recorded in the metadata. + consistency_policy + This indicates how the array maintains consistency in case of unexpected + shutdown. It can be: + none + Array has no redundancy information, e.g. raid0, linear. + + resync + Full resync is performed and all redundancy is regenerated when the + array is started after unclean shutdown. + + bitmap + Resync assisted by a write-intent bitmap. + + journal + For raid4/5/6, journal device is used to log transactions and replay + after unclean shutdown. + + ppl + For raid5 only, Partial Parity Log is used to close the write hole and + eliminate resync. + + The accepted values when writing to this file are ``ppl`` and ``resync``, + used to enable and disable PPL. As component devices are added to an md array, they appear in the ``md`` @@ -563,6 +586,9 @@ Each directory contains: adds bad blocks without acknowledging them. This is largely for testing. 
+ ppl_sector, ppl_size + Location and size (in sectors) of the space used for Partial Parity Log + on this device. An active md device will also contain an entry for each active device diff --git a/drivers/md/md.c b/drivers/md/md.c index d57045996d35..a7740306cbbd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3150,6 +3150,78 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) static struct rdev_sysfs_entry rdev_unack_bad_blocks = __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); +static ssize_t +ppl_sector_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); +} + +static ssize_t +ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned long long sector; + + if (kstrtoull(buf, 10, §or) < 0) + return -EINVAL; + if (sector != (sector_t)sector) + return -EINVAL; + + if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && + rdev->raid_disk >= 0) + return -EBUSY; + + if (rdev->mddev->persistent) { + if (rdev->mddev->major_version == 0) + return -EINVAL; + if ((sector > rdev->sb_start && + sector - rdev->sb_start > S16_MAX) || + (sector < rdev->sb_start && + rdev->sb_start - sector > -S16_MIN)) + return -EINVAL; + rdev->ppl.offset = sector - rdev->sb_start; + } else if (!rdev->mddev->external) { + return -EBUSY; + } + rdev->ppl.sector = sector; + return len; +} + +static struct rdev_sysfs_entry rdev_ppl_sector = +__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); + +static ssize_t +ppl_size_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%u\n", rdev->ppl.size); +} + +static ssize_t +ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned int size; + + if (kstrtouint(buf, 10, &size) < 0) + return -EINVAL; + + if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && + rdev->raid_disk >= 0) + return -EBUSY; + + if (rdev->mddev->persistent) { + if (rdev->mddev->major_version == 0) + return -EINVAL; + if (size > U16_MAX) + return -EINVAL; + } else if (!rdev->mddev->external) { + return -EBUSY; + } + rdev->ppl.size = size; + return len; +} + +static struct rdev_sysfs_entry rdev_ppl_size = +__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, @@ -3160,6 +3232,8 @@ static struct attribute *rdev_default_attrs[] = { &rdev_recovery_start.attr, &rdev_bad_blocks.attr, &rdev_unack_bad_blocks.attr, + &rdev_ppl_sector.attr, + &rdev_ppl_size.attr, NULL, }; static ssize_t @@ -4896,6 +4970,46 @@ static struct md_sysfs_entry md_array_size = __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, array_size_store); +static ssize_t +consistency_policy_show(struct mddev *mddev, char *page) +{ + int ret; + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + ret = sprintf(page, "journal\n"); + } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { + ret = sprintf(page, "ppl\n"); + } else if (mddev->bitmap) { + ret = sprintf(page, "bitmap\n"); + } else if (mddev->pers) { + if (mddev->pers->sync_request) + ret = sprintf(page, "resync\n"); + else + ret = sprintf(page, "none\n"); + } else { + ret = sprintf(page, "unknown\n"); + } + + return ret; +} + +static ssize_t +consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->pers) { + return -EBUSY; + } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { + set_bit(MD_HAS_PPL, &mddev->flags); + return len; + } else 
{ + return -EINVAL; + } +} + +static struct md_sysfs_entry md_consistency_policy = +__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, + consistency_policy_store); + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, @@ -4911,6 +5025,7 @@ static struct attribute *md_default_attrs[] = { &md_reshape_direction.attr, &md_array_size.attr, &max_corr_read_errors.attr, + &md_consistency_policy.attr, NULL, }; From 4536bf9ba2d03404655586b07f8830b6f2106242 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 10:00:01 +0100 Subject: [PATCH 13/84] raid5-ppl: load and recover the log Load the log from each disk when starting the array and recover if the array is dirty. The initial empty PPL is written by mdadm. When loading the log we verify the header checksum and signature. For external metadata arrays the signature is verified in userspace, so here we read it from the header, verifying only if it matches on all disks, and use it later when writing PPL. In addition to the header checksum, each header entry also contains a checksum of its partial parity data. If the header is valid, recovery is performed for each entry until an invalid entry is found. If the array is not degraded and recovery using PPL fully succeeds, there is no need to resync the array because data and parity will be consistent, so in this case resync will be disabled. Due to compatibility with IMSM implementations on other systems, we can't assume that the recovery data block size is always 4K. Writes generated by MD raid5 don't have this issue, but when recovering PPL written in other environments it is possible to have entries with 512-byte sector granularity. The recovery code takes this into account and also the logical sector size of the underlying drives. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 489 +++++++++++++++++++++++++++++++++++++++++ drivers/md/raid5.c | 5 +- 2 files changed, 493 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index db5b72b11594..d336c024eef9 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -103,6 +103,10 @@ struct ppl_conf { mempool_t *io_pool; struct bio_set *bs; mempool_t *meta_pool; + + /* used only for recovery */ + int recovered_entries; + int mismatch_count; }; struct ppl_log { @@ -514,6 +518,474 @@ void ppl_stripe_write_finished(struct stripe_head *sh) ppl_io_unit_finished(io); } +static void ppl_xor(int size, struct page *page1, struct page *page2) +{ + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + struct page *xor_srcs[] = { page1, page2 }; + + init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST, + NULL, NULL, NULL, NULL); + tx = async_xor(page1, xor_srcs, 0, 2, size, &submit); + + async_tx_quiesce(&tx); +} + +/* + * PPL recovery strategy: xor partial parity and data from all modified data + * disks within a stripe and write the result as the new stripe parity. If all + * stripe data disks are modified (full stripe write), no partial parity is + * available, so just xor the data disks. + * + * Recovery of a PPL entry shall occur only if all modified data disks are + * available and read from all of them succeeds. + * + * A PPL entry applies to a stripe, partial parity size for an entry is at most + * the size of the chunk. 
Examples of possible cases for a single entry: + * + * case 0: single data disk write: + * data0 data1 data2 ppl parity + * +--------+--------+--------+ +--------------------+ + * | ------ | ------ | ------ | +----+ | (no change) | + * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | + * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | + * | ------ | ------ | ------ | +----+ | (no change) | + * +--------+--------+--------+ +--------------------+ + * pp_size = data_size + * + * case 1: more than one data disk write: + * data0 data1 data2 ppl parity + * +--------+--------+--------+ +--------------------+ + * | ------ | ------ | ------ | +----+ | (no change) | + * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | + * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | + * | ------ | ------ | ------ | +----+ | (no change) | + * +--------+--------+--------+ +--------------------+ + * pp_size = data_size / modified_data_disks + * + * case 2: write to all data disks (also full stripe write): + * data0 data1 data2 parity + * +--------+--------+--------+ +--------------------+ + * | ------ | ------ | ------ | | (no change) | + * | -data- | -data- | -data- | --------> | xor all data | + * | ------ | ------ | ------ | --------> | (no change) | + * | ------ | ------ | ------ | | (no change) | + * +--------+--------+--------+ +--------------------+ + * pp_size = 0 + * + * The following cases are possible only in other implementations. The recovery + * code can handle them, but they are not generated at runtime because they can + * be reduced to cases 0, 1 and 2: + * + * case 3: + * data0 data1 data2 ppl parity + * +--------+--------+--------+ +----+ +--------------------+ + * | ------ | -data- | -data- | | pp | | data1 ^ data2 ^ pp | + * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp | + * | -data- | -data- | -data- | | -- | -> | xor all data | + * | -data- | -data- | ------ | | pp | | data0 ^ data1 ^ pp | + * +--------+--------+--------+ +----+ +--------------------+ + * pp_size = chunk_size + * + * case 4: + * data0 data1 data2 ppl parity + * +--------+--------+--------+ +----+ +--------------------+ + * | ------ | -data- | ------ | | pp | | data1 ^ pp | + * | ------ | ------ | ------ | | -- | -> | (no change) | + * | ------ | ------ | ------ | | -- | -> | (no change) | + * | -data- | ------ | ------ | | pp | | data0 ^ pp | + * +--------+--------+--------+ +----+ +--------------------+ + * pp_size = chunk_size + */ +static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, + sector_t ppl_sector) +{ + struct ppl_conf *ppl_conf = log->ppl_conf; + struct mddev *mddev = ppl_conf->mddev; + struct r5conf *conf = mddev->private; + int block_size = ppl_conf->block_size; + struct page *page1; + struct page *page2; + sector_t r_sector_first; + sector_t r_sector_last; + int strip_sectors; + int data_disks; + int i; + int ret = 0; + char b[BDEVNAME_SIZE]; + unsigned int pp_size = le32_to_cpu(e->pp_size); + unsigned int data_size = le32_to_cpu(e->data_size); + + page1 = alloc_page(GFP_KERNEL); + page2 = alloc_page(GFP_KERNEL); + + if (!page1 || !page2) { + ret = -ENOMEM; + goto out; + } + + r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9); + + if ((pp_size >> 9) < conf->chunk_sectors) { + if (pp_size > 0) { + data_disks = data_size / pp_size; + strip_sectors = pp_size >> 9; + } else { + data_disks = conf->raid_disks - conf->max_degraded; + strip_sectors = (data_size >> 9) / data_disks; + } + r_sector_last = r_sector_first + + 
(data_disks - 1) * conf->chunk_sectors + + strip_sectors; + } else { + data_disks = conf->raid_disks - conf->max_degraded; + strip_sectors = conf->chunk_sectors; + r_sector_last = r_sector_first + (data_size >> 9); + } + + pr_debug("%s: array sector first: %llu last: %llu\n", __func__, + (unsigned long long)r_sector_first, + (unsigned long long)r_sector_last); + + /* if start and end is 4k aligned, use a 4k block */ + if (block_size == 512 && + (r_sector_first & (STRIPE_SECTORS - 1)) == 0 && + (r_sector_last & (STRIPE_SECTORS - 1)) == 0) + block_size = STRIPE_SIZE; + + /* iterate through blocks in strip */ + for (i = 0; i < strip_sectors; i += (block_size >> 9)) { + bool update_parity = false; + sector_t parity_sector; + struct md_rdev *parity_rdev; + struct stripe_head sh; + int disk; + int indent = 0; + + pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i); + indent += 2; + + memset(page_address(page1), 0, PAGE_SIZE); + + /* iterate through data member disks */ + for (disk = 0; disk < data_disks; disk++) { + int dd_idx; + struct md_rdev *rdev; + sector_t sector; + sector_t r_sector = r_sector_first + i + + (disk * conf->chunk_sectors); + + pr_debug("%s:%*s data member disk %d start\n", + __func__, indent, "", disk); + indent += 2; + + if (r_sector >= r_sector_last) { + pr_debug("%s:%*s array sector %llu doesn't need parity update\n", + __func__, indent, "", + (unsigned long long)r_sector); + indent -= 2; + continue; + } + + update_parity = true; + + /* map raid sector to member disk */ + sector = raid5_compute_sector(conf, r_sector, 0, + &dd_idx, NULL); + pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n", + __func__, indent, "", + (unsigned long long)r_sector, dd_idx, + (unsigned long long)sector); + + rdev = conf->disks[dd_idx].rdev; + if (!rdev) { + pr_debug("%s:%*s data member disk %d missing\n", + __func__, indent, "", dd_idx); + update_parity = false; + break; + } + + pr_debug("%s:%*s reading data member disk %s sector %llu\n", + __func__, indent, "", bdevname(rdev->bdev, b), + (unsigned long long)sector); + if (!sync_page_io(rdev, sector, block_size, page2, + REQ_OP_READ, 0, false)) { + md_error(mddev, rdev); + pr_debug("%s:%*s read failed!\n", __func__, + indent, ""); + ret = -EIO; + goto out; + } + + ppl_xor(block_size, page1, page2); + + indent -= 2; + } + + if (!update_parity) + continue; + + if (pp_size > 0) { + pr_debug("%s:%*s reading pp disk sector %llu\n", + __func__, indent, "", + (unsigned long long)(ppl_sector + i)); + if (!sync_page_io(log->rdev, + ppl_sector - log->rdev->data_offset + i, + block_size, page2, REQ_OP_READ, 0, + false)) { + pr_debug("%s:%*s read failed!\n", __func__, + indent, ""); + md_error(mddev, log->rdev); + ret = -EIO; + goto out; + } + + ppl_xor(block_size, page1, page2); + } + + /* map raid sector to parity disk */ + parity_sector = raid5_compute_sector(conf, r_sector_first + i, + 0, &disk, &sh); + BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk)); + parity_rdev = conf->disks[sh.pd_idx].rdev; + + BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); + pr_debug("%s:%*s write parity at sector %llu, disk %s\n", + __func__, indent, "", + (unsigned long long)parity_sector, + bdevname(parity_rdev->bdev, b)); + if (!sync_page_io(parity_rdev, parity_sector, block_size, + page1, REQ_OP_WRITE, 0, false)) { + pr_debug("%s:%*s parity write error!\n", __func__, + indent, ""); + md_error(mddev, parity_rdev); + ret = -EIO; + goto out; + } + } +out: + if (page1) + __free_page(page1); + if (page2) + 
__free_page(page2); + return ret; +} + +static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr) +{ + struct ppl_conf *ppl_conf = log->ppl_conf; + struct md_rdev *rdev = log->rdev; + struct mddev *mddev = rdev->mddev; + sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9); + struct page *page; + int i; + int ret = 0; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + /* iterate through all PPL entries saved */ + for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) { + struct ppl_header_entry *e = &pplhdr->entries[i]; + u32 pp_size = le32_to_cpu(e->pp_size); + sector_t sector = ppl_sector; + int ppl_entry_sectors = pp_size >> 9; + u32 crc, crc_stored; + + pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n", + __func__, rdev->raid_disk, i, + (unsigned long long)ppl_sector, pp_size); + + crc = ~0; + crc_stored = le32_to_cpu(e->checksum); + + /* read parial parity for this entry and calculate its checksum */ + while (pp_size) { + int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size; + + if (!sync_page_io(rdev, sector - rdev->data_offset, + s, page, REQ_OP_READ, 0, false)) { + md_error(mddev, rdev); + ret = -EIO; + goto out; + } + + crc = crc32c_le(crc, page_address(page), s); + + pp_size -= s; + sector += s >> 9; + } + + crc = ~crc; + + if (crc != crc_stored) { + /* + * Don't recover this entry if the checksum does not + * match, but keep going and try to recover other + * entries. + */ + pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n", + __func__, crc_stored, crc); + ppl_conf->mismatch_count++; + } else { + ret = ppl_recover_entry(log, e, ppl_sector); + if (ret) + goto out; + ppl_conf->recovered_entries++; + } + + ppl_sector += ppl_entry_sectors; + } + + /* flush the disk cache after recovery if necessary */ + ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL); +out: + __free_page(page); + return ret; +} + +static int ppl_write_empty_header(struct ppl_log *log) +{ + struct page *page; + struct ppl_header *pplhdr; + struct md_rdev *rdev = log->rdev; + int ret = 0; + + pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__, + rdev->raid_disk, (unsigned long long)rdev->ppl.sector); + + page = alloc_page(GFP_NOIO | __GFP_ZERO); + if (!page) + return -ENOMEM; + + pplhdr = page_address(page); + memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); + pplhdr->signature = cpu_to_le32(log->ppl_conf->signature); + pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE)); + + if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, + PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0, + false)) { + md_error(rdev->mddev, rdev); + ret = -EIO; + } + + __free_page(page); + return ret; +} + +static int ppl_load_distributed(struct ppl_log *log) +{ + struct ppl_conf *ppl_conf = log->ppl_conf; + struct md_rdev *rdev = log->rdev; + struct mddev *mddev = rdev->mddev; + struct page *page; + struct ppl_header *pplhdr; + u32 crc, crc_stored; + u32 signature; + int ret = 0; + + pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk); + + /* read PPL header */ + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, + PAGE_SIZE, page, REQ_OP_READ, 0, false)) { + md_error(mddev, rdev); + ret = -EIO; + goto out; + } + pplhdr = page_address(page); + + /* check header validity */ + crc_stored = le32_to_cpu(pplhdr->checksum); + pplhdr->checksum = 0; + crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + + if (crc_stored != crc) { + pr_debug("%s: ppl header crc does not 
match: stored: 0x%x calculated: 0x%x\n", + __func__, crc_stored, crc); + ppl_conf->mismatch_count++; + goto out; + } + + signature = le32_to_cpu(pplhdr->signature); + + if (mddev->external) { + /* + * For external metadata the header signature is set and + * validated in userspace. + */ + ppl_conf->signature = signature; + } else if (ppl_conf->signature != signature) { + pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n", + __func__, signature, ppl_conf->signature); + ppl_conf->mismatch_count++; + goto out; + } + + /* attempt to recover from log if we are starting a dirty array */ + if (!mddev->pers && mddev->recovery_cp != MaxSector) + ret = ppl_recover(log, pplhdr); +out: + /* write empty header if we are starting the array */ + if (!ret && !mddev->pers) + ret = ppl_write_empty_header(log); + + __free_page(page); + + pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", + __func__, ret, ppl_conf->mismatch_count, + ppl_conf->recovered_entries); + return ret; +} + +static int ppl_load(struct ppl_conf *ppl_conf) +{ + int ret = 0; + u32 signature = 0; + bool signature_set = false; + int i; + + for (i = 0; i < ppl_conf->count; i++) { + struct ppl_log *log = &ppl_conf->child_logs[i]; + + /* skip missing drive */ + if (!log->rdev) + continue; + + ret = ppl_load_distributed(log); + if (ret) + break; + + /* + * For external metadata we can't check if the signature is + * correct on a single drive, but we can check if it is the same + * on all drives. + */ + if (ppl_conf->mddev->external) { + if (!signature_set) { + signature = ppl_conf->signature; + signature_set = true; + } else if (signature != ppl_conf->signature) { + pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n", + mdname(ppl_conf->mddev)); + ret = -EINVAL; + break; + } + } + } + + pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", + __func__, ret, ppl_conf->mismatch_count, + ppl_conf->recovered_entries); + return ret; +} + static void __ppl_exit_log(struct ppl_conf *ppl_conf) { clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); @@ -694,6 +1166,23 @@ int ppl_init_log(struct r5conf *conf) pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n", mdname(mddev)); + /* load and possibly recover the logs from the member disks */ + ret = ppl_load(ppl_conf); + + if (ret) { + goto err; + } else if (!mddev->pers && + mddev->recovery_cp == 0 && !mddev->degraded && + ppl_conf->recovered_entries > 0 && + ppl_conf->mismatch_count == 0) { + /* + * If we are starting a dirty array and the recovery succeeds + * without any issues, set the array as clean. 
+ */ + mddev->recovery_cp = MaxSector; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + } + conf->log_private = ppl_conf; return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6b86e0826afe..78ed5748d33d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7357,7 +7357,10 @@ static int raid5_run(struct mddev *mddev) if (mddev->degraded > dirty_parity_disks && mddev->recovery_cp != MaxSector) { - if (mddev->ok_start_degraded) + if (test_bit(MD_HAS_PPL, &mddev->flags)) + pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", + mdname(mddev)); + else if (mddev->ok_start_degraded) pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", mdname(mddev)); else { From 6358c239d88c751a9f14152a8d4ad2b69f5be48f Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 10:00:02 +0100 Subject: [PATCH 14/84] raid5-ppl: support disk hot add/remove with PPL Add a function to modify the log by removing an rdev when a drive fails or adding when a spare/replacement is activated as a raid member. Removing a disk just clears the child log rdev pointer. No new stripes will be accepted for this child log in ppl_write_stripe() and running io units will be processed without writing PPL to the device. Adding a disk sets the child log rdev pointer and writes an empty PPL header. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-log.h | 9 +++++++++ drivers/md/raid5-ppl.c | 45 +++++++++++++++++++++++++++++++++++++++++- drivers/md/raid5.c | 12 ++++++++++- 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index a67fb58513b9..4f5a0f4e0b1f 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -39,6 +39,7 @@ extern void ppl_exit_log(struct r5conf *conf); extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); extern void ppl_write_stripe_run(struct r5conf *conf); extern void ppl_stripe_write_finished(struct stripe_head *sh); +extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); static inline bool raid5_has_ppl(struct r5conf *conf) { @@ -102,4 +103,12 @@ static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev) return 0; } +static inline int log_modify(struct r5conf *conf, struct md_rdev *rdev, bool add) +{ + if (raid5_has_ppl(conf)) + return ppl_modify_log(conf, rdev, add); + + return 0; +} + #endif diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index d336c024eef9..4af420f4d8c0 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -400,6 +400,13 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) struct stripe_head *sh; int i; + bio->bi_private = io; + + if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) { + ppl_log_endio(bio); + return; + } + for (i = 0; i < io->entries_count; i++) { struct ppl_header_entry *e = &pplhdr->entries[i]; @@ -415,7 +422,6 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) pplhdr->entries_count = cpu_to_le32(io->entries_count); pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); - bio->bi_private = io; bio->bi_end_io = ppl_log_endio; bio->bi_opf = REQ_OP_WRITE | REQ_FUA; bio->bi_bdev = log->rdev->bdev; @@ -1190,3 +1196,40 @@ err: __ppl_exit_log(ppl_conf); return ret; } + +int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) +{ + struct ppl_conf *ppl_conf = conf->log_private; + struct ppl_log *log; + int ret = 0; + char b[BDEVNAME_SIZE]; + + if (!rdev) + return -EINVAL; + + 
pr_debug("%s: disk: %d operation: %s dev: %s\n", + __func__, rdev->raid_disk, add ? "add" : "remove", + bdevname(rdev->bdev, b)); + + if (rdev->raid_disk < 0) + return 0; + + if (rdev->raid_disk >= ppl_conf->count) + return -ENODEV; + + log = &ppl_conf->child_logs[rdev->raid_disk]; + + mutex_lock(&log->io_mutex); + if (add) { + ret = ppl_validate_rdev(rdev); + if (!ret) { + log->rdev = rdev; + ret = ppl_write_empty_header(log); + } + } else { + log->rdev = NULL; + } + mutex_unlock(&log->io_mutex); + + return ret; +} diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 78ed5748d33d..6760af251864 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7648,6 +7648,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) *rdevp = rdev; } } + if (!err) { + err = log_modify(conf, rdev, false); + if (err) + goto abort; + } if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; @@ -7657,6 +7662,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) */ p->replacement = NULL; clear_bit(WantReplacement, &rdev->flags); + + if (!err) + err = log_modify(conf, p->rdev, true); } else /* We might have just removed the Replacement as faulty- * clear the bit just in case @@ -7713,10 +7721,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (p->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; - err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); + + err = log_modify(conf, rdev, true); + goto out; } } From ba903a3ea465bd2f2bb9316054b295e79a7a518e Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 9 Mar 2017 10:00:03 +0100 Subject: [PATCH 15/84] raid5-ppl: runtime PPL enabling or disabling Allow writing to 'consistency_policy' attribute when the array is active. Add a new function 'change_consistency_policy' to the md_personality operations structure to handle the change in the personality code. Values "ppl" and "resync" are accepted and turn PPL on and off respectively. When enabling PPL its location and size should first be set using 'ppl_sector' and 'ppl_size' attributes and a valid PPL header should be written at this location on each member device. Enabling or disabling PPL is performed under a suspended array. The raid5_reset_stripe_cache function frees the stripe cache and allocates it again in order to allocate or free the ppl_pages for the stripes in the stripe cache. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 12 +++++++--- drivers/md/md.h | 2 ++ drivers/md/raid5-ppl.c | 4 ++++ drivers/md/raid5.c | 53 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a7740306cbbd..af9118711228 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4996,14 +4996,20 @@ consistency_policy_show(struct mddev *mddev, char *page) static ssize_t consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) { + int err = 0; + if (mddev->pers) { - return -EBUSY; + if (mddev->pers->change_consistency_policy) + err = mddev->pers->change_consistency_policy(mddev, buf); + else + err = -EBUSY; } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { set_bit(MD_HAS_PPL, &mddev->flags); - return len; } else { - return -EINVAL; + err = -EINVAL; } + + return err ? 
err : len; } static struct md_sysfs_entry md_consistency_policy = diff --git a/drivers/md/md.h b/drivers/md/md.h index a7b2f16452c4..e0940064c3ec 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -545,6 +545,8 @@ struct md_personality /* congested implements bdi.congested_fn(). * Will not be called while array is 'suspended' */ int (*congested)(struct mddev *mddev, int bits); + /* Changes the consistency policy of an active array. */ + int (*change_consistency_policy)(struct mddev *mddev, const char *buf); }; struct md_sysfs_entry { diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 4af420f4d8c0..27bad3e2d7ce 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1187,6 +1187,10 @@ int ppl_init_log(struct r5conf *conf) */ mddev->recovery_cp = MaxSector; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + } else if (mddev->pers && ppl_conf->mismatch_count > 0) { + /* no mismatch allowed when enabling PPL for a running array */ + ret = -EINVAL; + goto err; } conf->log_private = ppl_conf; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6760af251864..88cc8981bd49 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8334,6 +8334,58 @@ static void *raid6_takeover(struct mddev *mddev) return setup_conf(mddev); } +static void raid5_reset_stripe_cache(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + mutex_lock(&conf->cache_size_mutex); + while (conf->max_nr_stripes && + drop_one_stripe(conf)) + ; + while (conf->min_nr_stripes > conf->max_nr_stripes && + grow_one_stripe(conf, GFP_KERNEL)) + ; + mutex_unlock(&conf->cache_size_mutex); +} + +static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) +{ + struct r5conf *conf; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) { + mddev_unlock(mddev); + return -ENODEV; + } + + if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) { + mddev_suspend(mddev); + set_bit(MD_HAS_PPL, &mddev->flags); + err = log_init(conf, NULL); + if (!err) + raid5_reset_stripe_cache(mddev); + mddev_resume(mddev); + } else if (strncmp(buf, "resync", 6) == 0 && raid5_has_ppl(conf)) { + mddev_suspend(mddev); + log_exit(conf); + raid5_reset_stripe_cache(mddev); + mddev_resume(mddev); + } else { + err = -EINVAL; + } + + if (!err) + md_update_sb(mddev, 1); + + mddev_unlock(mddev); + + return err; +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -8379,6 +8431,7 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .congested = raid5_congested, + .change_consistency_policy = raid5_change_consistency_policy, }; static struct md_personality raid4_personality = From 2d4f468753777a29124f779cdffd1ec009d70597 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 7 Mar 2017 17:44:21 -0800 Subject: [PATCH 16/84] md/r5cache: handle R5LOG_PAYLOAD_FLUSH in recovery This patch adds handling of R5LOG_PAYLOAD_FLUSH in journal recovery. Next patch will add logic that generate R5LOG_PAYLOAD_FLUSH on flush finish. When R5LOG_PAYLOAD_FLUSH is seen in recovery, pending data and parity will be dropped from recovery. This will reduce the number of stripes to replay, and thus accelerate the recovery process. 
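As a rough illustration of the payload walk described above, the following stand-alone sketch uses cut-down stand-ins for the kernel's r5l_payload_* structures (no endianness conversion, no checksum verification, PAGE_SIZE assumed to be 4K) and only shows how a FLUSH payload is sized and consumed differently from a DATA/PARITY payload, and how the stripes it names are dropped from the set being replayed:

#include <stdint.h>
#include <stdio.h>

enum payload_type { PAYLOAD_DATA, PAYLOAD_PARITY, PAYLOAD_FLUSH };

struct payload_header {
	uint16_t type;
	uint16_t flags;
};

struct payload_data_parity {
	struct payload_header header;
	uint32_t size;          /* payload size in 512-byte sectors */
	uint64_t location;
	uint32_t checksum[];    /* one checksum per 4K page */
};

struct payload_flush {
	struct payload_header header;
	uint32_t size;          /* bytes used by flush_stripes[] */
	uint64_t flush_stripes[];
};

static void drop_cached_stripe(uint64_t sect)
{
	/* stands in for stripe lookup + reset + release during recovery */
	printf("dropping cached stripe at sector %llu\n",
	       (unsigned long long)sect);
}

/* walk all payloads in the byte range that follows the meta block header */
static void walk_payloads(const char *payloads, uint32_t bytes)
{
	uint32_t off = 0;

	while (off < bytes) {
		const struct payload_header *h =
			(const struct payload_header *)(payloads + off);

		if (h->type == PAYLOAD_FLUSH) {
			const struct payload_flush *f = (const void *)h;
			uint32_t n = f->size / sizeof(uint64_t);

			for (uint32_t i = 0; i < n; i++)
				drop_cached_stripe(f->flush_stripes[i]);

			off += sizeof(*f) + f->size;
		} else {
			const struct payload_data_parity *p = (const void *)h;

			/* size >> 3 sectors == number of 4K pages covered,
			 * and each page carries one 32-bit checksum */
			off += sizeof(*p) +
			       sizeof(uint32_t) * (p->size >> 3);
		}
	}
}

In the kernel the same walk starts at the end of the meta block header and runs up to meta_size; the sketch simply assumes it is handed the payload area directly.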
Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 47 +++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 5f82dabdda6f..f85ab2744bfb 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1959,6 +1959,7 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); struct page *page; struct r5l_payload_data_parity *payload; + struct r5l_payload_flush *payload_flush; page = alloc_page(GFP_KERNEL); if (!page) @@ -1966,6 +1967,7 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, while (mb_offset < le32_to_cpu(mb->meta_size)) { payload = (void *)mb + mb_offset; + payload_flush = (void *)mb + mb_offset; if (payload->header.type == R5LOG_PAYLOAD_DATA) { if (r5l_recovery_verify_data_checksum( @@ -1984,15 +1986,23 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, BLOCK_SECTORS), payload->checksum[1]) < 0) goto mismatch; - } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */ + } else if (payload->header.type == R5LOG_PAYLOAD_FLUSH) { + /* nothing to do for R5LOG_PAYLOAD_FLUSH here */ + } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ goto mismatch; - log_offset = r5l_ring_add(log, log_offset, - le32_to_cpu(payload->size)); + if (payload->header.type == R5LOG_PAYLOAD_FLUSH) { + mb_offset += sizeof(struct r5l_payload_flush) + + le32_to_cpu(payload_flush->size); + } else { + /* DATA or PARITY payload */ + log_offset = r5l_ring_add(log, log_offset, + le32_to_cpu(payload->size)); + mb_offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + } - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); } put_page(page); @@ -2020,6 +2030,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, struct r5conf *conf = mddev->private; struct r5l_meta_block *mb; struct r5l_payload_data_parity *payload; + struct r5l_payload_flush *payload_flush; int mb_offset; sector_t log_offset; sector_t stripe_sect; @@ -2045,6 +2056,30 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, int dd; payload = (void *)mb + mb_offset; + payload_flush = (void *)mb + mb_offset; + + if (payload->header.type == R5LOG_PAYLOAD_FLUSH) { + int i, count; + + count = le32_to_cpu(payload_flush->size) / sizeof(__le64); + for (i = 0; i < count; ++i) { + stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]); + sh = r5c_recovery_lookup_stripe(cached_stripe_list, + stripe_sect); + if (sh) { + WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); + r5l_recovery_reset_stripe(sh); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } + } + + mb_offset += sizeof(struct r5l_payload_flush) + + le32_to_cpu(payload_flush->size); + continue; + } + + /* DATA or PARITY payload */ stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? raid5_compute_sector( conf, le64_to_cpu(payload->location), 0, &dd, From ea17481fb48888fa11f412766bde36be9171247e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 9 Mar 2017 21:23:39 -0800 Subject: [PATCH 17/84] md/r5cache: generate R5LOG_PAYLOAD_FLUSH In r5c_finish_stripe_write_out(), R5LOG_PAYLOAD_FLUSH is append to log->current_io. Appending R5LOG_PAYLOAD_FLUSH in quiesce needs extra writes to journal. To simplify the logic, we just skip R5LOG_PAYLOAD_FLUSH in quiesce. 
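The append side can be pictured with the same simplified layout. The sketch below is only a model of the shape of this path (the io_mutex, r5l_get_meta() and the little-endian stores are left out, and the payload type value is a stand-in): bail out during quiesce, make sure the current metadata page has room, then emit a one-stripe FLUSH payload and advance the meta offset.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define META_PAGE_SIZE 4096u

struct flush_payload {
	uint16_t type;          /* FLUSH payload type, stand-in value below */
	uint16_t flags;
	uint32_t size;          /* bytes of the stripe list that follows */
	uint64_t stripe;        /* a single stripe sector in this sketch */
};

struct log_model {
	bool quiesce;           /* conf->quiesce stand-in */
	char meta_page[META_PAGE_SIZE];
	uint32_t meta_offset;   /* io->meta_offset stand-in */
};

/* returns true if the payload was appended */
static bool append_flush_payload(struct log_model *log, uint64_t sect)
{
	struct flush_payload p = {
		.type = 2,                 /* stand-in for the FLUSH type */
		.flags = 0,
		.size = sizeof(uint64_t),  /* one stripe per payload */
		.stripe = sect,
	};

	/* extra journal writes are not worth handling during quiesce */
	if (log->quiesce)
		return false;

	/* no room left: the kernel would start a new io_unit instead */
	if (log->meta_offset + sizeof(p) > META_PAGE_SIZE)
		return false;

	memcpy(log->meta_page + log->meta_offset, &p, sizeof(p));
	log->meta_offset += sizeof(p);
	return true;
}
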
Even R5LOG_PAYLOAD_FLUSH supports multiple stripes per payload. However, current implementation is one stripe per R5LOG_PAYLOAD_FLUSH, which is simpler. Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 45 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index f85ab2744bfb..64493132470b 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -593,7 +593,7 @@ static void r5l_log_endio(struct bio *bio) spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); - if (log->need_cache_flush) + if (log->need_cache_flush && !list_empty(&io->stripe_list)) r5l_move_to_end_ios(log); else r5l_log_run_stripes(log); @@ -621,9 +621,11 @@ static void r5l_log_endio(struct bio *bio) bio_endio(bi); atomic_dec(&io->pending_stripe); } - if (atomic_read(&io->pending_stripe) == 0) - __r5l_stripe_write_finished(io); } + + /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ + if (atomic_read(&io->pending_stripe) == 0) + __r5l_stripe_write_finished(io); } static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) @@ -845,6 +847,41 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page) r5_reserve_log_entry(log, io); } +static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + struct r5l_io_unit *io; + struct r5l_payload_flush *payload; + int meta_size; + + /* + * payload_flush requires extra writes to the journal. + * To avoid handling the extra IO in quiesce, just skip + * flush_payload + */ + if (conf->quiesce) + return; + + mutex_lock(&log->io_mutex); + meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64); + + if (r5l_get_meta(log, meta_size)) { + mutex_unlock(&log->io_mutex); + return; + } + + /* current implementation is one stripe per flush payload */ + io = log->current_io; + payload = page_address(io->meta_page) + io->meta_offset; + payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH); + payload->header.flags = cpu_to_le16(0); + payload->size = cpu_to_le32(sizeof(__le64)); + payload->flush_stripes[0] = cpu_to_le64(sect); + io->meta_offset += meta_size; + mutex_unlock(&log->io_mutex); +} + static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, int data_pages, int parity_pages) { @@ -2784,6 +2821,8 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, atomic_dec(&conf->r5c_flushing_full_stripes); atomic_dec(&conf->r5c_cached_full_stripes); } + + r5l_append_flush_payload(log, sh->sector); } int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) From 48df498daf62e1292868005675331929305067f0 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 14 Mar 2017 09:40:20 +0800 Subject: [PATCH 18/84] md: move bitmap_destroy to the beginning of __md_stop Since we have switched to sync way to handle METADATA_UPDATED msg for md-cluster, then process_metadata_update is depended on mddev->thread->wqueue. With the new change, clustered raid could possible hang if array received a METADATA_UPDATED msg after array unregistered mddev->thread, so we need to stop clustered raid (bitmap_destroy -> bitmap_free -> md_cluster_stop) earlier than unregister thread (mddev_detach -> md_unregister_thread). And this change should be safe for non-clustered raid since all writes are stopped before the destroy. 
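The helper introduced here has a very small contract, which the toy model below illustrates (plain C11 atomics and polling in place of the kernel's wait on bitmap->behind_wait): callers block until the count of outstanding behind writes reaches zero, and with the new ordering both bitmap_destroy() and mddev_detach() go through it, so the old behaviour for behind IO is preserved.

#include <stdatomic.h>
#include <stdio.h>
#include <sched.h>

static atomic_int behind_writes;   /* bitmap->behind_writes stand-in */

static void behind_write_start(void) { atomic_fetch_add(&behind_writes, 1); }
static void behind_write_end(void)   { atomic_fetch_sub(&behind_writes, 1); }

static void wait_behind_writes(const char *who)
{
	if (atomic_load(&behind_writes) > 0)
		printf("%s: behind writes in progress - waiting to stop\n", who);
	while (atomic_load(&behind_writes) > 0)
		sched_yield();
}

/* teardown order modelled after the patch: stop the bitmap (and with it
 * any cluster messaging) before detaching and unregistering the thread */
static void stop_array_model(void)
{
	wait_behind_writes("bitmap_destroy");  /* bitmap_destroy() */
	/* ... bitmap_free() / md_cluster_stop() would run here ... */
	wait_behind_writes("mddev_detach");    /* mddev_detach() */
	/* ... md_unregister_thread() would run here ... */
}
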
Also in md_run, we activate the personality (pers->run()) before activating the bitmap (bitmap_create()). So it is pleasingly symmetric to stop the bitmap (bitmap_destroy()) before stopping the personality (__md_stop() calls pers->free()), we achieve this by move bitmap_destroy to the beginning of __md_stop. But we don't want to break the codes for waiting behind IO as Shaohua mentioned, so introduce bitmap_wait_behind_writes to call the codes, and call the new fun in both mddev_detach and bitmap_destroy, then we will not break original behind IO code and also fit the new condition well. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 16 ++++++++++++++++ drivers/md/bitmap.h | 1 + drivers/md/md.c | 13 ++----------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index b6fa55a3cff8..cc7bbd21813e 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1764,6 +1764,20 @@ void bitmap_free(struct bitmap *bitmap) } EXPORT_SYMBOL(bitmap_free); +void bitmap_wait_behind_writes(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + /* wait for behind writes to complete */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + pr_debug("md:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); + /* need to kick something here to make sure I/O goes? */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } +} + void bitmap_destroy(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; @@ -1771,6 +1785,8 @@ void bitmap_destroy(struct mddev *mddev) if (!bitmap) /* there was no bitmap */ return; + bitmap_wait_behind_writes(mddev); + mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); mddev->bitmap = NULL; /* disconnect from the md device */ diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 9f761097aab2..d15721ac07a6 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -271,6 +271,7 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); void bitmap_free(struct bitmap *bitmap); +void bitmap_wait_behind_writes(struct mddev *mddev); #endif #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index af9118711228..42e68b2e0b41 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5675,15 +5675,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - struct bitmap *bitmap = mddev->bitmap; - /* wait for behind writes to complete */ - if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - pr_debug("md:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); - /* need to kick something here to make sure I/O goes? 
*/ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } + bitmap_wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -5696,6 +5688,7 @@ static void mddev_detach(struct mddev *mddev) static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; + bitmap_destroy(mddev); mddev_detach(mddev); /* Ensure ->event_work is done */ flush_workqueue(md_misc_wq); @@ -5716,7 +5709,6 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - bitmap_destroy(mddev); if (mddev->bio_set) bioset_free(mddev->bio_set); } @@ -5854,7 +5846,6 @@ static int do_md_stop(struct mddev *mddev, int mode, if (mode == 0) { pr_info("md: %s stopped.\n", mdname(mddev)); - bitmap_destroy(mddev); if (mddev->bitmap_info.file) { struct file *f = mddev->bitmap_info.file; spin_lock(&mddev->lock); From 497280509f32340d90feac030bce18006a3e3605 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:12 +1100 Subject: [PATCH 19/84] md/raid5: use md_write_start to count stripes, not bios We use md_write_start() to increase the count of pending writes, and md_write_end() to decrement the count. We currently count bios submitted to md/raid5. Change it count stripe_heads that a WRITE bio has been attached to. So now, raid5_make_request() calls md_write_start() and then md_write_end() to keep the count elevated during the setup of the request. add_stripe_bio() calls md_write_start() for each stripe_head, and the completion routines always call md_write_end(), instead of only calling it when raid5_dec_bi_active_stripes() returns 0. make_discard_request also calls md_write_start/end(). The parallel between md_write_{start,end} and use of bi_phys_segments can be seen in that: Whenever we set bi_phys_segments to 1, we now call md_write_start. Whenever we increment it on non-read requests with raid5_inc_bi_active_stripes(), we now call md_write_start(). Whenever we decrement bi_phys_segments on non-read requsts with raid5_dec_bi_active_stripes(), we now call md_write_end(). This reduces our dependence on keeping a per-bio count of active stripes in bi_phys_segments. md_write_inc() is added which parallels md_write_start(), but requires that a write has already been started, and is certain never to sleep. This can be used inside a spinlocked region when adding to a write request. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 17 +++++++++++++++++ drivers/md/md.h | 1 + drivers/md/raid5-cache.c | 2 +- drivers/md/raid5.c | 27 +++++++++++++-------------- 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 42e68b2e0b41..41f766ab824a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7907,6 +7907,23 @@ void md_write_start(struct mddev *mddev, struct bio *bi) } EXPORT_SYMBOL(md_write_start); +/* md_write_inc can only be called when md_write_start() has + * already been called at least once of the current request. + * It increments the counter and is useful when a single request + * is split into several parts. Each part causes an increment and + * so needs a matching md_write_end(). + * Unlike md_write_start(), it is safe to call md_write_inc() inside + * a spinlocked region. 
+ */ +void md_write_inc(struct mddev *mddev, struct bio *bi) +{ + if (bio_data_dir(bi) != WRITE) + return; + WARN_ON_ONCE(mddev->in_sync || mddev->ro); + atomic_inc(&mddev->writes_pending); +} +EXPORT_SYMBOL(md_write_inc); + void md_write_end(struct mddev *mddev) { if (atomic_dec_and_test(&mddev->writes_pending)) { diff --git a/drivers/md/md.h b/drivers/md/md.h index e0940064c3ec..0cd12721a536 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -648,6 +648,7 @@ extern void md_wakeup_thread(struct md_thread *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern void md_write_start(struct mddev *mddev, struct bio *bi); +extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 64493132470b..f5034ecb4e94 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -318,8 +318,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); + md_write_end(conf->mddev); if (!raid5_dec_bi_active_stripes(wbi)) { - md_write_end(conf->mddev); bio_list_add(return_bi, wbi); } wbi = wbi2; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 88cc8981bd49..a684003fc965 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3274,6 +3274,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, bi->bi_next = *bip; *bip = bi; raid5_inc_bi_active_stripes(bi); + md_write_inc(conf->mddev, bi); if (forwrite) { /* check if page is covered */ @@ -3397,10 +3398,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) { - md_write_end(conf->mddev); + md_write_end(conf->mddev); + if (!raid5_dec_bi_active_stripes(bi)) bio_list_add(return_bi, bi); - } bi = nextbi; } if (bitmap_end) @@ -3421,10 +3421,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) { - md_write_end(conf->mddev); + md_write_end(conf->mddev); + if (!raid5_dec_bi_active_stripes(bi)) bio_list_add(return_bi, bi); - } bi = bi2; } @@ -3781,10 +3780,9 @@ returnbi: while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); - if (!raid5_dec_bi_active_stripes(wbi)) { - md_write_end(conf->mddev); + md_write_end(conf->mddev); + if (!raid5_dec_bi_active_stripes(wbi)) bio_list_add(return_bi, wbi); - } wbi = wbi2; } bitmap_endwrite(conf->mddev->bitmap, sh->sector, @@ -5487,6 +5485,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + md_write_start(mddev, bi); stripe_sectors = conf->chunk_sectors * (conf->raid_disks - conf->max_degraded); @@ -5533,6 +5532,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) sh->dev[d].towrite = bi; set_bit(R5_OVERWRITE, &sh->dev[d].flags); raid5_inc_bi_active_stripes(bi); + md_write_inc(mddev, bi); sh->overwrite_disks++; } spin_unlock_irq(&sh->stripe_lock); @@ -5555,9 +5555,9 @@ static void 
make_discard_request(struct mddev *mddev, struct bio *bi) release_stripe_plug(mddev, sh); } + md_write_end(mddev); remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { - md_write_end(mddev); bio_endio(bi); } } @@ -5592,8 +5592,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = bi->bi_opf & REQ_PREFLUSH; } - md_write_start(mddev, bi); - /* * If array is degraded, better not do chunk aligned read because * later we might have to read it again in order to reconstruct @@ -5615,6 +5613,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) last_sector = bio_end_sector(bi); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + md_write_start(mddev, bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { @@ -5749,11 +5748,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) } finish_wait(&conf->wait_for_overlap, &w); + if (rw == WRITE) + md_write_end(mddev); remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { - if ( rw == WRITE ) - md_write_end(mddev); trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); From 16d997b78b157315f5c90fcbc2f9ce575cb3879f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:12 +1100 Subject: [PATCH 20/84] md/raid5: simplfy delaying of writes while metadata is updated. If a device fails during a write, we must ensure the failure is recorded in the metadata before the completion of the write is acknowleged. Commit c3cce6cda162 ("md/raid5: ensure device failure recorded before write request returns.") added code for this, but it was unnecessarily complicated. We already had similar functionality for handling updates to the bad-block-list, thanks to Commit de393cdea66c ("md: make it easier to wait for bad blocks to be acknowledged.") So revert most of the former commit, and instead avoid collecting completed writes if MD_CHANGE_PENDING is set. raid5d() will then flush the metadata and retry the stripe_head. As this change can leave a stripe_head ready for handling immediately after handle_active_stripes() returns, we change raid5_do_work() to pause when MD_CHANGE_PENDING is set, so that it doesn't spin. We check MD_CHANGE_PENDING *after* analyse_stripe() as it could be set asynchronously. After analyse_stripe(), we have collected stable data about the state of devices, which will be used to make decisions. 
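The resulting pattern can be summarised with the hypothetical user-space model below (an atomic flag plus polling in place of MD_SB_CHANGE_PENDING and the sb_wait wait queue): the stripe handler defers all further work when a superblock write is pending, and the worker loop waits for the flag to clear instead of spinning on the re-queued stripes.

#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

struct stripe_model {
	bool needs_handle;       /* STRIPE_HANDLE stand-in */
	bool handle_bad_blocks;
};

static atomic_bool sb_change_pending;   /* MD_SB_CHANGE_PENDING stand-in */

static void handle_stripe_model(struct stripe_model *sh)
{
	/* analyse_stripe() would run first and collect stable device state */

	if (sh->handle_bad_blocks ||
	    atomic_load(&sb_change_pending)) {
		/* don't complete writes yet; retry once metadata is safe */
		sh->needs_handle = true;
		return;
	}

	/* ... normal processing, including completing writes ... */
	sh->needs_handle = false;
}

static void worker_loop_model(struct stripe_model *batch, int n)
{
	for (int i = 0; i < n; i++)
		handle_stripe_model(&batch[i]);

	/* parallels the wait in raid5_do_work(): don't spin while the
	 * main thread is still flushing the superblock */
	while (atomic_load(&sb_change_pending))
		sched_yield();
}
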
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 31 ++++++++----------------------- drivers/md/raid5.h | 3 --- 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a684003fc965..a2c9ddc35335 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4691,7 +4691,8 @@ static void handle_stripe(struct stripe_head *sh) if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) goto finish; - if (s.handle_bad_blocks) { + if (s.handle_bad_blocks || + test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; } @@ -5021,15 +5022,8 @@ finish: md_wakeup_thread(conf->mddev->thread); } - if (!bio_list_empty(&s.return_bi)) { - if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { - spin_lock_irq(&conf->device_lock); - bio_list_merge(&conf->return_bi, &s.return_bi); - spin_unlock_irq(&conf->device_lock); - md_wakeup_thread(conf->mddev->thread); - } else - return_io(&s.return_bi); - } + if (!bio_list_empty(&s.return_bi)) + return_io(&s.return_bi); clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } @@ -6226,6 +6220,7 @@ static void raid5_do_work(struct work_struct *work) struct r5worker *worker = container_of(work, struct r5worker, work); struct r5worker_group *group = worker->group; struct r5conf *conf = group->conf; + struct mddev *mddev = conf->mddev; int group_id = group - conf->worker_groups; int handled; struct blk_plug plug; @@ -6246,6 +6241,9 @@ static void raid5_do_work(struct work_struct *work) if (!batch_size && !released) break; handled += batch_size; + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -6273,18 +6271,6 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); - if (!bio_list_empty(&conf->return_bi) && - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - struct bio_list tmp = BIO_EMPTY_LIST; - spin_lock_irq(&conf->device_lock); - if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - bio_list_merge(&tmp, &conf->return_bi); - bio_list_init(&conf->return_bi); - } - spin_unlock_irq(&conf->device_lock); - return_io(&tmp); - } - blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); @@ -6936,7 +6922,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); - bio_list_init(&conf->return_bi); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ba5b7a3790af..13800dc9dd88 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -638,9 +638,6 @@ struct r5conf { int skip_copy; /* Don't copy data from bio to stripe cache */ struct list_head *last_hold; /* detect hold_list promotions */ - /* bios to have bi_end_io called after metadata is synced */ - struct bio_list return_bi; - atomic_t reshape_stripes; /* stripes with pending writes for reshape */ /* unfortunately we need two cache names as we temporarily have * two caches. From bd83d0a28c68bacba88a3193a1bd6a083bb8d9f5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:12 +1100 Subject: [PATCH 21/84] md/raid5: call bio_endio() directly rather than queueing for later. We currently gather bios that need to be returned into a bio_list and call bio_endio() on them all together. 
The original reason for this was to avoid making the calls while holding a spinlock. Locking has changed a lot since then, and that reason is no longer valid. So discard return_io() and various return_bi lists, and just call bio_endio() directly as needed. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 13 +++++-------- drivers/md/raid5-log.h | 2 +- drivers/md/raid5.c | 38 ++++++++++---------------------------- drivers/md/raid5.h | 1 - 4 files changed, 16 insertions(+), 38 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index f5034ecb4e94..5be8dbc5d91b 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -308,8 +308,7 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io, } static void -r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, - struct bio_list *return_bi) +r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) { struct bio *wbi, *wbi2; @@ -319,23 +318,21 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); md_write_end(conf->mddev); - if (!raid5_dec_bi_active_stripes(wbi)) { - bio_list_add(return_bi, wbi); - } + if (!raid5_dec_bi_active_stripes(wbi)) + bio_endio(wbi); wbi = wbi2; } } void r5c_handle_cached_data_endio(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio_list *return_bi) + struct stripe_head *sh, int disks) { int i; for (i = sh->disks; i--; ) { if (sh->dev[i].written) { set_bit(R5_UPTODATE, &sh->dev[i].flags); - r5c_return_dev_pending_writes(conf, &sh->dev[i], - return_bi); + r5c_return_dev_pending_writes(conf, &sh->dev[i]); bitmap_endwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 4f5a0f4e0b1f..738930ff5d17 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -21,7 +21,7 @@ extern void r5c_release_extra_page(struct stripe_head *sh); extern void r5c_use_extra_page(struct stripe_head *sh); extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); extern void r5c_handle_cached_data_endio(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio_list *return_bi); + struct stripe_head *sh, int disks); extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); extern void r5c_make_stripe_write_out(struct stripe_head *sh); extern void r5c_flush_cache(struct r5conf *conf, int num); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a2c9ddc35335..44c8ceba13fe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -158,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void return_io(struct bio_list *return_bi) -{ - struct bio *bi; - while ((bi = bio_list_pop(return_bi)) != NULL) { - bi->bi_iter.bi_size = 0; - trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), - bi, 0); - bio_endio(bi); - } -} - static void print_raid5_conf (struct r5conf *conf); static int stripe_operations_active(struct stripe_head *sh) @@ -1310,7 +1299,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - struct bio_list return_bi = BIO_EMPTY_LIST; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -1335,15 +1323,13 @@ static void ops_complete_biofill(void *stripe_head_ref) dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, 
dev->sector); if (!raid5_dec_bi_active_stripes(rbi)) - bio_list_add(&return_bi, rbi); + bio_endio(rbi); rbi = rbi2; } } } clear_bit(STRIPE_BIOFILL_RUN, &sh->state); - return_io(&return_bi); - set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); } @@ -3351,8 +3337,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, int disks, - struct bio_list *return_bi) + struct stripe_head_state *s, int disks) { int i; BUG_ON(sh->batch_head); @@ -3400,7 +3385,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; md_write_end(conf->mddev); if (!raid5_dec_bi_active_stripes(bi)) - bio_list_add(return_bi, bi); + bio_endio(bi); bi = nextbi; } if (bitmap_end) @@ -3423,7 +3408,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; md_write_end(conf->mddev); if (!raid5_dec_bi_active_stripes(bi)) - bio_list_add(return_bi, bi); + bio_endio(bi); bi = bi2; } @@ -3449,7 +3434,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) - bio_list_add(return_bi, bi); + bio_endio(bi); bi = nextbi; } } @@ -3748,7 +3733,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, * never LOCKED, so we don't need to test 'failed' directly. */ static void handle_stripe_clean_event(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio_list *return_bi) + struct stripe_head *sh, int disks) { int i; struct r5dev *dev; @@ -3782,7 +3767,7 @@ returnbi: wbi2 = r5_next_bio(wbi, dev->sector); md_write_end(conf->mddev); if (!raid5_dec_bi_active_stripes(wbi)) - bio_list_add(return_bi, wbi); + bio_endio(wbi); wbi = wbi2; } bitmap_endwrite(conf->mddev->bitmap, sh->sector, @@ -4725,7 +4710,7 @@ static void handle_stripe(struct stripe_head *sh) sh->reconstruct_state = 0; break_stripe_batch_list(sh, 0); if (s.to_read+s.to_write+s.written) - handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); + handle_failed_stripe(conf, sh, &s, disks); if (s.syncing + s.replacing) handle_failed_sync(conf, sh, &s); } @@ -4791,10 +4776,10 @@ static void handle_stripe(struct stripe_head *sh) && !test_bit(R5_LOCKED, &qdev->flags) && (test_bit(R5_UPTODATE, &qdev->flags) || test_bit(R5_Discard, &qdev->flags)))))) - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + handle_stripe_clean_event(conf, sh, disks); if (s.just_cached) - r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); + r5c_handle_cached_data_endio(conf, sh, disks); log_stripe_write_finished(sh); /* Now we might consider reading some blocks, either to check/generate @@ -5022,9 +5007,6 @@ finish: md_wakeup_thread(conf->mddev->thread); } - if (!bio_list_empty(&s.return_bi)) - return_io(&s.return_bi); - clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 13800dc9dd88..fd5c21cde77f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -278,7 +278,6 @@ struct stripe_head_state { int dec_preread_active; unsigned long ops_request; - struct bio_list return_bi; struct md_rdev *blocked_rdev; int handle_bad_blocks; int log_failed; From 016c76ac76e4c678b01a75a602dc6be0282f5b29 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: [PATCH 22/84] md/raid5: use bio_inc_remaining() instead of repurposing bi_phys_segments as a counter md/raid5 needs to keep track of how many stripe_heads are 
processing a bio so that it can delay calling bio_endio() until all stripe_heads have completed. It currently uses 16 bits of ->bi_phys_segments for this purpose. 16 bits is only enough for 256M requests, and it is possible for a single bio to be larger than this, which causes problems. Also, the bio struct contains a larger counter, __bi_remaining, which has a purpose very similar to the purpose of our counter. So stop using ->bi_phys_segments, and instead use __bi_remaining. This means we don't need to initialize the counter, as our caller initializes it to '1'. It also means we can call bio_endio() directly as it tests this counter internally. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 3 +-- drivers/md/raid5.c | 57 +++++++++------------------------------- drivers/md/raid5.h | 17 +----------- 3 files changed, 15 insertions(+), 62 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 5be8dbc5d91b..25eb048298fe 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -318,8 +318,7 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); md_write_end(conf->mddev); - if (!raid5_dec_bi_active_stripes(wbi)) - bio_endio(wbi); + bio_endio(wbi); wbi = wbi2; } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 44c8ceba13fe..0ec9e0212158 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1322,8 +1322,7 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_active_stripes(rbi)) - bio_endio(rbi); + bio_endio(rbi); rbi = rbi2; } } @@ -3196,14 +3195,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, (unsigned long long)bi->bi_iter.bi_sector, (unsigned long long)sh->sector); - /* - * If several bio share a stripe. The bio bi_phys_segments acts as a - * reference count to avoid race. The reference count should already be - * increased before this function is called (for example, in - * raid5_make_request()), so other bio sharing this stripe will not free the - * stripe. If a stripe is owned by one stripe, the stripe lock will - * protect it. 
- */ spin_lock_irq(&sh->stripe_lock); /* Don't allow new IO added to stripes in batch list */ if (sh->batch_head) @@ -3259,7 +3250,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, if (*bip) bi->bi_next = *bip; *bip = bi; - raid5_inc_bi_active_stripes(bi); + bio_inc_remaining(bi); md_write_inc(conf->mddev, bi); if (forwrite) { @@ -3384,8 +3375,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; md_write_end(conf->mddev); - if (!raid5_dec_bi_active_stripes(bi)) - bio_endio(bi); + bio_endio(bi); bi = nextbi; } if (bitmap_end) @@ -3407,8 +3397,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; md_write_end(conf->mddev); - if (!raid5_dec_bi_active_stripes(bi)) - bio_endio(bi); + bio_endio(bi); bi = bi2; } @@ -3433,8 +3422,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) - bio_endio(bi); + bio_endio(bi); bi = nextbi; } } @@ -3766,8 +3754,7 @@ returnbi: dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); md_write_end(conf->mddev); - if (!raid5_dec_bi_active_stripes(wbi)) - bio_endio(wbi); + bio_endio(wbi); wbi = wbi2; } bitmap_endwrite(conf->mddev->bitmap, sh->sector, @@ -5112,7 +5099,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) * this sets the active strip count to 1 and the processed * strip count to zero (upper 8 bits) */ - raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ + raid5_set_bi_processed_stripes(bi, 0); } return bi; @@ -5449,7 +5436,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) struct r5conf *conf = mddev->private; sector_t logical_sector, last_sector; struct stripe_head *sh; - int remaining; int stripe_sectors; if (mddev->reshape_position != MaxSector) @@ -5460,7 +5446,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ md_write_start(mddev, bi); stripe_sectors = conf->chunk_sectors * @@ -5507,7 +5492,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) continue; sh->dev[d].towrite = bi; set_bit(R5_OVERWRITE, &sh->dev[d].flags); - raid5_inc_bi_active_stripes(bi); + bio_inc_remaining(bi); md_write_inc(mddev, bi); sh->overwrite_disks++; } @@ -5532,10 +5517,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } md_write_end(mddev); - remaining = raid5_dec_bi_active_stripes(bi); - if (remaining == 0) { - bio_endio(bi); - } + bio_endio(bi); } static void raid5_make_request(struct mddev *mddev, struct bio * bi) @@ -5546,7 +5528,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); - int remaining; DEFINE_WAIT(w); bool do_prepare; bool do_flush = false; @@ -5588,7 +5569,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ md_write_start(mddev, bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); @@ -5726,14 +5706,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == WRITE) 
md_write_end(mddev); - remaining = raid5_dec_bi_active_stripes(bi); - if (remaining == 0) { - - - trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), - bi, 0); - bio_endio(bi); - } + bio_endio(bi); } static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); @@ -6098,7 +6071,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) int dd_idx; sector_t sector, logical_sector, last_sector; int scnt = 0; - int remaining; int handled = 0; logical_sector = raid_bio->bi_iter.bi_sector & @@ -6137,12 +6109,9 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) raid5_release_stripe(sh); handled++; } - remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) { - trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), - raid_bio, 0); - bio_endio(raid_bio); - } + + bio_endio(raid_bio); + if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); return handled; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index fd5c21cde77f..7d74fb3f2ec6 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -488,8 +488,7 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) } /* - * We maintain a biased count of active stripes in the bottom 16 bits of - * bi_phys_segments, and a count of processed stripes in the upper 16 bits + * We maintain a count of processed stripes in the upper 16 bits */ static inline int raid5_bi_processed_stripes(struct bio *bio) { @@ -498,20 +497,6 @@ static inline int raid5_bi_processed_stripes(struct bio *bio) return (atomic_read(segments) >> 16) & 0xffff; } -static inline int raid5_dec_bi_active_stripes(struct bio *bio) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - - return atomic_sub_return(1, segments) & 0xffff; -} - -static inline void raid5_inc_bi_active_stripes(struct bio *bio) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - - atomic_inc(segments); -} - static inline void raid5_set_bi_processed_stripes(struct bio *bio, unsigned int cnt) { From 0472a42ba1f89ec85f070c731f4440d7cc38c44c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: [PATCH 23/84] md/raid5: remove over-loading of ->bi_phys_segments. When a read request, which bypassed the cache, fails, we need to retry it through the cache. This involves attaching it to a sequence of stripe_heads, and it may not be possible to get all the stripe_heads we need at once. We do what we can, and record how far we got in ->bi_phys_segments so we can pick up again later. There is only ever one bio which may have a non-zero offset stored in ->bi_phys_segments, the one that is either active in the single thread which calls retry_aligned_read(), or is in conf->retry_read_aligned waiting for retry_aligned_read() to be called again. So we only need to store one offset value. This can be in a local variable passed between remove_bio_from_retry() and retry_aligned_read(), or in the r5conf structure next to the ->retry_read_aligned pointer. Storing it there allows the last usage of ->bi_phys_segments to be removed from md/raid5.c. 
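A reduced sketch of the resulting bookkeeping, with cut-down stand-ins for struct bio and struct r5conf, shows why a single field is sufficient: only the parked bio can have made partial progress, so its offset lives next to ->retry_read_aligned and is handed back through an out-parameter when the retry resumes.

#include <stddef.h>

struct bio_model {
	struct bio_model *next;
};

struct conf_model {
	struct bio_model *retry_read_aligned;      /* bio being retried */
	unsigned int retry_read_offset;            /* stripes already done */
	struct bio_model *retry_read_aligned_list; /* bios not started yet */
};

static struct bio_model *remove_bio_from_retry(struct conf_model *conf,
					       unsigned int *offset)
{
	struct bio_model *bi = conf->retry_read_aligned;

	if (bi) {
		/* resuming a bio that already made partial progress */
		*offset = conf->retry_read_offset;
		conf->retry_read_aligned = NULL;
		return bi;
	}

	bi = conf->retry_read_aligned_list;
	if (bi) {
		conf->retry_read_aligned_list = bi->next;
		bi->next = NULL;
		*offset = 0;    /* fresh bio: start from the first stripe */
	}
	return bi;
}

/* called when retry_aligned_read() runs out of stripe_heads at stripe
 * number 'scnt': park the bio and remember how far it got */
static void park_retry(struct conf_model *conf, struct bio_model *bi,
		       unsigned int scnt)
{
	conf->retry_read_aligned = bi;
	conf->retry_read_offset = scnt;
}
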
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 24 ++++++++++++------------ drivers/md/raid5.h | 30 +----------------------------- 2 files changed, 13 insertions(+), 41 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0ec9e0212158..1c8be667e9a9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5082,12 +5082,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) md_wakeup_thread(conf->mddev->thread); } -static struct bio *remove_bio_from_retry(struct r5conf *conf) +static struct bio *remove_bio_from_retry(struct r5conf *conf, + unsigned int *offset) { struct bio *bi; bi = conf->retry_read_aligned; if (bi) { + *offset = conf->retry_read_offset; conf->retry_read_aligned = NULL; return bi; } @@ -5095,11 +5097,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) if(bi) { conf->retry_read_aligned_list = bi->bi_next; bi->bi_next = NULL; - /* - * this sets the active strip count to 1 and the processed - * strip count to zero (upper 8 bits) - */ - raid5_set_bi_processed_stripes(bi, 0); + *offset = 0; } return bi; @@ -6055,7 +6053,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return STRIPE_SECTORS; } -static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) +static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, + unsigned int offset) { /* We may not be able to submit a whole bio at once as there * may not be enough stripe_heads available. @@ -6084,7 +6083,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) sector += STRIPE_SECTORS, scnt++) { - if (scnt < raid5_bi_processed_stripes(raid_bio)) + if (scnt < offset) /* already done this stripe */ continue; @@ -6092,15 +6091,15 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) if (!sh) { /* failed to get a stripe - must wait */ - raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; + conf->retry_read_offset = scnt; return handled; } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { raid5_release_stripe(sh); - raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; + conf->retry_read_offset = scnt; return handled; } @@ -6228,6 +6227,7 @@ static void raid5d(struct md_thread *thread) while (1) { struct bio *bio; int batch_size, released; + unsigned int offset; released = release_stripe_list(conf, conf->temp_inactive_list); if (released) @@ -6245,10 +6245,10 @@ static void raid5d(struct md_thread *thread) } raid5_activate_delayed(conf); - while ((bio = remove_bio_from_retry(conf))) { + while ((bio = remove_bio_from_retry(conf, &offset))) { int ok; spin_unlock_irq(&conf->device_lock); - ok = retry_aligned_read(conf, bio); + ok = retry_aligned_read(conf, bio, offset); spin_lock_irq(&conf->device_lock); if (!ok) break; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 7d74fb3f2ec6..cdc7f92e1806 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -487,35 +487,6 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) return NULL; } -/* - * We maintain a count of processed stripes in the upper 16 bits - */ -static inline int raid5_bi_processed_stripes(struct bio *bio) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - - return (atomic_read(segments) >> 16) & 0xffff; -} - -static inline void raid5_set_bi_processed_stripes(struct bio *bio, - unsigned int cnt) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - int old, new; - - do 
{ - old = atomic_read(segments); - new = (old & 0xffff) | (cnt << 16); - } while (atomic_cmpxchg(segments, old, new) != old); -} - -static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - - atomic_set(segments, cnt); -} - /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. * This is because we sometimes take all the spinlocks * and creating that much locking depth can cause @@ -613,6 +584,7 @@ struct r5conf { struct list_head delayed_list; /* stripes that have plugged requests */ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ struct bio *retry_read_aligned; /* currently retrying aligned bios */ + unsigned int retry_read_offset; /* sector offset into retry_read_aligned */ struct bio *retry_read_aligned_list; /* aligned bios retry list */ atomic_t preread_active_stripes; /* stripes with scheduled io */ atomic_t active_aligned_reads; From 97d53438081edd25ccb1de34051efe084d240828 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: [PATCH 24/84] Revert "md/raid5: limit request size according to implementation limits" This reverts commit e8d7c33232e5fdfa761c3416539bc5b4acd12db5. Now that raid5 doesn't abuse bi_phys_segments any more, we no longer need to impose these limits. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1c8be667e9a9..00a34faabcdf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7361,15 +7361,6 @@ static int raid5_run(struct mddev *mddev) stripe = (stripe | (stripe-1)) + 1; mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; - - /* - * We use 16-bit counter of active stripes in bi_phys_segments - * (minus one for over-loaded initialization) - */ - blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); - blk_queue_max_discard_sectors(mddev->queue, - 0xfffe * STRIPE_SECTORS); - /* * unaligned part of discard request will be ignored, so can't * guarantee discard_zeroes_data From 6b6c8110e173ce10f2b169d82a6670001f7184d1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: [PATCH 25/84] md/raid1, raid10: move rXbio accounting closer to allocation. When raid1 or raid10 find they will need to allocate a new r1bio/r10bio, in order to work around a known bad block, they account for the allocation well before the allocation is made. This separation makes the correctness less obvious and requires comments. The accounting needs to be a little before: before the first rXbio is submitted, but that is all. So move the accounting down to where it makes more sense. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 24 +++++++++++------------- drivers/md/raid10.c | 22 +++++++++------------- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 730e57259af9..3afa60eb72c5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1436,18 +1436,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) goto retry_write; } - if (max_sectors < r1_bio->sectors) { - /* We are splitting this write into multiple parts, so - * we need to prepare for allocating another r1_bio. 
- */ + if (max_sectors < r1_bio->sectors) r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - } + sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector; atomic_set(&r1_bio->remaining, 1); @@ -1553,10 +1544,17 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) * as it could result in the bio being freed. */ if (sectors_handled < bio_sectors(bio)) { - r1_bio_write_done(r1_bio); - /* We need another r1_bio. It has already been counted + /* We need another r1_bio, which must be accounted * in bio->bi_phys_segments */ + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + + r1_bio_write_done(r1_bio); r1_bio = alloc_r1bio(mddev, bio, sectors_handled); goto retry_write; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e89a8d78a9ed..c7c5b2693fc9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1384,18 +1384,8 @@ retry_write: goto retry_write; } - if (max_sectors < r10_bio->sectors) { - /* We are splitting this into multiple parts, so - * we need to prepare for allocating another r10_bio. - */ + if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - } sectors_handled = r10_bio->sector + max_sectors - bio->bi_iter.bi_sector; @@ -1505,10 +1495,16 @@ retry_write: */ if (sectors_handled < bio_sectors(bio)) { - one_write_done(r10_bio); - /* We need another r10_bio. It has already been counted + /* We need another r10_bio and it needs to be counted * in bio->bi_phys_segments. */ + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + one_write_done(r10_bio); r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; From fd16f2e8489100eb8005483ff630856bce51f803 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:13 +1100 Subject: [PATCH 26/84] md/raid10: stop using bi_phys_segments raid10 currently repurposes bi_phys_segments on each incoming bio to count how many r10bio was used to encode the request. We need to know when the number of attached r10bio reaches zero to: 1/ call bio_endio() when all IO on the bio is finished 2/ decrement ->nr_pending so that resync IO can proceed. Now that the bio has its own __bi_remaining counter, that can be used instead. We can call bio_inc_remaining to increment the counter and call bio_endio() every time an r10bio completes, rather than only when bi_phys_segments reaches zero. This addresses point 1, but not point 2. bio_endio() doesn't (and cannot) report when the last r10bio has finished, so a different approach is needed. So: instead of counting bios in ->nr_pending, count r10bios. i.e. every time we attach a bio, increment nr_pending. Every time an r10bio completes, decrement nr_pending. Normally we only increment nr_pending after first checking that ->barrier is zero, or some other non-trivial tests and possible waiting. When attaching multiple r10bios to a bio, we only need the tests and the waiting once. 
After the first increment, subsequent increments can happen unconditionally as they are really all part of the one request. So introduce inc_pending() which can be used when we know that nr_pending is already elevated. Note that this fixes a bug. freeze_array() contains the line atomic_read(&conf->nr_pending) == conf->nr_queued+extra, which implies that the units for ->nr_pending, ->nr_queued and extra are the same. ->nr_queue and extra count r10_bios, but prior to this patch, ->nr_pending counted bios. If a bio ever resulted in multiple r10_bios (due to bad blocks), freeze_array() would not work correctly. Now it does. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 76 +++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 51 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c7c5b2693fc9..0f1b78b38649 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -301,27 +301,18 @@ static void reschedule_retry(struct r10bio *r10_bio) static void raid_end_bio_io(struct r10bio *r10_bio) { struct bio *bio = r10_bio->master_bio; - int done; struct r10conf *conf = r10_bio->mddev->private; - if (bio->bi_phys_segments) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - bio->bi_phys_segments--; - done = (bio->bi_phys_segments == 0); - spin_unlock_irqrestore(&conf->device_lock, flags); - } else - done = 1; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_error = -EIO; - if (done) { - bio_endio(bio); - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); - } + + bio_endio(bio); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf); + free_r10bio(r10_bio); } @@ -985,6 +976,15 @@ static void wait_barrier(struct r10conf *conf) spin_unlock_irq(&conf->resync_lock); } +static void inc_pending(struct r10conf *conf) +{ + /* The current request requires multiple r10_bio, so + * we need to increment the pending count. + */ + WARN_ON(!atomic_read(&conf->nr_pending)); + atomic_inc(&conf->nr_pending); +} + static void allow_barrier(struct r10conf *conf) { if ((atomic_dec_and_test(&conf->nr_pending)) || @@ -1162,12 +1162,8 @@ read_again: sectors_handled = (r10_bio->sector + max_sectors - bio->bi_iter.bi_sector); r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + inc_pending(conf); + bio_inc_remaining(bio); /* * Cannot call generic_make_request directly as that will be * queued in __generic_make_request and subsequent @@ -1262,9 +1258,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * on which we have seen a write error, we want to avoid * writing to those blocks. This potentially requires several * writes to write around the bad blocks. Each set of writes - * gets its own r10_bio with a set of bios attached. The number - * of r10_bios is recored in bio->bi_phys_segments just as with - * the read case. + * gets its own r10_bio with a set of bios attached. */ r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ @@ -1495,15 +1489,9 @@ retry_write: */ if (sectors_handled < bio_sectors(bio)) { - /* We need another r10_bio and it needs to be counted - * in bio->bi_phys_segments. 
- */ - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + /* We need another r10_bio and it needs to be counted */ + inc_pending(conf); + bio_inc_remaining(bio); one_write_done(r10_bio); r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); @@ -1532,16 +1520,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; - /* - * We might need to issue multiple reads to different devices if there - * are bad blocks around, so we keep track of the number of reads in - * bio->bi_phys_segments. If this is 0, there is only one r10_bio and - * no locking will be needed when the request completes. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); - if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); else @@ -2693,12 +2671,8 @@ read_more: r10_bio->sector + max_sectors - mbio->bi_iter.bi_sector; r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (mbio->bi_phys_segments == 0) - mbio->bi_phys_segments = 2; - else - mbio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + bio_inc_remaining(mbio); + inc_pending(conf); generic_make_request(bio); r10_bio = mempool_alloc(conf->r10bio_pool, From 37011e3afb0fdc462307dc006246358bddf61e92 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:14 +1100 Subject: [PATCH 27/84] md/raid1: stop using bi_phys_segment Change to use bio->__bi_remaining to count the number of r1bios attached to a bio. See the previous raid10 patch for more details. Like the raid10.c patch, this fixes a bug as nr_queued and nr_pending used to measure different things, but were being compared. This patch fixes another bug in that nr_pending previously did not count write-behind requests, so behind writes could continue while resync was happening. Now that nr_pending counts all r1_bios, the resync cannot commence until the behind writes have completed. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 89 ++++++++++++---------------------------------- 1 file changed, 23 insertions(+), 66 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3afa60eb72c5..941f81063891 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -246,35 +246,17 @@ static void reschedule_retry(struct r1bio *r1_bio) static void call_bio_endio(struct r1bio *r1_bio) { struct bio *bio = r1_bio->master_bio; - int done; struct r1conf *conf = r1_bio->mddev->private; - sector_t bi_sector = bio->bi_iter.bi_sector; - - if (bio->bi_phys_segments) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - bio->bi_phys_segments--; - done = (bio->bi_phys_segments == 0); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* - * make_request() might be waiting for - * bi_phys_segments to decrease - */ - wake_up(&conf->wait_barrier); - } else - done = 1; if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) bio->bi_error = -EIO; - if (done) { - bio_endio(bio); - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf, bi_sector); - } + bio_endio(bio); + /* + * Wake up any possible resync thread that waits for the device + * to go idle.
+ */ + allow_barrier(conf, r1_bio->sector); } static void raid_end_bio_io(struct r1bio *r1_bio) @@ -977,6 +959,16 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) spin_unlock_irq(&conf->resync_lock); } +static void inc_pending(struct r1conf *conf, sector_t bi_sector) +{ + /* The current request requires multiple r1_bio, so + * we need to increment the pending count, and the corresponding + * window count. + */ + int idx = sector_to_idx(bi_sector); + atomic_inc(&conf->nr_pending[idx]); +} + static void wait_barrier(struct r1conf *conf, sector_t sector_nr) { int idx = sector_to_idx(sector_nr); @@ -1191,17 +1183,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio) r1_bio = alloc_r1bio(mddev, bio, 0); - /* - * We might need to issue multiple reads to different - * devices if there are bad blocks around, so we keep - * track of the number of reads in bio->bi_phys_segments. - * If this is 0, there is only one r1_bio and no locking - * will be needed when requests complete. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); - /* * make_request() can abort the operation when read-ahead is being * used and no empty request is available. @@ -1257,12 +1238,7 @@ read_again: sectors_handled = (r1_bio->sector + max_sectors - bio->bi_iter.bi_sector); r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + bio_inc_remaining(bio); /* * Cannot call generic_make_request directly as that will be @@ -1329,16 +1305,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) r1_bio = alloc_r1bio(mddev, bio, 0); - /* We might need to issue multiple writes to different - * devices if there are bad blocks around, so we keep - * track of the number of writes in bio->bi_phys_segments. - * If this is 0, there is only one r1_bio and no locking - * will be needed when requests complete. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); - if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); raid1_log(mddev, "wait queued"); @@ -1544,16 +1510,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) * as it could result in the bio being freed. 
*/ if (sectors_handled < bio_sectors(bio)) { - /* We need another r1_bio, which must be accounted - * in bio->bi_phys_segments - */ - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + /* We need another r1_bio, which must be counted */ + sector_t sect = bio->bi_iter.bi_sector + sectors_handled; + inc_pending(conf, sect); + bio_inc_remaining(bio); r1_bio_write_done(r1_bio); r1_bio = alloc_r1bio(mddev, bio, sectors_handled); goto retry_write; @@ -2573,12 +2534,7 @@ read_more: int sectors_handled = (r1_bio->sector + max_sectors - mbio->bi_iter.bi_sector); r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (mbio->bi_phys_segments == 0) - mbio->bi_phys_segments = 2; - else - mbio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); + bio_inc_remaining(mbio); trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, bio_dev, bio_sector); generic_make_request(bio); @@ -2586,6 +2542,7 @@ read_more: r1_bio = alloc_r1bio(mddev, mbio, sectors_handled); set_bit(R1BIO_ReadError, &r1_bio->state); + inc_pending(conf, r1_bio->sector); goto read_more; } else { From 84dd97a69092cef858483b775f1900d743d796a4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:14 +1100 Subject: [PATCH 28/84] md/raid5: don't test ->writes_pending in raid5_remove_disk This test on ->writes_pending cannot be safe as the counter can be incremented at any moment and cannot be locked against. Change it to test conf->active_stripes, which at least can be locked against. More changes are still needed. A future patch will change ->writes_pending, and testing it here will be very inconvenient. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 00a34faabcdf..0b1a4339a437 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7532,9 +7532,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) /* * we can't wait pending write here, as this is called in * raid5d, wait will deadlock. + * neilb: there is no locking about new writes here, + * so this cannot be safe. */ - if (atomic_read(&mddev->writes_pending)) + if (atomic_read(&conf->active_stripes)) { return -EBUSY; + } log_exit(conf); return 0; } From 6497709b5d1bccce7de1df678d5f147d614551d1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:14 +1100 Subject: [PATCH 29/84] md: factor out set_in_sync() Three separate places in md.c check if the number of active writes is zero and, if so, sets mddev->in_sync. There are a few differences, but there shouldn't be: - it is always appropriate to notify the change in sysfs_state, and there is no need to do this outside a spin-locked region. - we never need to check ->recovery_cp. The state of resync is not relevant for whether there are any pending writes or not (which is what ->in_sync reports). So create set_in_sync() which does the correct tests and makes the correct changes, and call this in all three places. Any behaviour changes here a minor and cosmetic. 
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 54 ++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 41f766ab824a..5951dea75aba 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2252,6 +2252,21 @@ static void export_array(struct mddev *mddev) mddev->major_version = 0; } +static bool set_in_sync(struct mddev *mddev) +{ + WARN_ON_ONCE(!spin_is_locked(&mddev->lock)); + if (atomic_read(&mddev->writes_pending) == 0) { + if (mddev->in_sync == 0) { + mddev->in_sync = 1; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + sysfs_notify_dirent_safe(mddev->sysfs_state); + } + } + if (mddev->safemode == 1) + mddev->safemode = 0; + return mddev->in_sync; +} + static void sync_sbs(struct mddev *mddev, int nospares) { /* Update each superblock (in-memory image), but @@ -4024,7 +4039,7 @@ static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { - int err; + int err = 0; enum array_state st = match_word(buf, array_states); if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { @@ -4037,18 +4052,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); - err = 0; } else /* st == clean */ { restart_array(mddev); - if (atomic_read(&mddev->writes_pending) == 0) { - if (mddev->in_sync == 0) { - mddev->in_sync = 1; - if (mddev->safemode == 1) - mddev->safemode = 0; - set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - } - err = 0; - } else + if (!set_in_sync(mddev)) err = -EBUSY; } if (!err) @@ -4106,15 +4112,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) if (err) break; spin_lock(&mddev->lock); - if (atomic_read(&mddev->writes_pending) == 0) { - if (mddev->in_sync == 0) { - mddev->in_sync = 1; - if (mddev->safemode == 1) - mddev->safemode = 0; - set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - } - err = 0; - } else + if (!set_in_sync(mddev)) err = -EBUSY; spin_unlock(&mddev->lock); } else @@ -8582,22 +8580,10 @@ void md_check_recovery(struct mddev *mddev) } } - if (!mddev->external) { - int did_change = 0; + if (!mddev->external && !mddev->in_sync) { spin_lock(&mddev->lock); - if (mddev->safemode && - !atomic_read(&mddev->writes_pending) && - !mddev->in_sync && - mddev->recovery_cp == MaxSector) { - mddev->in_sync = 1; - did_change = 1; - set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - } - if (mddev->safemode == 1) - mddev->safemode = 0; + set_in_sync(mddev); spin_unlock(&mddev->lock); - if (did_change) - sysfs_notify_dirent_safe(mddev->sysfs_state); } if (mddev->sb_flags) From 55cc39f345256af241deb6152ff5c06bedd10f11 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:14 +1100 Subject: [PATCH 30/84] md: close a race with setting mddev->in_sync If ->in_sync is being set just as md_write_start() is being called, it is possible that set_in_sync() won't see the elevated ->writes_pending, and md_write_start() won't see the set ->in_sync. To close this race, re-test ->writes_pending after setting ->in_sync, and add memory barriers to ensure the increment of ->writes_pending will be seen by the time of this second test, or the new ->in_sync will be seen by md_write_start(). Add a spinlock to array_state_show() to ensure this temporary instability is never visible from userspace. 
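For readers less familiar with this barrier pairing, a minimal standalone C11 model of the two sides is sketched below (an illustration with invented names, not the md code): each side does its own store, then a full fence standing in for smp_mb(), then re-reads the other side's variable. With both fences in place, the outcome where each side misses the other's update is forbidden, so the array can never be left marked in_sync while a write is pending and unnoticed.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int writes_pending;   /* model of ->writes_pending */
static atomic_int in_sync;          /* model of ->in_sync */

static void write_start(void)       /* models md_write_start() */
{
	atomic_fetch_add_explicit(&writes_pending, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);   /* stands in for smp_mb() */
	if (atomic_load_explicit(&in_sync, memory_order_relaxed))
		atomic_store_explicit(&in_sync, 0, memory_order_relaxed);
}

static int try_set_in_sync(void)    /* models set_in_sync() */
{
	if (atomic_load_explicit(&writes_pending, memory_order_relaxed) == 0) {
		atomic_store_explicit(&in_sync, 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);  /* smp_mb() */
		/* re-test: lose the race gracefully instead of staying clean */
		if (atomic_load_explicit(&writes_pending, memory_order_relaxed))
			atomic_store_explicit(&in_sync, 0, memory_order_relaxed);
	}
	return atomic_load_explicit(&in_sync, memory_order_relaxed);
}

int main(void)
{
	write_start();
	printf("in_sync after a write started: %d\n", try_set_in_sync()); /* 0 */
	return 0;
}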
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5951dea75aba..dee6bbfb29b8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2258,6 +2258,10 @@ static bool set_in_sync(struct mddev *mddev) if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { mddev->in_sync = 1; + smp_mb(); + if (atomic_read(&mddev->writes_pending)) + /* lost a race with md_write_start() */ + mddev->in_sync = 0; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_state); } @@ -4011,6 +4015,7 @@ array_state_show(struct mddev *mddev, char *page) st = read_auto; break; case 0: + spin_lock(&mddev->lock); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) st = write_pending; else if (mddev->in_sync) @@ -4019,6 +4024,7 @@ array_state_show(struct mddev *mddev, char *page) st = active_idle; else st = active; + spin_unlock(&mddev->lock); } else { if (list_empty(&mddev->disks) && @@ -7885,6 +7891,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) did_change = 1; } atomic_inc(&mddev->writes_pending); + smp_mb(); /* Match smp_mb in set_in_sync() */ if (mddev->safemode == 1) mddev->safemode = 0; if (mddev->in_sync) { From 210f7cdcf088c304ee0533ffd33d6f71a8821862 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:14 +1100 Subject: [PATCH 31/84] percpu-refcount: support synchronous switch to atomic mode. percpu_ref_switch_to_atomic_sync() schedules the switch to atomic mode, then waits for it to complete. Also export percpu_ref_switch_to_* so they can be used from modules. This will be used in md/raid to count the number of pending write requests to an array. We occasionally need to check if the count is zero, but most often we don't care. We always want updates to the counter to be fast, as in some cases we count every 4K page. Signed-off-by: NeilBrown Acked-by: Tejun Heo Signed-off-by: Shaohua Li --- include/linux/percpu-refcount.h | 1 + lib/percpu-refcount.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3a481a49546e..c13dceb87b60 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -99,6 +99,7 @@ int __must_check percpu_ref_init(struct percpu_ref *ref, void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); +void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 9ac959ef4cae..fe03c6d52761 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -260,6 +260,22 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } +EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic); + +/** + * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * + * Schedule switching the ref to atomic mode, and wait for the + * switch to complete. Caller must ensure that no other thread + * will switch back to percpu mode. 
+ */ +void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref) +{ + percpu_ref_switch_to_atomic(ref, NULL); + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); +} +EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync); /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode @@ -290,6 +306,7 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } +EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation From 4ad23a976413aa57fe5ba7a25953dc35ccca5b71 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:14 +1100 Subject: [PATCH 32/84] MD: use per-cpu counter for writes_pending The 'writes_pending' counter is used to determine when the array is stable so that it can be marked in the superblock as "Clean". Consequently it needs to be updated frequently but only checked for zero occasionally. Recent changes to raid5 cause the count to be updated even more often - once per 4K rather than once per bio. This provided justification for making the updates more efficient. So we replace the atomic counter with a percpu-refcount. This can be incremented and decremented cheaply most of the time, and can be switched to "atomic" mode when more precise counting is needed. As it is possible for multiple threads to want a precise count, we introduce a "sync_checker" counter to count the number of threads in "set_in_sync()", and only switch the refcount back to percpu mode when that is zero. We need to be careful about races between set_in_sync() setting ->in_sync to 1, and md_write_start() setting it to zero. md_write_start() holds the rcu_read_lock() while checking if the refcount is in percpu mode. If it is, then we know a switch to 'atomic' will not happen until after we call rcu_read_unlock(), in which case set_in_sync() will see the elevated count, and not set in_sync to 1. If it is not in percpu mode, we take the mddev->lock to ensure proper synchronization. It is no longer possible to quickly check if the count is zero, which we previously did to update a timer or to schedule the md_thread. So now we do these every time we decrement that counter, but make sure they are fast. mod_timer() already optimizes the case where the timeout value doesn't actually change. We leverage that further by always rounding off the jiffies to the timeout value. This may delay the marking of 'clean' slightly, but ensures we only perform an atomic operation here when absolutely needed. md_wakeup_thread() currently always calls wake_up(), even if THREAD_WAKEUP is already set. That too can be optimised to avoid calls to wake_up().
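As a rough illustration of why the per-cpu scheme pays off, here is a simplified standalone model (it is not the percpu_ref API, just the cost argument): increments and decrements touch only one local slot, and only the rare "is it zero?" question has to look at every slot, which is the part the real code protects by switching the refcount to atomic mode first.

#include <stdio.h>

#define NR_FAKE_CPUS 4

static long pending[NR_FAKE_CPUS];   /* one cheap counter per "cpu" */

static void write_inc(int cpu) { pending[cpu]++; }   /* fast path */
static void write_dec(int cpu) { pending[cpu]--; }   /* fast path */

/* Slow path, used only when deciding whether the array can be marked clean. */
static long writes_pending_sum(void)
{
	long sum = 0;

	for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		sum += pending[cpu];
	return sum;
}

int main(void)
{
	write_inc(0);
	write_inc(3);
	write_dec(0);
	printf("pending=%ld in_sync=%d\n",
	       writes_pending_sum(), writes_pending_sum() == 0);  /* 1, 0 */
	write_dec(3);
	printf("pending=%ld in_sync=%d\n",
	       writes_pending_sum(), writes_pending_sum() == 0);  /* 0, 1 */
	return 0;
}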
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 70 +++++++++++++++++++++++++++++++++---------------- drivers/md/md.h | 3 ++- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index dee6bbfb29b8..1db88d79549a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -65,6 +65,8 @@ #include #include #include +#include + #include #include "md.h" #include "bitmap.h" @@ -2255,16 +2257,24 @@ static void export_array(struct mddev *mddev) static bool set_in_sync(struct mddev *mddev) { WARN_ON_ONCE(!spin_is_locked(&mddev->lock)); - if (atomic_read(&mddev->writes_pending) == 0) { - if (mddev->in_sync == 0) { + if (!mddev->in_sync) { + mddev->sync_checkers++; + spin_unlock(&mddev->lock); + percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); + spin_lock(&mddev->lock); + if (!mddev->in_sync && + percpu_ref_is_zero(&mddev->writes_pending)) { mddev->in_sync = 1; + /* + * Ensure ->in_sync is visible before we clear + * ->sync_checkers. + */ smp_mb(); - if (atomic_read(&mddev->writes_pending)) - /* lost a race with md_write_start() */ - mddev->in_sync = 0; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_state); } + if (--mddev->sync_checkers == 0) + percpu_ref_switch_to_percpu(&mddev->writes_pending); } if (mddev->safemode == 1) mddev->safemode = 0; @@ -5120,6 +5130,7 @@ static void md_free(struct kobject *ko) del_gendisk(mddev->gendisk); put_disk(mddev->gendisk); } + percpu_ref_exit(&mddev->writes_pending); kfree(mddev); } @@ -5145,6 +5156,8 @@ static void mddev_delayed_delete(struct work_struct *ws) kobject_put(&mddev->kobj); } +static void no_op(struct percpu_ref *r) {} + static int md_alloc(dev_t dev, char *name) { static DEFINE_MUTEX(disks_mutex); @@ -5196,6 +5209,10 @@ static int md_alloc(dev_t dev, char *name) blk_queue_make_request(mddev->queue, md_make_request); blk_set_stacking_limits(&mddev->queue->limits); + if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0) + goto abort; + /* We want to start with the refcount at zero */ + percpu_ref_put(&mddev->writes_pending); disk = alloc_disk(1 << shift); if (!disk) { blk_cleanup_queue(mddev->queue); @@ -5279,11 +5296,10 @@ static void md_safemode_timeout(unsigned long data) { struct mddev *mddev = (struct mddev *) data; - if (!atomic_read(&mddev->writes_pending)) { - mddev->safemode = 1; - if (mddev->external) - sysfs_notify_dirent_safe(mddev->sysfs_state); - } + mddev->safemode = 1; + if (mddev->external) + sysfs_notify_dirent_safe(mddev->sysfs_state); + md_wakeup_thread(mddev->thread); } @@ -5488,7 +5504,6 @@ int md_run(struct mddev *mddev) } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; - atomic_set(&mddev->writes_pending,0); atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; @@ -7342,8 +7357,8 @@ void md_wakeup_thread(struct md_thread *thread) { if (thread) { pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); + if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) + wake_up(&thread->wqueue); } } EXPORT_SYMBOL(md_wakeup_thread); @@ -7890,11 +7905,13 @@ void md_write_start(struct mddev *mddev, struct bio *bi) md_wakeup_thread(mddev->sync_thread); did_change = 1; } - atomic_inc(&mddev->writes_pending); + rcu_read_lock(); + percpu_ref_get(&mddev->writes_pending); smp_mb(); /* Match smp_mb in set_in_sync() */ if (mddev->safemode == 1) mddev->safemode = 0; - if 
(mddev->in_sync) { + /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ + if (mddev->in_sync || !mddev->sync_checkers) { spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; @@ -7905,6 +7922,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) } spin_unlock(&mddev->lock); } + rcu_read_unlock(); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, @@ -7925,19 +7943,25 @@ void md_write_inc(struct mddev *mddev, struct bio *bi) if (bio_data_dir(bi) != WRITE) return; WARN_ON_ONCE(mddev->in_sync || mddev->ro); - atomic_inc(&mddev->writes_pending); + percpu_ref_get(&mddev->writes_pending); } EXPORT_SYMBOL(md_write_inc); void md_write_end(struct mddev *mddev) { - if (atomic_dec_and_test(&mddev->writes_pending)) { - if (mddev->safemode == 2) - md_wakeup_thread(mddev->thread); - else if (mddev->safemode_delay) - mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); - } + percpu_ref_put(&mddev->writes_pending); + + if (mddev->safemode == 2) + md_wakeup_thread(mddev->thread); + else if (mddev->safemode_delay) + /* The roundup() ensures this only performs locking once + * every ->safemode_delay jiffies + */ + mod_timer(&mddev->safemode_timer, + roundup(jiffies, mddev->safemode_delay) + + mddev->safemode_delay); } + EXPORT_SYMBOL(md_write_end); /* md_allow_write(mddev) @@ -8538,7 +8562,7 @@ void md_check_recovery(struct mddev *mddev) test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || - (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) + (mddev->safemode == 2 && !mddev->in_sync && mddev->recovery_cp == MaxSector) )) return; diff --git a/drivers/md/md.h b/drivers/md/md.h index 0cd12721a536..7a7847d1cc39 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -409,7 +409,8 @@ struct mddev { */ unsigned int safemode_delay; struct timer_list safemode_timer; - atomic_t writes_pending; + struct percpu_ref writes_pending; + int sync_checkers; /* # of threads checking writes_pending */ struct request_queue *queue; /* for plugging ... */ struct bitmap *bitmap; /* the bitmap for the device */ From 0b408baf7f4f3ea94239d021a1f19e60cd8694de Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Mar 2017 23:43:05 +0300 Subject: [PATCH 33/84] raid5-ppl: silence a misleading warning message The "need_cache_flush" variable is never set to false. When the variable is true that means we print a warning message at the end of the function. 
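A tiny standalone illustration of the bug class being fixed (the names and the message below are made up for the sketch; only the initialization mirrors the patch): a stack flag that is only ever set to true on some paths holds garbage on the others, so a warning keyed off it can fire when it should not. Starting it from false removes the misleading warning.

#include <stdbool.h>
#include <stdio.h>

static void init_log(bool any_device_needs_flush)
{
	bool need_cache_flush = false;   /* the fix: start from a known value */

	if (any_device_needs_flush)
		need_cache_flush = true;

	if (need_cache_flush)
		printf("warning: cache flushes may hurt performance\n");
}

int main(void)
{
	init_log(false);   /* no warning */
	init_log(true);    /* warning, as intended */
	return 0;
}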
Fixes: 3418d036c81d ("raid5-ppl: Partial Parity Log write logging implementation") Signed-off-by: Dan Carpenter Reviewed-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 27bad3e2d7ce..86ea9addb51a 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1070,7 +1070,7 @@ int ppl_init_log(struct r5conf *conf) struct mddev *mddev = conf->mddev; int ret = 0; int i; - bool need_cache_flush; + bool need_cache_flush = false; pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n", mdname(conf->mddev)); From 27f26a0f3767b6688b9a88b9becb6f8e760421f3 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 20 Mar 2017 17:46:04 +0800 Subject: [PATCH 34/84] md/raid10: refactor some codes from raid10_write_request Previously, we clone both bio and repl_bio in raid10_write_request, then add the cloned bio to plug->pending or conf->pending_bio_list based on plug or not, and most of the logics are same for the two conditions. So introduce raid10_write_one_disk for it, and use replacement parameter to distinguish the difference. No functional changes in the patch. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 177 +++++++++++++++++++------------------------- 1 file changed, 76 insertions(+), 101 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0f1b78b38649..28c62e0c42aa 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1185,18 +1185,82 @@ read_again: return; } +static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, + struct bio *bio, bool replacement, + int n_copy, int max_sectors) +{ + const int op = bio_op(bio); + const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + const unsigned long do_fua = (bio->bi_opf & REQ_FUA); + unsigned long flags; + struct blk_plug_cb *cb; + struct raid10_plug_cb *plug = NULL; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev; + int devnum = r10_bio->devs[n_copy].devnum; + struct bio *mbio; + + if (replacement) { + rdev = conf->mirrors[devnum].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[devnum].rdev; + } + } else + rdev = conf->mirrors[devnum].rdev; + + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); + if (replacement) + r10_bio->devs[n_copy].repl_bio = mbio; + else + r10_bio->devs[n_copy].bio = mbio; + + mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + + choose_data_offset(r10_bio, rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + bio_set_op_attrs(mbio, op, do_sync | do_fua); + if (!replacement && test_bit(FailFast, + &conf->mirrors[devnum].rdev->flags) + && enough(conf, devnum)) + mbio->bi_opf |= MD_FAILFAST; + mbio->bi_private = r10_bio; + + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + mbio, disk_devt(conf->mddev->gendisk), + r10_bio->sector); + /* flush_pending_writes() needs access to the rdev so...*/ + mbio->bi_bdev = (void *)rdev; + + atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, cb); + else + plug = NULL; + spin_lock_irqsave(&conf->device_lock, flags); + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + 
bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!plug) + md_wakeup_thread(mddev->thread); +} + static void raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int i; - const int op = bio_op(bio); - const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); - const unsigned long do_fua = (bio->bi_opf & REQ_FUA); - unsigned long flags; struct md_rdev *blocked_rdev; - struct blk_plug_cb *cb; - struct raid10_plug_cb *plug = NULL; sector_t sectors; int sectors_handled; int max_sectors; @@ -1387,101 +1451,12 @@ retry_write: bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); for (i = 0; i < conf->copies; i++) { - struct bio *mbio; - int d = r10_bio->devs[i].devnum; - if (r10_bio->devs[i].bio) { - struct md_rdev *rdev = conf->mirrors[d].rdev; - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - r10_bio->devs[i].bio = mbio; - - mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, rdev)); - mbio->bi_bdev = rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua); - if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) && - enough(conf, d)) - mbio->bi_opf |= MD_FAILFAST; - mbio->bi_private = r10_bio; - - if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), - mbio, disk_devt(conf->mddev->gendisk), - r10_bio->sector); - /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void*)rdev; - - atomic_inc(&r10_bio->remaining); - - cb = blk_check_plugged(raid10_unplug, mddev, - sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, - cb); - else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - } - - if (r10_bio->devs[i].repl_bio) { - struct md_rdev *rdev = conf->mirrors[d].replacement; - if (rdev == NULL) { - /* Replacement just got moved to main 'rdev' */ - smp_mb(); - rdev = conf->mirrors[d].rdev; - } - mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - r10_bio->devs[i].repl_bio = mbio; - - mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr + - choose_data_offset(r10_bio, rdev)); - mbio->bi_bdev = rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - bio_set_op_attrs(mbio, op, do_sync | do_fua); - mbio->bi_private = r10_bio; - - if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), - mbio, disk_devt(conf->mddev->gendisk), - r10_bio->sector); - /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void*)rdev; - - atomic_inc(&r10_bio->remaining); - - cb = blk_check_plugged(raid10_unplug, mddev, - sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, - cb); - else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - } + 
if (r10_bio->devs[i].bio) + raid10_write_one_disk(mddev, r10_bio, bio, false, + i, max_sectors); + if (r10_bio->devs[i].repl_bio) + raid10_write_one_disk(mddev, r10_bio, bio, true, + i, max_sectors); } /* Don't remove the bias on 'remaining' (one_write_done) until From 3560741e316b3ea52cfb27901ae284921445180f Mon Sep 17 00:00:00 2001 From: Zhilong Liu Date: Wed, 15 Mar 2017 16:14:53 +0800 Subject: [PATCH 35/84] md: fix several trivial typos in comments Signed-off-by: Zhilong Liu Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 2 +- drivers/md/raid5.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index cc7bbd21813e..bf7419a56454 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -697,7 +697,7 @@ re_read: out: kunmap_atomic(sb); - /* Assiging chunksize is required for "re_read" */ + /* Assigning chunksize is required for "re_read" */ bitmap->mddev->bitmap_info.chunksize = chunksize; if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { err = md_setup_cluster(bitmap->mddev, nodes); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0b1a4339a437..266d661dc69b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2296,7 +2296,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) * pages have been transferred over, and the old kmem_cache is * freed when all stripes are done. * 3/ reallocate conf->disks to be suitable bigger. If this fails, - * we simple return a failre status - no need to clean anything up. + * we simple return a failure status - no need to clean anything up. * 4/ allocate new pages for the new slots in the new stripe_heads. * If this fails, we don't bother trying the shrink the * stripe_heads down again, we just leave them as they are. @@ -3558,7 +3558,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) /* Pre-reads at not permitted until after short delay * to gather multiple requests. However if this - * device is no Insync, the block could only be be computed + * device is no Insync, the block could only be computed * and there is no need to delay that. */ return 0; @@ -3577,7 +3577,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, /* If we are forced to do a reconstruct-write, either because * the current RAID6 implementation only supports that, or - * or because parity cannot be trusted and we are currently + * because parity cannot be trusted and we are currently * recovering it, there is extra need to be careful. * If one of the devices that we would need to read, because * it is not being overwritten (and maybe not written at all) From c85ba149de4bd14aa028ac824f9f12aeded28b86 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:22 +0800 Subject: [PATCH 36/84] md: raid1/raid10: don't handle failure of bio_add_page() All bio_add_page() is for adding one page into resync bio, which is big enough to hold RESYNC_PAGES pages, and the current bio_add_page() doesn't check queue limit any more, so it won't fail at all. 
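A standalone model of why the failure path is dead code (fake_add_page() and the other names are stand-ins invented for this sketch, not block-layer API): the bio is created with room for exactly RESYNC_PAGES entries and the loop adds at most that many, so the "table full" branch can never be taken.

#include <stdio.h>

#define RESYNC_PAGES 16

struct fake_bio {
	int vcnt;                    /* entries used, like bi_vcnt */
	void *vec[RESYNC_PAGES];     /* fixed-size vec table */
};

static int fake_add_page(struct fake_bio *b, void *page)
{
	if (b->vcnt >= RESYNC_PAGES)
		return 0;            /* would fail: table full */
	b->vec[b->vcnt++] = page;
	return 1;
}

int main(void)
{
	struct fake_bio bio = { .vcnt = 0 };
	static char pages[RESYNC_PAGES][4096];
	int failures = 0;

	for (int i = 0; i < RESYNC_PAGES; i++)
		if (!fake_add_page(&bio, pages[i]))
			failures++;

	printf("added %d pages, failures=%d\n", bio.vcnt, failures); /* 16, 0 */
	return 0;
}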
remove unused label (shaohua) Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 22 ++++++---------------- drivers/md/raid10.c | 41 ++++++++++------------------------------- 2 files changed, 16 insertions(+), 47 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 941f81063891..569f501fb710 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2894,28 +2894,18 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; if (bio->bi_end_io) { page = bio->bi_io_vec[bio->bi_vcnt].bv_page; - if (bio_add_page(bio, page, len, 0) == 0) { - /* stop here */ - bio->bi_io_vec[bio->bi_vcnt].bv_page = page; - while (i > 0) { - i--; - bio = r1_bio->bios[i]; - if (bio->bi_end_io==NULL) - continue; - /* remove last page from this bio */ - bio->bi_vcnt--; - bio->bi_iter.bi_size -= len; - bio_clear_flag(bio, BIO_SEG_VALID); - } - goto bio_full; - } + + /* + * won't fail because the vec table is big + * enough to hold all these pages + */ + bio_add_page(bio, page, len, 0); } } nr_sectors += len>>9; sector_nr += len>>9; sync_blocks -= (len>>9); } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); - bio_full: r1_bio->sectors = nr_sectors; if (mddev_is_clustered(mddev) && diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 28c62e0c42aa..6c9783ba85db 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3413,27 +3413,16 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; for (bio= biolist ; bio ; bio=bio->bi_next) { - struct bio *bio2; page = bio->bi_io_vec[bio->bi_vcnt].bv_page; - if (bio_add_page(bio, page, len, 0)) - continue; - - /* stop here */ - bio->bi_io_vec[bio->bi_vcnt].bv_page = page; - for (bio2 = biolist; - bio2 && bio2 != bio; - bio2 = bio2->bi_next) { - /* remove last page from this bio */ - bio2->bi_vcnt--; - bio2->bi_iter.bi_size -= len; - bio_clear_flag(bio2, BIO_SEG_VALID); - } - goto bio_full; + /* + * won't fail because the vec table is big enough + * to hold all these pages + */ + bio_add_page(bio, page, len, 0); } nr_sectors += len>>9; sector_nr += len>>9; } while (biolist->bi_vcnt < RESYNC_PAGES); - bio_full: r10_bio->sectors = nr_sectors; while (biolist) { @@ -4502,25 +4491,15 @@ read_more: if (len > PAGE_SIZE) len = PAGE_SIZE; for (bio = blist; bio ; bio = bio->bi_next) { - struct bio *bio2; - if (bio_add_page(bio, page, len, 0)) - continue; - - /* Didn't fit, must stop */ - for (bio2 = blist; - bio2 && bio2 != bio; - bio2 = bio2->bi_next) { - /* Remove last page from this bio */ - bio2->bi_vcnt--; - bio2->bi_iter.bi_size -= len; - bio_clear_flag(bio2, BIO_SEG_VALID); - } - goto bio_full; + /* + * won't fail because the vec table is big enough + * to hold all these pages + */ + bio_add_page(bio, page, len, 0); } sector_nr += len >> 9; nr_sectors += len >> 9; } -bio_full: rcu_read_unlock(); r10_bio->sectors = nr_sectors; From d8e29fbc3bed181f2653fb89ac8c34e40db39c30 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:23 +0800 Subject: [PATCH 37/84] md: move two macros into md.h Both raid1 and raid10 share common resync block size and page count, so move them into md.h. 
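As a quick sanity check of the shared constants (assuming 4 KiB pages for the arithmetic; the macro itself adapts to whatever PAGE_SIZE is), 64K of resync data needs 16 pages:

#include <stdio.h>

#define PAGE_SIZE 4096                 /* assumption for this example only */
#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

int main(void)
{
	printf("RESYNC_BLOCK_SIZE=%d RESYNC_PAGES=%d\n",
	       RESYNC_BLOCK_SIZE, RESYNC_PAGES);   /* 65536, 16 */
	return 0;
}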
Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/md.h | 5 +++++ drivers/md/raid1.c | 2 -- drivers/md/raid10.c | 3 --- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.h b/drivers/md/md.h index 7a7847d1cc39..31d2d70849b6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -718,4 +718,9 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) mddev->queue->limits.max_write_same_sectors = 0; } + +/* Maximum size of each resync request */ +#define RESYNC_BLOCK_SIZE (64*1024) +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) + #endif /* _MD_MD_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 569f501fb710..c31f9e206148 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -95,10 +95,8 @@ static void r1bio_pool_free(void *r1_bio, void *data) kfree(r1_bio); } -#define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6c9783ba85db..301e73fe2d2f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -125,9 +125,6 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } -/* Maximum size of each resync request */ -#define RESYNC_BLOCK_SIZE (64*1024) -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) /* maximum number of concurrent requests, memory permitting */ From 513e2faa0138462ce014e1b0e226ca45c83bc6c1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:24 +0800 Subject: [PATCH 38/84] md: prepare for managing resync I/O pages in clean way Now resync I/O uses the bio's bvec table to manage pages; this way is very hacky, and may not work any more once multipage bvec is introduced. So introduce helpers and new data structure for managing resync I/O pages more cleanly.
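To show how the helpers introduced below are intended to be driven, here is a standalone model with plain malloc'd buffers in place of struct page (all model_* names are invented for this sketch): allocate the fixed table once, walk it through the idx cursor with a fetch helper, and free everything at the end, mirroring resync_alloc_pages(), resync_fetch_page() and resync_free_pages().

#include <stdio.h>
#include <stdlib.h>

#define RESYNC_PAGES 16

struct resync_pages_model {
	unsigned idx;                  /* next page to hand out */
	void *pages[RESYNC_PAGES];
};

static int model_alloc_pages(struct resync_pages_model *rp)
{
	for (int i = 0; i < RESYNC_PAGES; i++) {
		rp->pages[i] = malloc(4096);
		if (!rp->pages[i]) {
			while (--i >= 0)       /* unwind on failure */
				free(rp->pages[i]);
			return -1;
		}
	}
	rp->idx = 0;
	return 0;
}

static void *model_fetch_page(struct resync_pages_model *rp)
{
	return rp->idx < RESYNC_PAGES ? rp->pages[rp->idx++] : NULL;
}

static void model_free_pages(struct resync_pages_model *rp)
{
	for (int i = 0; i < RESYNC_PAGES; i++)
		free(rp->pages[i]);
}

int main(void)
{
	struct resync_pages_model rp;
	void *page;
	int n = 0;

	if (model_alloc_pages(&rp))
		return 1;
	while ((page = model_fetch_page(&rp)) != NULL)
		n++;                    /* a real caller would fill each page */
	printf("filled %d pages\n", n); /* 16 */
	model_free_pages(&rp);
	return 0;
}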
Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/md.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/drivers/md/md.h b/drivers/md/md.h index 31d2d70849b6..0418b29945e7 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -723,4 +723,54 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) #define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +/* for managing resync I/O pages */ +struct resync_pages { + unsigned idx; /* for get/put page from the pool */ + void *raid_bio; + struct page *pages[RESYNC_PAGES]; +}; + +static inline int resync_alloc_pages(struct resync_pages *rp, + gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) { + rp->pages[i] = alloc_page(gfp_flags); + if (!rp->pages[i]) + goto out_free; + } + + return 0; + +out_free: + while (--i >= 0) + put_page(rp->pages[i]); + return -ENOMEM; +} + +static inline void resync_free_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + put_page(rp->pages[i]); +} + +static inline void resync_get_all_pages(struct resync_pages *rp) +{ + int i; + + for (i = 0; i < RESYNC_PAGES; i++) + get_page(rp->pages[i]); +} + +static inline struct page *resync_fetch_page(struct resync_pages *rp, + unsigned idx) +{ + if (WARN_ON_ONCE(idx >= RESYNC_PAGES)) + return NULL; + return rp->pages[idx]; +} + #endif /* _MD_MD_H */ From a7234234d0d6373d0510582ab632efbf73243403 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:25 +0800 Subject: [PATCH 39/84] md: raid1: simplify r1buf_pool_free() This patch gets each page's reference of each bio for resync, then r1buf_pool_free() gets simplified a lot. The same policy has been taken in raid10's buf pool allocation/free too. Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c31f9e206148..7ee0911fba7d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -142,10 +142,13 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) } /* If not user-requests, copy the page pointers to all bios */ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { - for (i=0; iraid_disks; j++) - r1_bio->bios[j]->bi_io_vec[i].bv_page = + for (i = 0; i< RESYNC_PAGES; i++) + for (j = 1; j < pi->raid_disks; j++) { + struct page *page = r1_bio->bios[0]->bi_io_vec[i].bv_page; + get_page(page); + r1_bio->bios[j]->bi_io_vec[i].bv_page = page; + } } r1_bio->master_bio = NULL; @@ -170,12 +173,8 @@ static void r1buf_pool_free(void *__r1_bio, void *data) struct r1bio *r1bio = __r1_bio; for (i = 0; i < RESYNC_PAGES; i++) - for (j = pi->raid_disks; j-- ;) { - if (j == 0 || - r1bio->bios[j]->bi_io_vec[i].bv_page != - r1bio->bios[0]->bi_io_vec[i].bv_page) - safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page); - } + for (j = pi->raid_disks; j-- ;) + safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page); for (i=0 ; i < pi->raid_disks; i++) bio_put(r1bio->bios[i]); From 98d30c5812c343c970b5997369b4f6b197c29b3d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:26 +0800 Subject: [PATCH 40/84] md: raid1: don't use bio's vec table to manage resync pages Now we allocate one page array for managing resync pages, instead of using bio's vec table to do that, and the old way is very hacky and won't work any more if multipage bvec is enabled. 
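The reference policy can be illustrated with a small standalone model (fake_page and the *_model helpers are invented for this sketch): every bio that shares the first bio's pages takes a reference up front, so the free path can simply drop one reference per bio without comparing page pointers, which is exactly why r1buf_pool_free() loses its special cases.

#include <stdio.h>

struct fake_page { int refcount; };

static void get_page_model(struct fake_page *p) { p->refcount++; }

static void put_page_model(struct fake_page *p)
{
	if (--p->refcount == 0)
		printf("page really freed\n");
}

int main(void)
{
	struct fake_page page = { .refcount = 1 };   /* owned by "bio 0" */
	int raid_disks = 3;

	for (int j = 1; j < raid_disks; j++)         /* bios 1..n-1 share it */
		get_page_model(&page);
	for (int j = 0; j < raid_disks; j++)         /* free path: one put each */
		put_page_model(&page);                   /* freed on the last put */
	return 0;
}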
The introduced cost is that we need to allocate (128 + 16) * raid_disks bytes per r1_bio, and it is fine because the inflight r1_bio for resync shouldn't be much, as pointed by Shaohua. Also the bio_reset() in raid1_sync_request() is removed because all bios are freshly new now and not necessary to reset any more. This patch can be thought as a cleanup too Suggested-by: Shaohua Li Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 93 +++++++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7ee0911fba7d..89a384bdae29 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -81,6 +81,24 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #define raid1_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) +/* + * 'strct resync_pages' stores actual pages used for doing the resync + * IO, and it is per-bio, so make .bi_private points to it. + */ +static inline struct resync_pages *get_resync_pages(struct bio *bio) +{ + return bio->bi_private; +} + +/* + * for resync bio, r1bio pointer can be retrieved from the per-bio + * 'struct resync_pages'. + */ +static inline struct r1bio *get_resync_r1bio(struct bio *bio) +{ + return get_resync_pages(bio)->raid_bio; +} + static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) { struct pool_info *pi = data; @@ -108,12 +126,18 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) struct r1bio *r1_bio; struct bio *bio; int need_pages; - int i, j; + int j; + struct resync_pages *rps; r1_bio = r1bio_pool_alloc(gfp_flags, pi); if (!r1_bio) return NULL; + rps = kmalloc(sizeof(struct resync_pages) * pi->raid_disks, + gfp_flags); + if (!rps) + goto out_free_r1bio; + /* * Allocate bios : 1 for reading, n-1 for writing */ @@ -133,22 +157,22 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) need_pages = pi->raid_disks; else need_pages = 1; - for (j = 0; j < need_pages; j++) { - bio = r1_bio->bios[j]; - bio->bi_vcnt = RESYNC_PAGES; + for (j = 0; j < pi->raid_disks; j++) { + struct resync_pages *rp = &rps[j]; - if (bio_alloc_pages(bio, gfp_flags)) - goto out_free_pages; - } - /* If not user-requests, copy the page pointers to all bios */ - if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { - for (i = 0; i< RESYNC_PAGES; i++) - for (j = 1; j < pi->raid_disks; j++) { - struct page *page = - r1_bio->bios[0]->bi_io_vec[i].bv_page; - get_page(page); - r1_bio->bios[j]->bi_io_vec[i].bv_page = page; - } + bio = r1_bio->bios[j]; + + if (j < need_pages) { + if (resync_alloc_pages(rp, gfp_flags)) + goto out_free_pages; + } else { + memcpy(rp, &rps[0], sizeof(*rp)); + resync_get_all_pages(rp); + } + + rp->idx = 0; + rp->raid_bio = r1_bio; + bio->bi_private = rp; } r1_bio->master_bio = NULL; @@ -157,11 +181,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) out_free_pages: while (--j >= 0) - bio_free_pages(r1_bio->bios[j]); + resync_free_pages(&rps[j]); out_free_bio: while (++j < pi->raid_disks) bio_put(r1_bio->bios[j]); + kfree(rps); + +out_free_r1bio: r1bio_pool_free(r1_bio, data); return NULL; } @@ -169,14 +196,18 @@ out_free_bio: static void r1buf_pool_free(void *__r1_bio, void *data) { struct pool_info *pi = data; - int i,j; + int i; struct r1bio *r1bio = __r1_bio; + struct resync_pages *rp = NULL; - for (i = 0; i < RESYNC_PAGES; i++) - for (j = pi->raid_disks; j-- ;) - safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page); - 
for (i=0 ; i < pi->raid_disks; i++) + for (i = pi->raid_disks; i--; ) { + rp = get_resync_pages(r1bio->bios[i]); + resync_free_pages(rp); bio_put(r1bio->bios[i]); + } + + /* resync pages array stored in the 1st bio's .bi_private */ + kfree(rp); r1bio_pool_free(r1bio, data); } @@ -1844,7 +1875,7 @@ abort: static void end_sync_read(struct bio *bio) { - struct r1bio *r1_bio = bio->bi_private; + struct r1bio *r1_bio = get_resync_r1bio(bio); update_head_pos(r1_bio->read_disk, r1_bio); @@ -1863,7 +1894,7 @@ static void end_sync_read(struct bio *bio) static void end_sync_write(struct bio *bio) { int uptodate = !bio->bi_error; - struct r1bio *r1_bio = bio->bi_private; + struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; sector_t first_bad; @@ -2080,6 +2111,7 @@ static void process_checks(struct r1bio *r1_bio) int size; int error; struct bio *b = r1_bio->bios[i]; + struct resync_pages *rp = get_resync_pages(b); if (b->bi_end_io != end_sync_read) continue; /* fixup the bio for reuse, but preserve errno */ @@ -2092,7 +2124,8 @@ static void process_checks(struct r1bio *r1_bio) conf->mirrors[i].rdev->data_offset; b->bi_bdev = conf->mirrors[i].rdev->bdev; b->bi_end_io = end_sync_read; - b->bi_private = r1_bio; + rp->raid_bio = r1_bio; + b->bi_private = rp; size = b->bi_iter.bi_size; for (j = 0; j < vcnt ; j++) { @@ -2746,7 +2779,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev; bio = r1_bio->bios[i]; - bio_reset(bio); rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev == NULL || @@ -2802,7 +2834,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; bio->bi_bdev = rdev->bdev; - bio->bi_private = r1_bio; if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; } @@ -2888,9 +2919,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } for (i = 0 ; i < conf->raid_disks * 2; i++) { + struct resync_pages *rp; + bio = r1_bio->bios[i]; + rp = get_resync_pages(bio); if (bio->bi_end_io) { - page = bio->bi_io_vec[bio->bi_vcnt].bv_page; + page = resync_fetch_page(rp, rp->idx++); /* * won't fail because the vec table is big @@ -2902,7 +2936,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, nr_sectors += len>>9; sector_nr += len>>9; sync_blocks -= (len>>9); - } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); + } while (get_resync_pages(r1_bio->bios[disk]->bi_private)->idx < RESYNC_PAGES); + r1_bio->sectors = nr_sectors; if (mddev_is_clustered(mddev) && From 44cf0f4dc76b5e44e6a9c727be6902434f99a9bd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:27 +0800 Subject: [PATCH 41/84] md: raid1: retrieve page from pre-allocated resync page array Now one page array is allocated for each resync bio, and we can retrieve page from this table directly. 
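As a minimal sketch of the new access pattern (illustrative only; example_resync_page() is a made-up name, and the bio is assumed to be a resync bio whose ->bi_private was set up by r1buf_pool_alloc()):

/*
 * Fetch the idx-th resync page of a resync bio from its pre-allocated
 * resync_pages array instead of reaching into the bvec table.
 */
static struct page *example_resync_page(struct bio *bio, unsigned idx)
{
        struct page **pages = get_resync_pages(bio)->pages;

        return pages[idx];      /* previously: bio->bi_io_vec[idx].bv_page */
}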
Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 89a384bdae29..21ef09ae5123 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1973,6 +1973,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; struct bio *bio = r1_bio->bios[r1_bio->read_disk]; + struct page **pages = get_resync_pages(bio)->pages; sector_t sect = r1_bio->sector; int sectors = r1_bio->sectors; int idx = 0; @@ -2006,7 +2007,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) */ rdev = conf->mirrors[d].rdev; if (sync_page_io(rdev, sect, s<<9, - bio->bi_io_vec[idx].bv_page, + pages[idx], REQ_OP_READ, 0, false)) { success = 1; break; @@ -2061,7 +2062,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) continue; rdev = conf->mirrors[d].rdev; if (r1_sync_page_io(rdev, sect, s, - bio->bi_io_vec[idx].bv_page, + pages[idx], WRITE) == 0) { r1_bio->bios[d]->bi_end_io = NULL; rdev_dec_pending(rdev, mddev); @@ -2076,7 +2077,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) continue; rdev = conf->mirrors[d].rdev; if (r1_sync_page_io(rdev, sect, s, - bio->bi_io_vec[idx].bv_page, + pages[idx], READ) != 0) atomic_add(s, &rdev->corrected_errors); } @@ -2152,6 +2153,8 @@ static void process_checks(struct r1bio *r1_bio) struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; int error = sbio->bi_error; + struct page **ppages = get_resync_pages(pbio)->pages; + struct page **spages = get_resync_pages(sbio)->pages; if (sbio->bi_end_io != end_sync_read) continue; @@ -2160,11 +2163,8 @@ static void process_checks(struct r1bio *r1_bio) if (!error) { for (j = vcnt; j-- ; ) { - struct page *p, *s; - p = pbio->bi_io_vec[j].bv_page; - s = sbio->bi_io_vec[j].bv_page; - if (memcmp(page_address(p), - page_address(s), + if (memcmp(page_address(ppages[j]), + page_address(spages[j]), sbio->bi_io_vec[j].bv_len)) break; } From 60928a91b0b3beca4a1cf2739118f967c783f79a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:28 +0800 Subject: [PATCH 42/84] md: raid1: use bio helper in process_checks() Avoid to direct access to bvec table. 
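The helper in question is bio_for_each_segment_all(); a minimal sketch of the pattern this patch switches to (illustrative only, example_total_len() is a made-up name):

/*
 * Walk every bvec of a bio with the iterator helper instead of
 * indexing bio->bi_io_vec[] by hand.
 */
static unsigned int example_total_len(struct bio *bio)
{
        struct bio_vec *bvec;
        unsigned int total = 0;
        int i;

        bio_for_each_segment_all(bvec, bio, i)
                total += bvec->bv_len;

        return total;
}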
Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 21ef09ae5123..d27b84666884 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2111,6 +2111,7 @@ static void process_checks(struct r1bio *r1_bio) int j; int size; int error; + struct bio_vec *bi; struct bio *b = r1_bio->bios[i]; struct resync_pages *rp = get_resync_pages(b); if (b->bi_end_io != end_sync_read) @@ -2129,9 +2130,7 @@ static void process_checks(struct r1bio *r1_bio) b->bi_private = rp; size = b->bi_iter.bi_size; - for (j = 0; j < vcnt ; j++) { - struct bio_vec *bi; - bi = &b->bi_io_vec[j]; + bio_for_each_segment_all(bi, b, j) { bi->bv_offset = 0; if (size > PAGE_SIZE) bi->bv_len = PAGE_SIZE; @@ -2155,17 +2154,22 @@ static void process_checks(struct r1bio *r1_bio) int error = sbio->bi_error; struct page **ppages = get_resync_pages(pbio)->pages; struct page **spages = get_resync_pages(sbio)->pages; + struct bio_vec *bi; + int page_len[RESYNC_PAGES]; if (sbio->bi_end_io != end_sync_read) continue; /* Now we can 'fixup' the error value */ sbio->bi_error = 0; + bio_for_each_segment_all(bi, sbio, j) + page_len[j] = bi->bv_len; + if (!error) { for (j = vcnt; j-- ; ) { if (memcmp(page_address(ppages[j]), page_address(spages[j]), - sbio->bi_io_vec[j].bv_len)) + page_len[j])) break; } } else From 6f8802852f7e58a12177a86179803b9efaad98e2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:29 +0800 Subject: [PATCH 43/84] block: introduce bio_copy_data_partial Turns out we can use bio_copy_data in raid1's write behind, and we can make alloc_behind_pages() more clean/efficient, but we need to partial version of bio_copy_data(). Signed-off-by: Ming Lei Reviewed-by: Jens Axboe Signed-off-by: Shaohua Li --- block/bio.c | 60 +++++++++++++++++++++++++++++++++++---------- include/linux/bio.h | 2 ++ 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/block/bio.c b/block/bio.c index e75878f8b14a..1ccff0dace89 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1025,19 +1025,8 @@ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_alloc_pages); -/** - * bio_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats - * @src and @dst as linked lists of bios. - * - * Stops when it reaches the end of either @src or @dst - that is, copies - * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). 
- */ -void bio_copy_data(struct bio *dst, struct bio *src) +static void __bio_copy_data(struct bio *dst, struct bio *src, + int offset, int size) { struct bvec_iter src_iter, dst_iter; struct bio_vec src_bv, dst_bv; @@ -1047,6 +1036,12 @@ void bio_copy_data(struct bio *dst, struct bio *src) src_iter = src->bi_iter; dst_iter = dst->bi_iter; + /* for supporting partial copy */ + if (offset || size != src->bi_iter.bi_size) { + bio_advance_iter(src, &src_iter, offset); + src_iter.bi_size = size; + } + while (1) { if (!src_iter.bi_size) { src = src->bi_next; @@ -1083,8 +1078,47 @@ void bio_copy_data(struct bio *dst, struct bio *src) bio_advance_iter(dst, &dst_iter, bytes); } } + +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + __bio_copy_data(dst, src, 0, src->bi_iter.bi_size); +} EXPORT_SYMBOL(bio_copy_data); +/** + * bio_copy_data_partial - copy partial contents of data buffers from one + * chain of bios to another + * @dst: destination bio list + * @src: source bio list + * @offset: starting copy from the offset + * @size: how many bytes to copy + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data_partial(struct bio *dst, struct bio *src, + int offset, int size) +{ + __bio_copy_data(dst, src, offset, size); + +} +EXPORT_SYMBOL(bio_copy_data_partial); + struct bio_map_data { int is_our_pages; struct iov_iter iter; diff --git a/include/linux/bio.h b/include/linux/bio.h index 8e521194f6fc..42b62a0288b0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -468,6 +468,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); +extern void bio_copy_data_partial(struct bio *dst, struct bio *src, + int offset, int size); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern void bio_free_pages(struct bio *bio); From d8c84c4f8becc1fb993911e18c8aef5ecf7a72ac Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:30 +0800 Subject: [PATCH 44/84] md: raid1: move 'offset' out of loop The 'offset' local variable can't be changed inside the loop, so move it out. 
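A condensed sketch of the resulting pattern in the raid1 write-behind path (illustrative only; example_copy_behind() is a made-up name, and bio_copy_data_partial() is the helper introduced by the previous patch). The offset of the r1_bio within the original bio does not change across the per-device loop, so it is computed once up front:

/*
 * Compute the byte offset once, outside the per-device loop, and reuse
 * it for the partial copy into the write-behind bio.
 */
static void example_copy_behind(struct bio *behind_bio, struct bio *bio,
                                sector_t r1_sector, int max_sectors)
{
        sector_t offset = r1_sector - bio->bi_iter.bi_sector;

        bio_copy_data_partial(behind_bio, bio, offset << 9,
                              max_sectors << 9);
}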
Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d27b84666884..64bf2005f082 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1294,6 +1294,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) int first_clone; int sectors_handled; int max_sectors; + sector_t offset; /* * Register the new request and wait if the reconstruction @@ -1439,13 +1440,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) atomic_set(&r1_bio->behind_remaining, 0); first_clone = 1; + + offset = r1_bio->sector - bio->bi_iter.bi_sector; for (i = 0; i < disks; i++) { struct bio *mbio = NULL; - sector_t offset; if (!r1_bio->bios[i]) continue; - offset = r1_bio->sector - bio->bi_iter.bi_sector; if (first_clone) { /* do behind I/O ? From 841c1316c7da6199a7df473893c141943991a756 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:31 +0800 Subject: [PATCH 45/84] md: raid1: improve write behind This patch improve handling of write behind in the following ways: - introduce behind master bio to hold all write behind pages - fast clone bios from behind master bio - avoid to change bvec table directly - use bio_copy_data() and make code more clean Suggested-by: Shaohua Li Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 118 +++++++++++++++++++++------------------------ drivers/md/raid1.h | 10 ++-- 2 files changed, 61 insertions(+), 67 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 64bf2005f082..c6a671f13bc0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -388,12 +388,9 @@ static void close_write(struct r1bio *r1_bio) { /* it really is the end of this request */ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = r1_bio->behind_page_count; - while (i--) - safe_put_page(r1_bio->behind_bvecs[i].bv_page); - kfree(r1_bio->behind_bvecs); - r1_bio->behind_bvecs = NULL; + bio_free_pages(r1_bio->behind_master_bio); + bio_put(r1_bio->behind_master_bio); + r1_bio->behind_master_bio = NULL; } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, @@ -495,6 +492,10 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { + /* we release behind master bio when all write are done */ + if (r1_bio->behind_master_bio == bio) + to_put = NULL; + if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1089,39 +1090,46 @@ static void unfreeze_array(struct r1conf *conf) wake_up(&conf->wait_barrier); } -/* duplicate the data pages for behind I/O - */ -static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) +static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, + struct bio *bio, + int offset, int size) { - int i; - struct bio_vec *bvec; - struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), - GFP_NOIO); - if (unlikely(!bvecs)) - return; + unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + int i = 0; + struct bio *behind_bio = NULL; - bio_for_each_segment_all(bvec, bio, i) { - bvecs[i] = *bvec; - bvecs[i].bv_page = alloc_page(GFP_NOIO); - if (unlikely(!bvecs[i].bv_page)) - goto do_sync_io; - memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset, - kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); - kunmap(bvecs[i].bv_page); - kunmap(bvec->bv_page); + behind_bio = bio_alloc_mddev(GFP_NOIO, 
vcnt, r1_bio->mddev); + if (!behind_bio) + goto fail; + + while (i < vcnt && size) { + struct page *page; + int len = min_t(int, PAGE_SIZE, size); + + page = alloc_page(GFP_NOIO); + if (unlikely(!page)) + goto free_pages; + + bio_add_page(behind_bio, page, len, 0); + + size -= len; + i++; } - r1_bio->behind_bvecs = bvecs; - r1_bio->behind_page_count = bio->bi_vcnt; - set_bit(R1BIO_BehindIO, &r1_bio->state); - return; -do_sync_io: - for (i = 0; i < bio->bi_vcnt; i++) - if (bvecs[i].bv_page) - put_page(bvecs[i].bv_page); - kfree(bvecs); + bio_copy_data_partial(behind_bio, bio, offset, + behind_bio->bi_iter.bi_size); + + r1_bio->behind_master_bio = behind_bio;; + set_bit(R1BIO_BehindIO, &r1_bio->state); + + return behind_bio; + +free_pages: pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_iter.bi_size); + bio_free_pages(behind_bio); +fail: + return behind_bio; } struct raid1_plug_cb { @@ -1457,11 +1465,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { - mbio = bio_clone_bioset_partial(bio, GFP_NOIO, - mddev->bio_set, - offset << 9, - max_sectors << 9); - alloc_behind_pages(mbio, r1_bio); + mbio = alloc_behind_master_bio(r1_bio, bio, + offset << 9, + max_sectors << 9); } bitmap_startwrite(bitmap, r1_bio->sector, @@ -1472,26 +1478,17 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) } if (!mbio) { - if (r1_bio->behind_bvecs) - mbio = bio_clone_bioset_partial(bio, GFP_NOIO, - mddev->bio_set, - offset << 9, - max_sectors << 9); + if (r1_bio->behind_master_bio) + mbio = bio_clone_fast(r1_bio->behind_master_bio, + GFP_NOIO, + mddev->bio_set); else { mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, offset, max_sectors); } } - if (r1_bio->behind_bvecs) { - struct bio_vec *bvec; - int j; - - /* - * We trimmed the bio, so _all is legit - */ - bio_for_each_segment_all(bvec, mbio, j) - bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; + if (r1_bio->behind_master_bio) { if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } @@ -2386,18 +2383,11 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) /* Write at 'sector' for 'sectors'*/ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - unsigned vcnt = r1_bio->behind_page_count; - struct bio_vec *vec = r1_bio->behind_bvecs; - - while (!vec->bv_page) { - vec++; - vcnt--; - } - - wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); - memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); - - wbio->bi_vcnt = vcnt; + wbio = bio_clone_fast(r1_bio->behind_master_bio, + GFP_NOIO, + mddev->bio_set); + /* We really need a _all clone */ + wbio->bi_iter = (struct bvec_iter){ 0 }; } else { wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, mddev->bio_set); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index dd22a37d0d83..4271cd7ac2de 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -153,9 +153,13 @@ struct r1bio { int read_disk; struct list_head retry_list; - /* Next two are only valid when R1BIO_BehindIO is set */ - struct bio_vec *behind_bvecs; - int behind_page_count; + + /* + * When R1BIO_BehindIO is set, we store pages for write behind + * in behind_master_bio. + */ + struct bio *behind_master_bio; + /* * if the IO is in WRITE direction, then multiple bios are used. * We choose the number when they are allocated. 
From 81fa152008ac903877b59bcc7d16777c3292c206 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:32 +0800 Subject: [PATCH 46/84] md: raid10: refactor code of read reshape's .bi_end_io reshape read request is a bit special and requires one extra bio which isn't allocated from r10buf_pool. Refactor the .bi_end_io for read reshape, so that we can use raid10's resync page mangement approach easily in the following patches. Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 301e73fe2d2f..94f5c368c683 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1889,17 +1889,9 @@ abort: return err; } -static void end_sync_read(struct bio *bio) +static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) { - struct r10bio *r10_bio = bio->bi_private; struct r10conf *conf = r10_bio->mddev->private; - int d; - - if (bio == r10_bio->master_bio) { - /* this is a reshape read */ - d = r10_bio->read_slot; /* really the read dev */ - } else - d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); if (!bio->bi_error) set_bit(R10BIO_Uptodate, &r10_bio->state); @@ -1923,6 +1915,22 @@ static void end_sync_read(struct bio *bio) } } +static void end_sync_read(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + struct r10conf *conf = r10_bio->mddev->private; + int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); + + __end_sync_read(r10_bio, bio, d); +} + +static void end_reshape_read(struct bio *bio) +{ + struct r10bio *r10_bio = bio->bi_private; + + __end_sync_read(r10_bio, bio, r10_bio->read_slot); +} + static void end_sync_request(struct r10bio *r10_bio) { struct mddev *mddev = r10_bio->mddev; @@ -4438,7 +4446,7 @@ read_more: read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset); read_bio->bi_private = r10_bio; - read_bio->bi_end_io = end_sync_read; + read_bio->bi_end_io = end_reshape_read; bio_set_op_attrs(read_bio, REQ_OP_READ, 0); read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); read_bio->bi_error = 0; From f0250618361db1447d66c494c6dd2df815f42c87 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:33 +0800 Subject: [PATCH 47/84] md: raid10: don't use bio's vec table to manage resync pages Now we allocate one page array for managing resync pages, instead of using bio's vec table to do that, and the old way is very hacky and won't work any more if multipage bvec is enabled. The introduced cost is that we need to allocate (128 + 16) * copies bytes per r10_bio, and it is fine because the inflight r10_bio for resync shouldn't be much, as pointed by Shaohua. Also bio_reset() in raid10_sync_request() and reshape_request() are removed because all bios are freshly new now in these functions and not necessary to reset any more. This patch can be thought as cleanup too. Suggested-by: Shaohua Li Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 138 +++++++++++++++++++++++++++----------------- 1 file changed, 84 insertions(+), 54 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 94f5c368c683..629ae45bb18e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -110,6 +110,24 @@ static void end_reshape(struct r10conf *conf); #define raid10_log(md, fmt, args...) 
\ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) +/* + * 'strct resync_pages' stores actual pages used for doing the resync + * IO, and it is per-bio, so make .bi_private points to it. + */ +static inline struct resync_pages *get_resync_pages(struct bio *bio) +{ + return bio->bi_private; +} + +/* + * for resync bio, r10bio pointer can be retrieved from the per-bio + * 'struct resync_pages'. + */ +static inline struct r10bio *get_resync_r10bio(struct bio *bio) +{ + return get_resync_pages(bio)->raid_bio; +} + static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; @@ -140,11 +158,11 @@ static void r10bio_pool_free(void *r10_bio, void *data) static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; - struct page *page; struct r10bio *r10_bio; struct bio *bio; - int i, j; - int nalloc; + int j; + int nalloc, nalloc_rp; + struct resync_pages *rps; r10_bio = r10bio_pool_alloc(gfp_flags, conf); if (!r10_bio) @@ -156,6 +174,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) else nalloc = 2; /* recovery */ + /* allocate once for all bios */ + if (!conf->have_replacement) + nalloc_rp = nalloc; + else + nalloc_rp = nalloc * 2; + rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags); + if (!rps) + goto out_free_r10bio; + /* * Allocate bios. */ @@ -175,36 +202,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) * Allocate RESYNC_PAGES data pages and attach them * where needed. */ - for (j = 0 ; j < nalloc; j++) { + for (j = 0; j < nalloc; j++) { struct bio *rbio = r10_bio->devs[j].repl_bio; - bio = r10_bio->devs[j].bio; - for (i = 0; i < RESYNC_PAGES; i++) { - if (j > 0 && !test_bit(MD_RECOVERY_SYNC, - &conf->mddev->recovery)) { - /* we can share bv_page's during recovery - * and reshape */ - struct bio *rbio = r10_bio->devs[0].bio; - page = rbio->bi_io_vec[i].bv_page; - get_page(page); - } else - page = alloc_page(gfp_flags); - if (unlikely(!page)) - goto out_free_pages; + struct resync_pages *rp, *rp_repl; - bio->bi_io_vec[i].bv_page = page; - if (rbio) - rbio->bi_io_vec[i].bv_page = page; + rp = &rps[j]; + if (rbio) + rp_repl = &rps[nalloc + j]; + + bio = r10_bio->devs[j].bio; + + if (!j || test_bit(MD_RECOVERY_SYNC, + &conf->mddev->recovery)) { + if (resync_alloc_pages(rp, gfp_flags)) + goto out_free_pages; + } else { + memcpy(rp, &rps[0], sizeof(*rp)); + resync_get_all_pages(rp); + } + + rp->idx = 0; + rp->raid_bio = r10_bio; + bio->bi_private = rp; + if (rbio) { + memcpy(rp_repl, rp, sizeof(*rp)); + rbio->bi_private = rp_repl; } } return r10_bio; out_free_pages: - for ( ; i > 0 ; i--) - safe_put_page(bio->bi_io_vec[i-1].bv_page); - while (j--) - for (i = 0; i < RESYNC_PAGES ; i++) - safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); + while (--j >= 0) + resync_free_pages(&rps[j * 2]); + j = 0; out_free_bio: for ( ; j < nalloc; j++) { @@ -213,30 +244,34 @@ out_free_bio: if (r10_bio->devs[j].repl_bio) bio_put(r10_bio->devs[j].repl_bio); } + kfree(rps); +out_free_r10bio: r10bio_pool_free(r10_bio, conf); return NULL; } static void r10buf_pool_free(void *__r10_bio, void *data) { - int i; struct r10conf *conf = data; struct r10bio *r10bio = __r10_bio; int j; + struct resync_pages *rp = NULL; - for (j=0; j < conf->copies; j++) { + for (j = conf->copies; j--; ) { struct bio *bio = r10bio->devs[j].bio; - if (bio) { - for (i = 0; i < RESYNC_PAGES; i++) { - safe_put_page(bio->bi_io_vec[i].bv_page); - bio->bi_io_vec[i].bv_page = NULL; - } - 
bio_put(bio); - } + + rp = get_resync_pages(bio); + resync_free_pages(rp); + bio_put(bio); + bio = r10bio->devs[j].repl_bio; if (bio) bio_put(bio); } + + /* resync pages array stored in the 1st bio's .bi_private */ + kfree(rp); + r10bio_pool_free(r10bio, conf); } @@ -1917,7 +1952,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) static void end_sync_read(struct bio *bio) { - struct r10bio *r10_bio = bio->bi_private; + struct r10bio *r10_bio = get_resync_r10bio(bio); struct r10conf *conf = r10_bio->mddev->private; int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); @@ -1926,6 +1961,7 @@ static void end_sync_read(struct bio *bio) static void end_reshape_read(struct bio *bio) { + /* reshape read bio isn't allocated from r10buf_pool */ struct r10bio *r10_bio = bio->bi_private; __end_sync_read(r10_bio, bio, r10_bio->read_slot); @@ -1960,7 +1996,7 @@ static void end_sync_request(struct r10bio *r10_bio) static void end_sync_write(struct bio *bio) { - struct r10bio *r10_bio = bio->bi_private; + struct r10bio *r10_bio = get_resync_r10bio(bio); struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; @@ -2040,6 +2076,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) for (i=0 ; i < conf->copies ; i++) { int j, d; struct md_rdev *rdev; + struct resync_pages *rp; tbio = r10_bio->devs[i].bio; @@ -2081,11 +2118,13 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ + rp = get_resync_pages(tbio); bio_reset(tbio); tbio->bi_vcnt = vcnt; tbio->bi_iter.bi_size = fbio->bi_iter.bi_size; - tbio->bi_private = r10_bio; + rp->raid_bio = r10_bio; + tbio->bi_private = rp; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; tbio->bi_end_io = end_sync_write; bio_set_op_attrs(tbio, REQ_OP_WRITE, 0); @@ -3149,10 +3188,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } bio = r10_bio->devs[0].bio; - bio_reset(bio); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); if (test_bit(FailFast, &rdev->flags)) @@ -3176,10 +3213,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (!test_bit(In_sync, &mrdev->flags)) { bio = r10_bio->devs[1].bio; - bio_reset(bio); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr @@ -3204,10 +3239,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mreplace == NULL || bio == NULL || test_bit(Faulty, &mreplace->flags)) break; - bio_reset(bio); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + @@ -3329,7 +3362,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[i].repl_bio->bi_end_io = NULL; bio = r10_bio->devs[i].bio; - bio_reset(bio); bio->bi_error = -EIO; rcu_read_lock(); rdev = rcu_dereference(conf->mirrors[d].rdev); @@ -3354,7 +3386,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_inc(&r10_bio->remaining); bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); if 
(test_bit(FailFast, &conf->mirrors[d].rdev->flags)) @@ -3373,13 +3404,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; - bio_reset(bio); bio->bi_error = -EIO; sector = r10_bio->devs[i].addr; bio->bi_next = biolist; biolist = bio; - bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) @@ -3418,7 +3447,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; for (bio= biolist ; bio ; bio=bio->bi_next) { - page = bio->bi_io_vec[bio->bi_vcnt].bv_page; + struct resync_pages *rp = get_resync_pages(bio); + page = resync_fetch_page(rp, rp->idx++); /* * won't fail because the vec table is big enough * to hold all these pages @@ -3427,7 +3457,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } nr_sectors += len>>9; sector_nr += len>>9; - } while (biolist->bi_vcnt < RESYNC_PAGES); + } while (get_resync_pages(biolist)->idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; while (biolist) { @@ -3435,7 +3465,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, biolist = biolist->bi_next; bio->bi_next = NULL; - r10_bio = bio->bi_private; + r10_bio = get_resync_r10bio(bio); r10_bio->sectors = nr_sectors; if (bio->bi_end_io == end_sync_read) { @@ -4326,6 +4356,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, struct bio *blist; struct bio *bio, *read_bio; int sectors_done = 0; + struct page **pages; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -4476,11 +4507,9 @@ read_more: if (!rdev2 || test_bit(Faulty, &rdev2->flags)) continue; - bio_reset(b); b->bi_bdev = rdev2->bdev; b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; - b->bi_private = r10_bio; b->bi_end_io = end_reshape_write; bio_set_op_attrs(b, REQ_OP_WRITE, 0); b->bi_next = blist; @@ -4490,8 +4519,9 @@ read_more: /* Now add as many pages as possible to all of these bios. */ nr_sectors = 0; + pages = get_resync_pages(r10_bio->devs[0].bio)->pages; for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { - struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; + struct page *page = pages[s / (PAGE_SIZE >> 9)]; int len = (max_sectors - s) << 9; if (len > PAGE_SIZE) len = PAGE_SIZE; @@ -4675,7 +4705,7 @@ static int handle_reshape_read_error(struct mddev *mddev, static void end_reshape_write(struct bio *bio) { - struct r10bio *r10_bio = bio->bi_private; + struct r10bio *r10_bio = get_resync_r10bio(bio); struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; From cdb76be31568604f389f951bd0efadd3f530f2dd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:34 +0800 Subject: [PATCH 48/84] md: raid10: retrieve page from preallocated resync page array Now one page array is allocated for each resync bio, and we can retrieve page from this table directly. 
Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 629ae45bb18e..827bb5bef53c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2055,6 +2055,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) int i, first; struct bio *tbio, *fbio; int vcnt; + struct page **tpages, **fpages; atomic_set(&r10_bio->remaining, 1); @@ -2070,6 +2071,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) fbio = r10_bio->devs[i].bio; fbio->bi_iter.bi_size = r10_bio->sectors << 9; fbio->bi_iter.bi_idx = 0; + fpages = get_resync_pages(fbio)->pages; vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); /* now find blocks with errors */ @@ -2084,6 +2086,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) continue; if (i == first) continue; + + tpages = get_resync_pages(tbio)->pages; d = r10_bio->devs[i].devnum; rdev = conf->mirrors[d].rdev; if (!r10_bio->devs[i].bio->bi_error) { @@ -2096,8 +2100,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) int len = PAGE_SIZE; if (sectors < (len / 512)) len = sectors * 512; - if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), - page_address(tbio->bi_io_vec[j].bv_page), + if (memcmp(page_address(fpages[j]), + page_address(tpages[j]), len)) break; sectors -= len/512; @@ -2195,6 +2199,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) int idx = 0; int dr = r10_bio->devs[0].devnum; int dw = r10_bio->devs[1].devnum; + struct page **pages = get_resync_pages(bio)->pages; while (sectors) { int s = sectors; @@ -2210,7 +2215,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) ok = sync_page_io(rdev, addr, s << 9, - bio->bi_io_vec[idx].bv_page, + pages[idx], REQ_OP_READ, 0, false); if (ok) { rdev = conf->mirrors[dw].rdev; @@ -2218,7 +2223,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) ok = sync_page_io(rdev, addr, s << 9, - bio->bi_io_vec[idx].bv_page, + pages[idx], REQ_OP_WRITE, 0, false); if (!ok) { set_bit(WriteErrorSeen, &rdev->flags); From 2d06e3b7145bb08705615e6e7400024d8e36a5c0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:35 +0800 Subject: [PATCH 49/84] md: raid10: avoid direct access to bvec table in handle_reshape_read_error All reshape I/O share pages from 1st copy device, so just use that pages for avoiding direct access to bvec table in handle_reshape_read_error. 
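In sketch form (illustrative only; example_reshape_pages() is a made-up name), the reason this is safe is that r10buf_pool_alloc() duplicates the first copy's resync_pages for the other copies and only takes extra page references, so every copy ends up pointing at the same pages:

/* reshape I/O shares pages with .devs[0].bio, so fetch them from there */
static struct page **example_reshape_pages(struct r10bio *r10_bio)
{
        return get_resync_pages(r10_bio->devs[0].bio)->pages;
}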
Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 827bb5bef53c..0f13d57ef646 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4653,7 +4653,10 @@ static int handle_reshape_read_error(struct mddev *mddev, struct r10bio *r10b = &on_stack.r10_bio; int slot = 0; int idx = 0; - struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; + struct page **pages; + + /* reshape IOs share pages from .devs[0].bio */ + pages = get_resync_pages(r10_bio->devs[0].bio)->pages; r10b->sector = r10_bio->sector; __raid10_find_phys(&conf->prev, r10b); @@ -4682,7 +4685,7 @@ static int handle_reshape_read_error(struct mddev *mddev, success = sync_page_io(rdev, addr, s << 9, - bvec[idx].bv_page, + pages[idx], REQ_OP_READ, 0, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); From f45958756fef552436e4a63029a168495920026e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 24 Mar 2017 10:34:43 -0700 Subject: [PATCH 50/84] block: remove bio_clone_bioset_partial() commit c18a1e0(block: introduce bio_clone_bioset_partial()) introduced bio_clone_bioset_partial() for raid1 write behind IO. Now the write behind is rewritten by Ming. We don't need the API any more, so revert the commit. Cc: Christoph Hellwig Reviewed-by: Jens Axboe Reviewed-by: Ming Lei Signed-off-by: Shaohua Li --- block/bio.c | 61 ++++++++++----------------------------------- include/linux/bio.h | 11 ++------ 2 files changed, 15 insertions(+), 57 deletions(-) diff --git a/block/bio.c b/block/bio.c index 1ccff0dace89..036435995c55 100644 --- a/block/bio.c +++ b/block/bio.c @@ -631,20 +631,21 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) } EXPORT_SYMBOL(bio_clone_fast); -static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs, int offset, - int size) +/** + * bio_clone_bioset - clone a bio + * @bio_src: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. + */ +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) { struct bvec_iter iter; struct bio_vec bv; struct bio *bio; - struct bvec_iter iter_src = bio_src->bi_iter; - - /* for supporting partial clone */ - if (offset || size != bio_src->bi_iter.bi_size) { - bio_advance_iter(bio_src, &iter_src, offset); - iter_src.bi_size = size; - } /* * Pre immutable biovecs, __bio_clone() used to just do a memcpy from @@ -668,8 +669,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, * __bio_clone_fast() anyways. 
*/ - bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src, - &iter_src), bs); + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; @@ -686,7 +686,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; break; default: - __bio_for_each_segment(bv, bio_src, iter, iter_src) + bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; break; } @@ -705,43 +705,8 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, return bio; } - -/** - * bio_clone_bioset - clone a bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) -{ - return __bio_clone_bioset(bio_src, gfp_mask, bs, 0, - bio_src->bi_iter.bi_size); -} EXPORT_SYMBOL(bio_clone_bioset); -/** - * bio_clone_bioset_partial - clone a partial bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * @offset: cloned starting from the offset - * @size: size for the cloned bio - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs, int offset, - int size) -{ - return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size); -} -EXPORT_SYMBOL(bio_clone_bioset_partial); - /** * bio_add_pc_page - attempt to add page to bio * @q: the target queue diff --git a/include/linux/bio.h b/include/linux/bio.h index 42b62a0288b0..fafef6343d1b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) -static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) +static inline unsigned bio_segments(struct bio *bio) { unsigned segs = 0; struct bio_vec bv; @@ -205,17 +205,12 @@ static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) break; } - __bio_for_each_segment(bv, bio, iter, *bvec) + bio_for_each_segment(bv, bio, iter) segs++; return segs; } -static inline unsigned bio_segments(struct bio *bio) -{ - return __bio_segments(bio, &bio->bi_iter); -} - /* * get a reference to a bio, so it won't disappear. the intended use is * something like: @@ -389,8 +384,6 @@ extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); -extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t, - struct bio_set *, int, int); extern struct bio_set *fs_bio_set; From 41743c1f046a14c6749fd1808bb3793c08e47a3e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 24 Mar 2017 15:20:47 -0700 Subject: [PATCH 51/84] md/raid1: skip data copy for behind io for discard request discard request doesn't have data attached, so it's meaningless to allocate memory and copy from original bio for behind IO. And the copy is bogus because bio_copy_data_partial can't handle discard request. 
We don't support writesame/writezeros request so far. Reviewed-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c6a671f13bc0..b7d9651286d4 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1102,6 +1102,10 @@ static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, if (!behind_bio) goto fail; + /* discard op, we don't support writezero/writesame yet */ + if (!bio_has_data(bio)) + goto skip_copy; + while (i < vcnt && size) { struct page *page; int len = min_t(int, PAGE_SIZE, size); @@ -1118,7 +1122,7 @@ static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, bio_copy_data_partial(behind_bio, bio, offset, behind_bio->bi_iter.bi_size); - +skip_copy: r1_bio->behind_master_bio = behind_bio;; set_bit(R1BIO_BehindIO, &r1_bio->state); From 1ad45a9bc4e0cd5a6e6fb0e6c5d35d6c87f14c76 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Sat, 25 Mar 2017 09:44:39 +0800 Subject: [PATCH 52/84] md/raid5-cache: fix payload endianness problem in raid5-cache The payload->header.type and payload->size are little-endian, so just convert them to the right byte order. Signed-off-by: Jason Yan Cc: #v4.10+ Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 25eb048298fe..b6194e082e48 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2002,12 +2002,12 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, payload = (void *)mb + mb_offset; payload_flush = (void *)mb + mb_offset; - if (payload->header.type == R5LOG_PAYLOAD_DATA) { + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) goto mismatch; - } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { + } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) { if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) @@ -2019,12 +2019,12 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, BLOCK_SECTORS), payload->checksum[1]) < 0) goto mismatch; - } else if (payload->header.type == R5LOG_PAYLOAD_FLUSH) { + } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { /* nothing to do for R5LOG_PAYLOAD_FLUSH here */ } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ goto mismatch; - if (payload->header.type == R5LOG_PAYLOAD_FLUSH) { + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { mb_offset += sizeof(struct r5l_payload_flush) + le32_to_cpu(payload_flush->size); } else { @@ -2091,7 +2091,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, payload = (void *)mb + mb_offset; payload_flush = (void *)mb + mb_offset; - if (payload->header.type == R5LOG_PAYLOAD_FLUSH) { + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { int i, count; count = le32_to_cpu(payload_flush->size) / sizeof(__le64); @@ -2113,7 +2113,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, } /* DATA or PARITY payload */ - stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? + stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ? 
raid5_compute_sector( conf, le64_to_cpu(payload->location), 0, &dd, NULL) @@ -2151,7 +2151,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, list_add_tail(&sh->lru, cached_stripe_list); } - if (payload->header.type == R5LOG_PAYLOAD_DATA) { + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { r5l_recovery_replay_one_stripe(conf, sh, ctx); @@ -2159,7 +2159,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, } r5l_recovery_load_data(log, sh, ctx, payload, log_offset); - } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) + } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) r5l_recovery_load_parity(log, sh, ctx, payload, log_offset); else @@ -2361,7 +2361,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, payload = (void *)mb + offset; payload->header.type = cpu_to_le16( R5LOG_PAYLOAD_DATA); - payload->size = BLOCK_SECTORS; + payload->size = cpu_to_le32(BLOCK_SECTORS); payload->location = cpu_to_le64( raid5_compute_blocknr(sh, i, 0)); addr = kmap_atomic(dev->page); From 0bb0c10500ba634216238c40e1eeddce92b4d488 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 27 Mar 2017 10:51:33 -0700 Subject: [PATCH 53/84] md/raid5: use consistency_policy to remove journal feature When journal device of an array fails, the array is forced into read-only mode. To make the array normal without adding another journal device, we need to remove journal _feature_ from the array. This patch allows remove journal _feature_ from an array, For journal existing journal should be either missing or faulty. To remove journal feature, it is necessary to remove the journal device first: mdadm --fail /dev/md0 /dev/sdb mdadm: set /dev/sdb faulty in /dev/md0 mdadm --remove /dev/md0 /dev/sdb mdadm: hot removed /dev/sdb from /dev/md0 Then the journal feature can be removed by echoing into the sysfs file: cat /sys/block/md0/md/consistency_policy journal echo resync > /sys/block/md0/md/consistency_policy cat /sys/block/md0/md/consistency_policy resync Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 266d661dc69b..6036d5e41ddd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8292,17 +8292,41 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) } if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) { - mddev_suspend(mddev); - set_bit(MD_HAS_PPL, &mddev->flags); - err = log_init(conf, NULL); - if (!err) + /* ppl only works with RAID 5 */ + if (conf->level == 5) { + mddev_suspend(mddev); + set_bit(MD_HAS_PPL, &mddev->flags); + err = log_init(conf, NULL); + if (!err) + raid5_reset_stripe_cache(mddev); + mddev_resume(mddev); + } else + err = -EINVAL; + } else if (strncmp(buf, "resync", 6) == 0) { + if (raid5_has_ppl(conf)) { + mddev_suspend(mddev); + log_exit(conf); raid5_reset_stripe_cache(mddev); - mddev_resume(mddev); - } else if (strncmp(buf, "resync", 6) == 0 && raid5_has_ppl(conf)) { - mddev_suspend(mddev); - log_exit(conf); - raid5_reset_stripe_cache(mddev); - mddev_resume(mddev); + mddev_resume(mddev); + } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && + r5l_log_disk_error(conf)) { + bool journal_dev_exists = false; + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (test_bit(Journal, &rdev->flags)) { + 
journal_dev_exists = true; + break; + } + + if (!journal_dev_exists) { + mddev_suspend(mddev); + clear_bit(MD_HAS_JOURNAL, &mddev->flags); + mddev_resume(mddev); + } else /* need remove journal device first */ + err = -EBUSY; + } else + err = -EINVAL; } else { err = -EINVAL; } @@ -8337,6 +8361,7 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .congested = raid5_congested, + .change_consistency_policy = raid5_change_consistency_policy, }; static struct md_personality raid5_personality = { @@ -8385,6 +8410,7 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .congested = raid5_congested, + .change_consistency_policy = raid5_change_consistency_policy, }; static int __init raid5_init(void) From 8fc04e6ea02d631fd344f462002078b8067793de Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 28 Mar 2017 16:17:55 +0800 Subject: [PATCH 54/84] md: raid1: kill warning on powerpc_pseries This patch kills the warning reported on powerpc_pseries, and actually we don't need the initialization. After merging the md tree, today's linux-next build (powerpc pseries_le_defconfig) produced this warning: drivers/md/raid1.c: In function 'raid1d': drivers/md/raid1.c:2172:9: warning: 'page_len$' may be used uninitialized in this function [-Wmaybe-uninitialized] if (memcmp(page_address(ppages[j]), ^ drivers/md/raid1.c:2160:7: note: 'page_len$' was declared here int page_len[RESYNC_PAGES]; ^ Signed-off-by: Ming Lei Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b7d9651286d4..7d6723558fd8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2157,7 +2157,7 @@ static void process_checks(struct r1bio *r1_bio) struct page **ppages = get_resync_pages(pbio)->pages; struct page **spages = get_resync_pages(sbio)->pages; struct bio_vec *bi; - int page_len[RESYNC_PAGES]; + int page_len[RESYNC_PAGES] = { 0 }; if (sbio->bi_end_io != end_sync_read) continue; From 583da48e388f472e8818d9bb60ef6a1d40ee9f9d Mon Sep 17 00:00:00 2001 From: Dennis Yang Date: Wed, 29 Mar 2017 15:46:13 +0800 Subject: [PATCH 55/84] md: update slab_cache before releasing new stripes when stripes resizing When growing raid5 device on machine with small memory, there is chance that mdadm will be killed and the following bug report can be observed. The same bug could also be reproduced in linux-4.10.6. [57600.075774] BUG: unable to handle kernel NULL pointer dereference at (null) [57600.083796] IP: [] _raw_spin_lock+0x7/0x20 [57600.110378] PGD 421cf067 PUD 4442d067 PMD 0 [57600.114678] Oops: 0002 [#1] SMP [57600.180799] CPU: 1 PID: 25990 Comm: mdadm Tainted: P O 4.2.8 #1 [57600.187849] Hardware name: To be filled by O.E.M. 
To be filled by O.E.M./MAHOBAY, BIOS QV05AR66 03/06/2013 [57600.197490] task: ffff880044e47240 ti: ffff880043070000 task.ti: ffff880043070000 [57600.204963] RIP: 0010:[] [] _raw_spin_lock+0x7/0x20 [57600.213057] RSP: 0018:ffff880043073810 EFLAGS: 00010046 [57600.218359] RAX: 0000000000000000 RBX: 000000000000000c RCX: ffff88011e296dd0 [57600.225486] RDX: 0000000000000001 RSI: ffffe8ffffcb46c0 RDI: 0000000000000000 [57600.232613] RBP: ffff880043073878 R08: ffff88011e5f8170 R09: 0000000000000282 [57600.239739] R10: 0000000000000005 R11: 28f5c28f5c28f5c3 R12: ffff880043073838 [57600.246872] R13: ffffe8ffffcb46c0 R14: 0000000000000000 R15: ffff8800b9706a00 [57600.253999] FS: 00007f576106c700(0000) GS:ffff88011e280000(0000) knlGS:0000000000000000 [57600.262078] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [57600.267817] CR2: 0000000000000000 CR3: 00000000428fe000 CR4: 00000000001406e0 [57600.274942] Stack: [57600.276949] ffffffff8114ee35 ffff880043073868 0000000000000282 000000000000eb3f [57600.284383] ffffffff81119043 ffff880043073838 ffff880043073838 ffff88003e197b98 [57600.291820] ffffe8ffffcb46c0 ffff88003e197360 0000000000000286 ffff880043073968 [57600.299254] Call Trace: [57600.301698] [] ? cache_flusharray+0x35/0xe0 [57600.307523] [] ? __page_cache_release+0x23/0x110 [57600.313779] [] kmem_cache_free+0x63/0xc0 [57600.319344] [] drop_one_stripe+0x62/0x90 [57600.324915] [] raid5_cache_scan+0x8b/0xb0 [57600.330563] [] shrink_slab.part.36+0x19a/0x250 [57600.336650] [] shrink_zone+0x23c/0x250 [57600.342039] [] do_try_to_free_pages+0x153/0x420 [57600.348210] [] try_to_free_pages+0x91/0xa0 [57600.353959] [] __alloc_pages_nodemask+0x4d1/0x8b0 [57600.360303] [] check_reshape+0x62b/0x770 [57600.365866] [] raid5_check_reshape+0x55/0xa0 [57600.371778] [] update_raid_disks+0xc7/0x110 [57600.377604] [] md_ioctl+0xd83/0x1b10 [57600.382827] [] blkdev_ioctl+0x170/0x690 [57600.388307] [] block_ioctl+0x38/0x40 [57600.393525] [] do_vfs_ioctl+0x2b5/0x480 [57600.399010] [] ? vfs_write+0x14b/0x1f0 [57600.404400] [] SyS_ioctl+0x3c/0x70 [57600.409447] [] entry_SYSCALL_64_fastpath+0x12/0x6a [57600.415875] Code: 00 00 00 00 55 48 89 e5 8b 07 85 c0 74 04 31 c0 5d c3 ba 01 00 00 00 f0 0f b1 17 85 c0 75 ef b0 01 5d c3 90 31 c0 ba 01 00 00 00 0f b1 17 85 c0 75 01 c3 55 89 c6 48 89 e5 e8 85 d1 63 ff 5d [57600.435460] RIP [] _raw_spin_lock+0x7/0x20 [57600.441208] RSP [57600.444690] CR2: 0000000000000000 [57600.448000] ---[ end trace cbc6b5cc4bf9831d ]--- The problem is that resize_stripes() releases new stripe_heads before assigning new slab cache to conf->slab_cache. If the shrinker function raid5_cache_scan() gets called after resize_stripes() starting releasing new stripes but right before new slab cache being assigned, it is possible that these new stripe_heads will be freed with the old slab_cache which was already been destoryed and that triggers this bug. 
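In simplified form the fix is an ordering change (sketch only, not the actual resize_stripes() code; example_swap_cache() and example_release_new_stripes() are made-up names standing in for step 4 of resize_stripes()):

static void example_swap_cache(struct r5conf *conf, struct kmem_cache *sc,
                               struct list_head *newstripes)
{
        /*
         * Publish the new cache first: once stripes allocated from 'sc'
         * are visible to the shrinker, raid5_cache_scan() must free them
         * with the new cache, not with the old one that is about to be
         * destroyed.
         */
        conf->slab_cache = sc;
        conf->active_name = 1 - conf->active_name;

        /* only now return the new stripes to service */
        example_release_new_stripes(conf, newstripes);
}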
Signed-off-by: Dennis Yang Fixes: edbe83ab4c27 ("md/raid5: allow the stripe_cache to grow and shrink.") Cc: stable@vger.kernel.org (4.1+) Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6036d5e41ddd..a5676559e7a6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2409,6 +2409,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) err = -ENOMEM; mutex_unlock(&conf->cache_size_mutex); + + conf->slab_cache = sc; + conf->active_name = 1-conf->active_name; + /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); @@ -2426,8 +2430,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) } /* critical section pass, GFP_NOIO no longer needed */ - conf->slab_cache = sc; - conf->active_name = 1-conf->active_name; if (!err) conf->pool_size = newsize; return err; From 7471fb77ce4dc4cb81291189947fcdf621a97987 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Apr 2017 12:11:32 +1000 Subject: [PATCH 56/84] md/raid6: Fix anomaly when recovering a single device in RAID6. When recovering a single missing/failed device in a RAID6, those stripes where the Q block is on the missing device are handled a bit differently. In these cases it is easy to check that the P block is correct, so we do. This results in the P block being destroyed. Consequently the P block needs to be read a second time in order to compute Q. This causes lots of seeks and hurts performance. It shouldn't be necessary to re-read P as it can be computed from the DATA. But we only compute blocks on missing devices, since c337869d9501 ("md: do not compute parity unless it is on a failed drive"). So relax the change made in that commit to allow computing of the P block in a RAID6 in which it is the only missing block. This makes RAID6 recovery run much faster as the disk just "before" the recovering device is no longer seeking back-and-forth. Reported-by-tested-by: Brad Campbell Reviewed-by: Dan Williams Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5676559e7a6..09d94ad5e52b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3619,9 +3619,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); BUG_ON(test_bit(R5_Wantread, &dev->flags)); BUG_ON(sh->batch_head); + + /* + * In the raid6 case if the only non-uptodate disk is P + * then we already trusted P to compute the other failed + * drives. It is safe to compute rather than re-read P. + * In other cases we only compute blocks from failed + * devices, otherwise check/repair might fail to detect + * a real inconsistency.
+ */ + if ((s->uptodate == disks - 1) && + ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || (s->failed && (disk_idx == s->failed_num[0] || - disk_idx == s->failed_num[1]))) { + disk_idx == s->failed_num[1])))) { /* have disk failed, and we're requested to fetch it; * do compute it */ From 6f287ca6046edd34ed83aafb7f9033c9c2e809e2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 6 Apr 2017 09:12:18 +0800 Subject: [PATCH 57/84] md/raid10: reset the 'first' at the end of loop We need to set "first = 0" at the end of the rdev_for_each loop, so we can get the array's min_offset_diff correctly; otherwise min_offset_diff just reflects the last rdev's offset diff. Suggested-by: NeilBrown Signed-off-by: Guoqing Jiang Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0f13d57ef646..e055ec94b9a8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3769,6 +3769,7 @@ static int raid10_run(struct mddev *mddev) if (blk_queue_discard(bdev_get_queue(rdev->bdev))) discard_supported = true; + first = 0; } if (mddev->queue) { @@ -4172,6 +4173,7 @@ static int raid10_start_reshape(struct mddev *mddev) if (first || diff < min_offset_diff) min_offset_diff = diff; } + first = 0; } if (max(before_length, after_length) > min_offset_diff) From 065e519e71b2c1f41936cce75b46b5ab34adb588 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 6 Apr 2017 11:16:33 +0800 Subject: [PATCH 58/84] md: MD_CLOSING needs to be cleared after calling md_set_readonly or do_md_stop If md_set_readonly is called with the MD_CLOSING bit set, the mddev cannot be opened any more because the MD_CLOSING bit was never cleared. Thus it needs to be cleared in md_ioctl after any call to md_set_readonly() or do_md_stop(). Signed-off-by: NeilBrown Fixes: af8d8e6f0315 ("md: changes for MD_STILL_CLOSED flag") Cc: stable@vger.kernel.org (v4.9+) Signed-off-by: Zhilong Liu Signed-off-by: Shaohua Li --- drivers/md/md.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1db88d79549a..a612b9f83069 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6925,6 +6925,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, void __user *argp = (void __user *)arg; struct mddev *mddev = NULL; int ro; + bool did_set_md_closing = false; if (!md_ioctl_valid(cmd)) return -ENOTTY; @@ -7014,7 +7015,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto out; } + WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags)); set_bit(MD_CLOSING, &mddev->flags); + did_set_md_closing = true; mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } @@ -7207,6 +7210,8 @@ unlock: mddev->hold_active = 0; mddev_unlock(mddev); out: + if(did_set_md_closing) + clear_bit(MD_CLOSING, &mddev->flags); return err; } #ifdef CONFIG_COMPAT From b670883bb9e55ba63a278d83e034faefc01ce2cf Mon Sep 17 00:00:00 2001 From: Zhilong Liu Date: Mon, 10 Apr 2017 14:15:55 +0800 Subject: [PATCH 59/84] md.c: didn't unlock the mddev before returning EINVAL in array_size_store md.c: array_size_store() needs to release the mddev lock before it returns.
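For illustration, a minimal userspace sketch of the rule this fix enforces (invented names; a pthread mutex stands in for the mddev lock): every return taken after the lock has been acquired must drop it, including the early -EINVAL path.

/*
 * Userspace sketch of the error-path unlock pattern; not the md code.
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;
static bool clustered = true;            /* stands in for mddev_is_clustered() */

static int store_size(long new_size)
{
        pthread_mutex_lock(&dev_lock);
        if (clustered) {
                /* the unlock that was missing on this early return */
                pthread_mutex_unlock(&dev_lock);
                return -EINVAL;
        }
        printf("size set to %ld\n", new_size);
        pthread_mutex_unlock(&dev_lock);
        return 0;
}

int main(void)
{
        printf("store_size -> %d\n", store_size(1024));
        return 0;
}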
Fixes: ab5a98b132fd ("md-cluster: change array_sectors and update size are not supported") Signed-off-by: Zhilong Liu Reviewed-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a612b9f83069..5022be183390 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4950,8 +4950,10 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) return err; /* cluster raid doesn't support change array_sectors */ - if (mddev_is_clustered(mddev)) + if (mddev_is_clustered(mddev)) { + mddev_unlock(mddev); return -EINVAL; + } if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) From 0c9d5b127f695818c2c5a3868c1f28ca2969e905 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 6 Apr 2017 12:06:37 +1000 Subject: [PATCH 60/84] md/raid1: avoid reusing a resync bio after error handling. fix_sync_read_error() modifies a bio on a newly faulty device by setting bi_end_io to end_sync_write. This ensure that put_buf() will still call rdev_dec_pending() as required, but makes sure that subsequent code in fix_sync_read_error() doesn't try to read from the device. Unfortunately this interacts badly with sync_request_write() which assumes that any bio with bi_end_io set to non-NULL other than end_sync_read is safe to write to. As the device is now faulty it doesn't make sense to write. As the bio was recently used for a read, it is "dirty" and not suitable for immediate submission. In particular, ->bi_next might be non-NULL, which will cause generic_make_request() to complain. Break this interaction by refusing to write to devices which are marked as Faulty. Reported-and-tested-by: Michael Wang Fixes: 2e52d449bcec ("md/raid1: add failfast handling for reads.") Cc: stable@vger.kernel.org (v4.10+) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7d6723558fd8..70278b9a3c43 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2218,6 +2218,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) (i == r1_bio->read_disk || !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) continue; + if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) + continue; bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) From 94568f64af50bb37c418b200449698cfe7e1da5f Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 4 Apr 2017 13:13:56 +0200 Subject: [PATCH 61/84] raid5-ppl: move no_mem_stripes to struct ppl_conf Use a single no_mem_stripes list instead of per member device lists for handling stripes that need retrying in case of failed io_unit allocation. Because io_units are allocated from a memory pool shared between all member disks, the no_mem_stripes list should be checked when an io_unit for any member is freed. This fixes a deadlock that could happen if there are stripes in more than one no_mem_stripes list. 
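The shape of that change, reduced to a standalone userspace sketch with invented names: one retry list shared by every user of the common pool, consulted whenever any unit is returned, rather than a per-log list that only its own completions would ever drain.

/*
 * Userspace sketch of a shared pool plus a single shared retry list.
 */
#include <pthread.h>
#include <stdio.h>

#define POOL_SIZE 2

struct work { int id; struct work *next; };

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static int pool_free = POOL_SIZE;              /* shared "io_unit" pool        */

static pthread_mutex_t retry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct work *retry_head;                /* single shared no_mem list    */

static int try_start(struct work *w)
{
        pthread_mutex_lock(&pool_lock);
        if (pool_free == 0) {
                pthread_mutex_unlock(&pool_lock);
                pthread_mutex_lock(&retry_lock);
                w->next = retry_head;          /* park it on the shared list   */
                retry_head = w;
                pthread_mutex_unlock(&retry_lock);
                return 0;
        }
        pool_free--;
        pthread_mutex_unlock(&pool_lock);
        return 1;
}

static void finish_one(void)
{
        struct work *w = NULL;

        pthread_mutex_lock(&pool_lock);
        pool_free++;                           /* unit goes back to the pool   */
        pthread_mutex_unlock(&pool_lock);

        pthread_mutex_lock(&retry_lock);
        if (retry_head) {                      /* any waiter, whichever log it */
                w = retry_head;                /* came from, may now retry     */
                retry_head = w->next;
        }
        pthread_mutex_unlock(&retry_lock);
        if (w)
                printf("retrying work %d\n", w->id);
}

int main(void)
{
        struct work a = { 1 }, b = { 2 }, c = { 3 };

        try_start(&a);
        try_start(&b);
        try_start(&c);                         /* pool exhausted: c is parked  */
        finish_one();                          /* frees a unit, retries c      */
        return 0;
}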
Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 86ea9addb51a..355cf3581ef8 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -107,6 +107,10 @@ struct ppl_conf { /* used only for recovery */ int recovered_entries; int mismatch_count; + + /* stripes to retry if failed to allocate io_unit */ + struct list_head no_mem_stripes; + spinlock_t no_mem_stripes_lock; }; struct ppl_log { @@ -119,8 +123,6 @@ struct ppl_log { * always at the end of io_list */ spinlock_t io_list_lock; struct list_head io_list; /* all io_units of this log */ - struct list_head no_mem_stripes;/* stripes to retry if failed to - * allocate io_unit */ }; #define PPL_IO_INLINE_BVECS 32 @@ -347,9 +349,9 @@ int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh) atomic_inc(&sh->count); if (ppl_log_stripe(log, sh)) { - spin_lock_irq(&log->io_list_lock); - list_add_tail(&sh->log_list, &log->no_mem_stripes); - spin_unlock_irq(&log->io_list_lock); + spin_lock_irq(&ppl_conf->no_mem_stripes_lock); + list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes); + spin_unlock_irq(&ppl_conf->no_mem_stripes_lock); } mutex_unlock(&log->io_mutex); @@ -492,25 +494,32 @@ void ppl_write_stripe_run(struct r5conf *conf) static void ppl_io_unit_finished(struct ppl_io_unit *io) { struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; unsigned long flags; pr_debug("%s: seq: %llu\n", __func__, io->seq); - spin_lock_irqsave(&log->io_list_lock, flags); + local_irq_save(flags); + spin_lock(&log->io_list_lock); list_del(&io->log_sibling); - mempool_free(io, log->ppl_conf->io_pool); + spin_unlock(&log->io_list_lock); - if (!list_empty(&log->no_mem_stripes)) { - struct stripe_head *sh = list_first_entry(&log->no_mem_stripes, - struct stripe_head, - log_list); + mempool_free(io, ppl_conf->io_pool); + + spin_lock(&ppl_conf->no_mem_stripes_lock); + if (!list_empty(&ppl_conf->no_mem_stripes)) { + struct stripe_head *sh; + + sh = list_first_entry(&ppl_conf->no_mem_stripes, + struct stripe_head, log_list); list_del_init(&sh->log_list); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); } + spin_unlock(&ppl_conf->no_mem_stripes_lock); - spin_unlock_irqrestore(&log->io_list_lock, flags); + local_irq_restore(flags); } void ppl_stripe_write_finished(struct stripe_head *sh) @@ -1135,6 +1144,8 @@ int ppl_init_log(struct r5conf *conf) } atomic64_set(&ppl_conf->seq, 0); + INIT_LIST_HEAD(&ppl_conf->no_mem_stripes); + spin_lock_init(&ppl_conf->no_mem_stripes_lock); if (!mddev->external) { ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); @@ -1150,7 +1161,6 @@ int ppl_init_log(struct r5conf *conf) mutex_init(&log->io_mutex); spin_lock_init(&log->io_list_lock); INIT_LIST_HEAD(&log->io_list); - INIT_LIST_HEAD(&log->no_mem_stripes); log->ppl_conf = ppl_conf; log->rdev = rdev; From 845b9e229fe0716ab6b4d94b4364c99069667b59 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 4 Apr 2017 13:13:57 +0200 Subject: [PATCH 62/84] raid5-ppl: use resize_stripes() when enabling or disabling ppl Use resize_stripes() instead of raid5_reset_stripe_cache() to allocate or free sh->ppl_page at runtime for all stripes in the stripe cache. raid5_reset_stripe_cache() required suspending the mddev and could deadlock because of GFP_KERNEL allocations. 
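The allocation pattern this moves into alloc_stripe() (described further below) can be sketched in standalone userspace C with invented names: allocate the optional per-stripe page together with the stripe, and roll the whole allocation back if that page cannot be obtained.

/*
 * Userspace sketch of "allocate optional member, roll back on failure".
 */
#include <stdio.h>
#include <stdlib.h>

struct stripe { void *ppl_page; };

static struct stripe *alloc_stripe(int want_ppl)
{
        struct stripe *sh = calloc(1, sizeof(*sh));

        if (sh && want_ppl) {
                sh->ppl_page = malloc(4096);
                if (!sh->ppl_page) {           /* never return a half-built object */
                        free(sh);
                        sh = NULL;
                }
        }
        return sh;
}

static void free_stripe(struct stripe *sh)
{
        if (sh) {
                free(sh->ppl_page);            /* free(NULL) is a no-op */
                free(sh);
        }
}

int main(void)
{
        struct stripe *sh = alloc_stripe(1);

        printf("allocated %s ppl page\n", sh && sh->ppl_page ? "with" : "without");
        free_stripe(sh);
        return 0;
}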
Move the 'newsize' check to check_reshape() to allow reallocating the stripes with the same number of disks. Allocate sh->ppl_page in alloc_stripe() instead of grow_buffers(). Pass 'struct r5conf *conf' as a parameter to alloc_stripe() because it is needed to check whether to allocate ppl_page. Add free_stripe() and use it to free stripes rather than directly call kmem_cache_free(). Also free sh->ppl_page in free_stripe(). Set MD_HAS_PPL at the end of ppl_init_log() instead of explicitly setting it in advance and add another parameter to log_init() to allow calling ppl_init_log() without the bit set. Don't try to calculate partial parity or add a stripe to log if it does not have ppl_page set. Enabling ppl can now be performed without suspending the mddev, because the log won't be used until new stripes are allocated with ppl_page. Calling mddev_suspend/resume is still necessary when disabling ppl, because we want all stripes to finish before stopping the log, but resize_stripes() can be called after mddev_resume() when ppl is no longer active. Suggested-by: NeilBrown Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-log.h | 5 ++- drivers/md/raid5-ppl.c | 3 +- drivers/md/raid5.c | 88 ++++++++++++++++++------------------------ 3 files changed, 43 insertions(+), 53 deletions(-) diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 738930ff5d17..27097101ccca 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -93,11 +93,12 @@ static inline void log_exit(struct r5conf *conf) ppl_exit_log(conf); } -static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev) +static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev, + bool ppl) { if (journal_dev) return r5l_init_log(conf, journal_dev); - else if (raid5_has_ppl(conf)) + else if (ppl) return ppl_init_log(conf); return 0; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 355cf3581ef8..71968cf47d7d 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -328,7 +328,7 @@ int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh) struct ppl_io_unit *io = sh->ppl_io; struct ppl_log *log; - if (io || test_bit(STRIPE_SYNCING, &sh->state) || + if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) { clear_bit(STRIPE_LOG_TRAPPED, &sh->state); @@ -1204,6 +1204,7 @@ int ppl_init_log(struct r5conf *conf) } conf->log_private = ppl_conf; + set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); return 0; err: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 09d94ad5e52b..e04d7b11bc87 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -471,11 +471,6 @@ static void shrink_buffers(struct stripe_head *sh) sh->dev[i].page = NULL; put_page(p); } - - if (sh->ppl_page) { - put_page(sh->ppl_page); - sh->ppl_page = NULL; - } } static int grow_buffers(struct stripe_head *sh, gfp_t gfp) @@ -493,12 +488,6 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) sh->dev[i].orig_page = page; } - if (raid5_has_ppl(sh->raid_conf)) { - sh->ppl_page = alloc_page(gfp); - if (!sh->ppl_page) - return 1; - } - return 0; } @@ -2132,8 +2121,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } +static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) +{ + if (sh->ppl_page) + __free_page(sh->ppl_page); + kmem_cache_free(sc, sh); +} + static struct stripe_head 
*alloc_stripe(struct kmem_cache *sc, gfp_t gfp, - int disks) + int disks, struct r5conf *conf) { struct stripe_head *sh; int i; @@ -2147,6 +2143,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, INIT_LIST_HEAD(&sh->r5c); INIT_LIST_HEAD(&sh->log_list); atomic_set(&sh->count, 1); + sh->raid_conf = conf; sh->log_start = MaxSector; for (i = 0; i < disks; i++) { struct r5dev *dev = &sh->dev[i]; @@ -2154,6 +2151,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, bio_init(&dev->req, &dev->vec, 1); bio_init(&dev->rreq, &dev->rvec, 1); } + + if (raid5_has_ppl(conf)) { + sh->ppl_page = alloc_page(gfp); + if (!sh->ppl_page) { + free_stripe(sc, sh); + sh = NULL; + } + } } return sh; } @@ -2161,15 +2166,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) { struct stripe_head *sh; - sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); + sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); if (!sh) return 0; - sh->raid_conf = conf; - if (grow_buffers(sh, gfp)) { shrink_buffers(sh); - kmem_cache_free(conf->slab_cache, sh); + free_stripe(conf->slab_cache, sh); return 0; } sh->hash_lock_index = @@ -2314,9 +2317,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) int i; int hash, cnt; - if (newsize <= conf->pool_size) - return 0; /* never bother to shrink */ - err = md_allow_write(conf->mddev); if (err) return err; @@ -2332,11 +2332,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) mutex_lock(&conf->cache_size_mutex); for (i = conf->max_nr_stripes; i; i--) { - nsh = alloc_stripe(sc, GFP_KERNEL, newsize); + nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); if (!nsh) break; - nsh->raid_conf = conf; list_add(&nsh->lru, &newstripes); } if (i) { @@ -2344,7 +2343,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) while (!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); list_del(&nsh->lru); - kmem_cache_free(sc, nsh); + free_stripe(sc, nsh); } kmem_cache_destroy(sc); mutex_unlock(&conf->cache_size_mutex); @@ -2370,7 +2369,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) nsh->dev[i].orig_page = osh->dev[i].page; } nsh->hash_lock_index = hash; - kmem_cache_free(conf->slab_cache, osh); + free_stripe(conf->slab_cache, osh); cnt++; if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { @@ -2447,7 +2446,7 @@ static int drop_one_stripe(struct r5conf *conf) return 0; BUG_ON(atomic_read(&sh->count)); shrink_buffers(sh); - kmem_cache_free(conf->slab_cache, sh); + free_stripe(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); conf->max_nr_stripes--; return 1; @@ -3170,7 +3169,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } - if (raid5_has_ppl(sh->raid_conf) && + if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && !test_bit(STRIPE_FULL_WRITE, &sh->state) && test_bit(R5_Insync, &sh->dev[pd_idx].flags)) @@ -7427,7 +7426,7 @@ static int raid5_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); } - if (log_init(conf, journal_dev)) + if (log_init(conf, journal_dev, raid5_has_ppl(conf))) goto abort; return 0; @@ -7636,7 +7635,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * The array is in readonly mode if journal is missing, so no * write requests running. 
We should be safe */ - log_init(conf, rdev); + log_init(conf, rdev, false); return 0; } if (mddev->recovery_disabled == conf->recovery_disabled) @@ -7786,6 +7785,9 @@ static int check_reshape(struct mddev *mddev) mddev->chunk_sectors) ) < 0) return -ENOMEM; + + if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) + return 0; /* never bother to shrink */ return resize_stripes(conf, (conf->previous_raid_disks + mddev->delta_disks)); } @@ -8276,20 +8278,6 @@ static void *raid6_takeover(struct mddev *mddev) return setup_conf(mddev); } -static void raid5_reset_stripe_cache(struct mddev *mddev) -{ - struct r5conf *conf = mddev->private; - - mutex_lock(&conf->cache_size_mutex); - while (conf->max_nr_stripes && - drop_one_stripe(conf)) - ; - while (conf->min_nr_stripes > conf->max_nr_stripes && - grow_one_stripe(conf, GFP_KERNEL)) - ; - mutex_unlock(&conf->cache_size_mutex); -} - static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) { struct r5conf *conf; @@ -8304,23 +8292,23 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) return -ENODEV; } - if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) { + if (strncmp(buf, "ppl", 3) == 0) { /* ppl only works with RAID 5 */ - if (conf->level == 5) { - mddev_suspend(mddev); - set_bit(MD_HAS_PPL, &mddev->flags); - err = log_init(conf, NULL); - if (!err) - raid5_reset_stripe_cache(mddev); - mddev_resume(mddev); + if (!raid5_has_ppl(conf) && conf->level == 5) { + err = log_init(conf, NULL, true); + if (!err) { + err = resize_stripes(conf, conf->pool_size); + if (err) + log_exit(conf); + } } else err = -EINVAL; } else if (strncmp(buf, "resync", 6) == 0) { if (raid5_has_ppl(conf)) { mddev_suspend(mddev); log_exit(conf); - raid5_reset_stripe_cache(mddev); mddev_resume(mddev); + err = resize_stripes(conf, conf->pool_size); } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && r5l_log_disk_error(conf)) { bool journal_dev_exists = false; From ae1713e296449caf820635d384a99936ce281a71 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 4 Apr 2017 13:13:58 +0200 Subject: [PATCH 63/84] raid5-ppl: partial parity calculation optimization In case of read-modify-write, partial partity is the same as the result of ops_run_prexor5(), so we can just copy sh->dev[pd_idx].page into sh->ppl_page instead of calculating it again. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 20 ++++++++++---------- drivers/md/raid5.c | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 71968cf47d7d..4eb0ebcf9c29 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -153,7 +153,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - struct page **xor_srcs = flex_array_get(percpu->scribble, 0); + struct page **srcs = flex_array_get(percpu->scribble, 0); int count = 0, pd_idx = sh->pd_idx, i; struct async_submit_ctl submit; @@ -166,18 +166,18 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, * differently. 
*/ if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { - /* rmw: xor old data and parity from updated disks */ - for (i = disks; i--;) { - struct r5dev *dev = &sh->dev[i]; - if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx) - xor_srcs[count++] = dev->page; - } + /* + * rmw: xor old data and parity from updated disks + * This is calculated earlier by ops_run_prexor5() so just copy + * the parity dev page. + */ + srcs[count++] = sh->dev[pd_idx].page; } else if (sh->reconstruct_state == reconstruct_state_drain_run) { /* rcw: xor data from all not updated disks */ for (i = disks; i--;) { struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_UPTODATE, &dev->flags)) - xor_srcs[count++] = dev->page; + srcs[count++] = dev->page; } } else { return tx; @@ -188,10 +188,10 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, + sizeof(struct page *) * (sh->disks + 2)); if (count == 1) - tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE, + tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE, &submit); else - tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE, + tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE, &submit); return tx; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e04d7b11bc87..f3692ff4262b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2079,9 +2079,6 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) async_tx_ack(tx); } - if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) - tx = ops_run_partial_parity(sh, percpu, tx); - if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { if (level < 6) tx = ops_run_prexor5(sh, percpu, tx); @@ -2089,6 +2086,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) tx = ops_run_prexor6(sh, percpu, tx); } + if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) + tx = ops_run_partial_parity(sh, percpu, tx); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); overlap_clear++; From c230e7e53526c223a3e1caf40747d6e37c0e4394 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:50 +1000 Subject: [PATCH 64/84] md/raid1: simplify the splitting of requests. raid1 currently splits requests in two different ways for two different reasons. First, bio_split() is used to ensure the bio fits within a resync accounting region. Second, multiple r1bios are allocated for each bio to handle the possiblity of known bad blocks on some devices. This can be simplified to just use bio_split() once, and not use multiple r1bios. We delay the split until we know a maximum bio size that can be handled with a single r1bio, and then split the bio and queue the remainder for later handling. This avoids all loops inside raid1.c request handling. Just a single read, or a single set of writes, is submitted to lower-level devices for each bio that comes from generic_make_request(). When the bio needs to be split, generic_make_request() will do the necessary looping and call md_make_request() multiple times. raid1_make_request() no longer queues request for raid1 to handle, so we can remove that branch from the 'if'. This patch also creates a new private bio_set (conf->bio_split) for splitting bios. Using fs_bio_set is wrong, as it is meant to be used by filesystems, not block devices. Using it inside md can lead to deadlocks under high memory pressure. 
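The resulting control flow, reduced to a standalone userspace sketch with invented names: handle the largest prefix one r1bio can cover and hand the chained remainder back to the caller's loop, instead of looping inside the driver.

/*
 * Userspace sketch of "split once, let the caller resubmit the rest".
 */
#include <stdio.h>

struct request { long start, sectors; };

static long max_handled(const struct request *r)
{
        /* stand-in for read_balance()/bad-block limits: cap at 64 sectors */
        return r->sectors < 64 ? r->sectors : 64;
}

static void handle(struct request *r)
{
        printf("submit %ld sectors at %ld\n", r->sectors, r->start);
}

/* Caller-side loop, like generic_make_request() re-issuing the chained tail. */
static void make_request(struct request r)
{
        while (r.sectors) {
                long max = max_handled(&r);
                struct request front = { r.start, max };

                handle(&front);                /* one request, one "r1bio"     */
                r.start += max;                /* the tail is re-queued, not   */
                r.sectors -= max;              /* handled by an internal loop  */
        }
}

int main(void)
{
        make_request((struct request){ 0, 200 });
        return 0;
}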
Delete unused variable in raid1_write_request() (Shaohua) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 141 ++++++++++++++++----------------------------- drivers/md/raid1.h | 2 + 2 files changed, 52 insertions(+), 91 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 70278b9a3c43..94f1d754aa09 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1202,7 +1202,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) return r1_bio; } -static void raid1_read_request(struct mddev *mddev, struct bio *bio) +static void raid1_read_request(struct mddev *mddev, struct bio *bio, + int max_read_sectors) { struct r1conf *conf = mddev->private; struct raid1_info *mirror; @@ -1211,7 +1212,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio) struct bitmap *bitmap = mddev->bitmap; const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); - int sectors_handled; int max_sectors; int rdisk; @@ -1222,12 +1222,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio) wait_read_barrier(conf, bio->bi_iter.bi_sector); r1_bio = alloc_r1bio(mddev, bio, 0); + r1_bio->sectors = max_read_sectors; /* * make_request() can abort the operation when read-ahead is being * used and no empty request is available. */ -read_again: rdisk = read_balance(conf, r1_bio, &max_sectors); if (rdisk < 0) { @@ -1247,11 +1247,20 @@ read_again: wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0); } + + if (max_sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, max_sectors, + GFP_NOIO, conf->bio_split); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + r1_bio->master_bio = bio; + r1_bio->sectors = max_sectors; + } + r1_bio->read_disk = rdisk; read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, - max_sectors); r1_bio->bios[rdisk] = read_bio; @@ -1270,30 +1279,11 @@ read_again: read_bio, disk_devt(mddev->gendisk), r1_bio->sector); - if (max_sectors < r1_bio->sectors) { - /* - * could not read all from this device, so we will need another - * r1_bio. - */ - sectors_handled = (r1_bio->sector + max_sectors - - bio->bi_iter.bi_sector); - r1_bio->sectors = max_sectors; - bio_inc_remaining(bio); - - /* - * Cannot call generic_make_request directly as that will be - * queued in __make_request and subsequent mempool_alloc might - * block waiting for it. So hand bio over to raid1d. 
- */ - reschedule_retry(r1_bio); - - r1_bio = alloc_r1bio(mddev, bio, sectors_handled); - goto read_again; - } else - generic_make_request(read_bio); + generic_make_request(read_bio); } -static void raid1_write_request(struct mddev *mddev, struct bio *bio) +static void raid1_write_request(struct mddev *mddev, struct bio *bio, + int max_write_sectors) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; @@ -1304,9 +1294,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; int first_clone; - int sectors_handled; int max_sectors; - sector_t offset; /* * Register the new request and wait if the reconstruction @@ -1345,6 +1333,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) wait_barrier(conf, bio->bi_iter.bi_sector); r1_bio = alloc_r1bio(mddev, bio, 0); + r1_bio->sectors = max_write_sectors; if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); @@ -1443,17 +1432,21 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) goto retry_write; } - if (max_sectors < r1_bio->sectors) + if (max_sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, max_sectors, + GFP_NOIO, conf->bio_split); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; - - sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector; + } atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); first_clone = 1; - offset = r1_bio->sector - bio->bi_iter.bi_sector; for (i = 0; i < disks; i++) { struct bio *mbio = NULL; if (!r1_bio->bios[i]) @@ -1470,7 +1463,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { mbio = alloc_behind_master_bio(r1_bio, bio, - offset << 9, + 0, max_sectors << 9); } @@ -1486,10 +1479,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) mbio = bio_clone_fast(r1_bio->behind_master_bio, GFP_NOIO, mddev->bio_set); - else { + else mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, offset, max_sectors); - } } if (r1_bio->behind_master_bio) { @@ -1536,19 +1527,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) if (!plug) md_wakeup_thread(mddev->thread); } - /* Mustn't call r1_bio_write_done before this next test, - * as it could result in the bio being freed. 
- */ - if (sectors_handled < bio_sectors(bio)) { - /* We need another r1_bio, which must be counted */ - sector_t sect = bio->bi_iter.bi_sector + sectors_handled; - - inc_pending(conf, sect); - bio_inc_remaining(bio); - r1_bio_write_done(r1_bio); - r1_bio = alloc_r1bio(mddev, bio, sectors_handled); - goto retry_write; - } r1_bio_write_done(r1_bio); @@ -1558,7 +1536,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio) static void raid1_make_request(struct mddev *mddev, struct bio *bio) { - struct bio *split; sector_t sectors; if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { @@ -1566,43 +1543,20 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio) return; } - /* if bio exceeds barrier unit boundary, split it */ - do { - sectors = align_to_barrier_unit_end( - bio->bi_iter.bi_sector, bio_sectors(bio)); - if (sectors < bio_sectors(bio)) { - split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); - bio_chain(split, bio); - } else { - split = bio; - } + /* + * There is a limit to the maximum size, but + * the read/write handler might find a lower limit + * due to bad blocks. To avoid multiple splits, + * we pass the maximum number of sectors down + * and let the lower level perform the split. + */ + sectors = align_to_barrier_unit_end( + bio->bi_iter.bi_sector, bio_sectors(bio)); - if (bio_data_dir(split) == READ) { - raid1_read_request(mddev, split); - - /* - * If a bio is splitted, the first part of bio will - * pass barrier but the bio is queued in - * current->bio_list (see generic_make_request). If - * there is a raise_barrier() called here, the second - * part of bio can't pass barrier. But since the first - * part bio isn't dispatched to underlaying disks yet, - * the barrier is never released, hence raise_barrier - * will alays wait. We have a deadlock. - * Note, this only happens in read path. For write - * path, the first part of bio is dispatched in a - * schedule() call (because of blk plug) or offloaded - * to raid10d. - * Quitting from the function immediately can change - * the bio order queued in bio_list and avoid the deadlock. 
- */ - if (split != bio) { - generic_make_request(bio); - break; - } - } else - raid1_write_request(mddev, split); - } while (split != bio); + if (bio_data_dir(bio) == READ) + raid1_read_request(mddev, bio, sectors); + else + raid1_write_request(mddev, bio, sectors); } static void raid1_status(struct seq_file *seq, struct mddev *mddev) @@ -2647,10 +2601,7 @@ static void raid1d(struct md_thread *thread) else if (test_bit(R1BIO_ReadError, &r1_bio->state)) handle_read_error(conf, r1_bio); else - /* just a partial read to be scheduled from separate - * context - */ - generic_make_request(r1_bio->bios[r1_bio->read_disk]); + WARN_ON_ONCE(1); cond_resched(); if (mddev->sb_flags & ~(1<r1bio_pool) goto abort; + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!conf->bio_split) + goto abort; + conf->poolinfo->mddev = mddev; err = -EINVAL; @@ -3119,6 +3074,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) kfree(conf->nr_waiting); kfree(conf->nr_queued); kfree(conf->barrier); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf); } return ERR_PTR(err); @@ -3224,6 +3181,8 @@ static void raid1_free(struct mddev *mddev, void *priv) kfree(conf->nr_waiting); kfree(conf->nr_queued); kfree(conf->barrier); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf); } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 4271cd7ac2de..b0ab0da6e39e 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -107,6 +107,8 @@ struct r1conf { mempool_t *r1bio_pool; mempool_t *r1buf_pool; + struct bio_set *bio_split; + /* temporary buffer to synchronous IO when attempting to repair * a read error. */ From cb83efcfd26a28b76eef8815a41158c1896fc5ba Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:50 +1000 Subject: [PATCH 65/84] md/raid1: simplify alloc_behind_master_bio() Now that we always always pass an offset of 0 and a size that matches the bio to alloc_behind_master_bio(), we can remove the offset/size args and simplify the code. We could probably remove bio_copy_data_partial() too. 
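A trivial standalone sketch (invented names) of why the partial helper becomes redundant: once every caller copies the whole source from offset 0, the (offset, size) interface collapses into a plain full copy.

/*
 * Userspace sketch: a partial-copy interface degenerating into a full copy.
 */
#include <stdio.h>
#include <string.h>

static void copy_partial(char *dst, const char *src, size_t off, size_t size)
{
        memcpy(dst, src + off, size);          /* old interface: offset + size */
}

static void copy_full(char *dst, const char *src, size_t size)
{
        copy_partial(dst, src, 0, size);       /* every caller now does this   */
}

int main(void)
{
        char src[] = "behind-write data", dst[sizeof(src)];

        copy_full(dst, src, sizeof(src));
        printf("%s\n", dst);
        return 0;
}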
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 94f1d754aa09..18af00c86b42 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1091,9 +1091,9 @@ static void unfreeze_array(struct r1conf *conf) } static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, - struct bio *bio, - int offset, int size) + struct bio *bio) { + int size = bio->bi_iter.bi_size; unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; int i = 0; struct bio *behind_bio = NULL; @@ -1120,8 +1120,7 @@ static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio, i++; } - bio_copy_data_partial(behind_bio, bio, offset, - behind_bio->bi_iter.bi_size); + bio_copy_data(behind_bio, bio); skip_copy: r1_bio->behind_master_bio = behind_bio;; set_bit(R1BIO_BehindIO, &r1_bio->state); @@ -1462,9 +1461,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { - mbio = alloc_behind_master_bio(r1_bio, bio, - 0, - max_sectors << 9); + mbio = alloc_behind_master_bio(r1_bio, bio); } bitmap_startwrite(bitmap, r1_bio->sector, From 50512625da06c41517cb596f51b923ce15f401a4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:50 +1000 Subject: [PATCH 66/84] Revert "block: introduce bio_copy_data_partial" This reverts commit 6f8802852f7e58a12177a86179803b9efaad98e2. bio_copy_data_partial() is no longer needed. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- block/bio.c | 60 ++++++++++----------------------------------- include/linux/bio.h | 2 -- 2 files changed, 13 insertions(+), 49 deletions(-) diff --git a/block/bio.c b/block/bio.c index 036435995c55..12c2837c4277 100644 --- a/block/bio.c +++ b/block/bio.c @@ -990,8 +990,19 @@ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_alloc_pages); -static void __bio_copy_data(struct bio *dst, struct bio *src, - int offset, int size) +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) { struct bvec_iter src_iter, dst_iter; struct bio_vec src_bv, dst_bv; @@ -1001,12 +1012,6 @@ static void __bio_copy_data(struct bio *dst, struct bio *src, src_iter = src->bi_iter; dst_iter = dst->bi_iter; - /* for supporting partial copy */ - if (offset || size != src->bi_iter.bi_size) { - bio_advance_iter(src, &src_iter, offset); - src_iter.bi_size = size; - } - while (1) { if (!src_iter.bi_size) { src = src->bi_next; @@ -1043,47 +1048,8 @@ static void __bio_copy_data(struct bio *dst, struct bio *src, bio_advance_iter(dst, &dst_iter, bytes); } } - -/** - * bio_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats - * @src and @dst as linked lists of bios. 
- * - * Stops when it reaches the end of either @src or @dst - that is, copies - * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). - */ -void bio_copy_data_partial(struct bio *dst, struct bio *src, - int offset, int size) -{ - __bio_copy_data(dst, src, offset, size); - -} -EXPORT_SYMBOL(bio_copy_data_partial); - struct bio_map_data { int is_our_pages; struct iov_iter iter; diff --git a/include/linux/bio.h b/include/linux/bio.h index fafef6343d1b..7cf8a6c70a3f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -461,8 +461,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); -extern void bio_copy_data_partial(struct bio *dst, struct bio *src, - int offset, int size); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern void bio_free_pages(struct bio *bio); From 689389a06ce79fdced85b5115717f71c71e623e0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:50 +1000 Subject: [PATCH 67/84] md/raid1: simplify handle_read_error(). handle_read_error() duplicates a lot of the work that raid1_read_request() does, so it makes sense to just use that function. This doesn't quite work as handle_read_error() relies on the same r1bio being re-used so that, in the case of a read-only array, setting IO_BLOCKED in r1bio->bios[] ensures read_balance() won't re-use that device. So we need to allow an r1bio to be passed to raid1_read_request(), and to have that function mostly initialise the r1bio, but leave the bios[] array untouched. The two parts of handle_read_error() that need to be preserved are the warning messages it prints, so they are conditionally added to raid1_read_request(). Note that this highlights a minor bug in alloc_r1bio(). It doesn't initialise the bios[] array, so it is possible that old content is there, which might cause read_balance() to ignore some devices for no good reason. With this change, we no longer need inc_pending(), or the sectors_handled arg to alloc_r1bio(). As handle_read_error() is called from raid1d() and allocates memory, there is a tiny chance of a deadlock. All elements of various pools could be queued waiting for raid1 to handle them, and there may be no extra memory free. Achieving guaranteed forward progress would probably require a second thread and another mempool. Instead of that complexity, add __GFP_HIGH to any allocations when raid1_read_request() is called from raid1d.
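The reuse idea can be sketched in standalone C (invented names; malloc stands in for the mempool): only a fresh allocation wipes the per-device markings, while the retry path keeps them so blocked devices stay blocked.

/*
 * Userspace sketch of "allocate fresh on the normal path, reuse on retry".
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rbio { int state; char bios[4]; };     /* bios[] stands in for IO_BLOCKED marks */

static struct rbio *read_request(struct rbio *retry)
{
        struct rbio *r = retry;

        if (!r) {
                /* normal path: fresh allocation, wipe any stale markings */
                r = malloc(sizeof(*r));
                if (!r)
                        return NULL;
                memset(r->bios, 0, sizeof(r->bios));
        }
        /* retry path: keep r->bios[] so blocked devices stay blocked;    */
        /* this is also where stricter allocation flags would be relaxed. */
        r->state = 0;
        return r;
}

int main(void)
{
        struct rbio *r = read_request(NULL);   /* first submission          */

        if (!r)
                return 1;
        r->bios[0] = 1;                        /* mark device 0 as blocked  */
        read_request(r);                       /* error retry reuses it     */
        printf("device 0 blocked: %d\n", r->bios[0]);
        free(r);
        return 0;
}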
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 140 +++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 80 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 18af00c86b42..29a9aa9254c3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -988,16 +988,6 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) spin_unlock_irq(&conf->resync_lock); } -static void inc_pending(struct r1conf *conf, sector_t bi_sector) -{ - /* The current request requires multiple r1_bio, so - * we need to increment the pending count, and the corresponding - * window count. - */ - int idx = sector_to_idx(bi_sector); - atomic_inc(&conf->nr_pending[idx]); -} - static void wait_barrier(struct r1conf *conf, sector_t sector_nr) { int idx = sector_to_idx(sector_nr); @@ -1184,35 +1174,60 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } +static void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio) +{ + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio); + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector; +} + static inline struct r1bio * -alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) +alloc_r1bio(struct mddev *mddev, struct bio *bio) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; - + /* Ensure no bio records IO_BLOCKED */ + memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0])); + init_r1bio(r1_bio, mddev, bio); return r1_bio; } static void raid1_read_request(struct mddev *mddev, struct bio *bio, - int max_read_sectors) + int max_read_sectors, struct r1bio *r1_bio) { struct r1conf *conf = mddev->private; struct raid1_info *mirror; - struct r1bio *r1_bio; struct bio *read_bio; struct bitmap *bitmap = mddev->bitmap; const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); int max_sectors; int rdisk; + bool print_msg = !!r1_bio; + char b[BDEVNAME_SIZE]; + + /* + * If r1_bio is set, we are blocking the raid1d thread + * so there is a tiny risk of deadlock. So ask for + * emergency memory if needed. + */ + gfp_t gfp = r1_bio ? 
(GFP_NOIO | __GFP_HIGH) : GFP_NOIO; + + if (print_msg) { + /* Need to get the block device name carefully */ + struct md_rdev *rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev); + if (rdev) + bdevname(rdev->bdev, b); + else + strcpy(b, "???"); + rcu_read_unlock(); + } /* * Still need barrier for READ in case that whole @@ -1220,7 +1235,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, */ wait_read_barrier(conf, bio->bi_iter.bi_sector); - r1_bio = alloc_r1bio(mddev, bio, 0); + if (!r1_bio) + r1_bio = alloc_r1bio(mddev, bio); + else + init_r1bio(r1_bio, mddev, bio); r1_bio->sectors = max_read_sectors; /* @@ -1231,11 +1249,23 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (rdisk < 0) { /* couldn't find anywhere to read from */ + if (print_msg) { + pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", + mdname(mddev), + b, + (unsigned long long)r1_bio->sector); + } raid_end_bio_io(r1_bio); return; } mirror = conf->mirrors + rdisk; + if (print_msg) + pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", + mdname(mddev), + (unsigned long long)r1_bio->sector, + bdevname(mirror->rdev->bdev, b)); + if (test_bit(WriteMostly, &mirror->rdev->flags) && bitmap) { /* @@ -1249,7 +1279,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, conf->bio_split); + gfp, conf->bio_split); bio_chain(split, bio); generic_make_request(bio); bio = split; @@ -1259,7 +1289,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, r1_bio->read_disk = rdisk; - read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + read_bio = bio_clone_fast(bio, gfp, mddev->bio_set); r1_bio->bios[rdisk] = read_bio; @@ -1331,7 +1361,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } wait_barrier(conf, bio->bi_iter.bi_sector); - r1_bio = alloc_r1bio(mddev, bio, 0); + r1_bio = alloc_r1bio(mddev, bio); r1_bio->sectors = max_write_sectors; if (conf->pending_count >= max_queued_requests) { @@ -1551,7 +1581,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio) bio->bi_iter.bi_sector, bio_sectors(bio)); if (bio_data_dir(bio) == READ) - raid1_read_request(mddev, bio, sectors); + raid1_read_request(mddev, bio, sectors, NULL); else raid1_write_request(mddev, bio, sectors); } @@ -2443,11 +2473,8 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) { - int disk; - int max_sectors; struct mddev *mddev = conf->mddev; struct bio *bio; - char b[BDEVNAME_SIZE]; struct md_rdev *rdev; dev_t bio_dev; sector_t bio_sector; @@ -2463,7 +2490,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) */ bio = r1_bio->bios[r1_bio->read_disk]; - bdevname(bio->bi_bdev, b); bio_dev = bio->bi_bdev->bd_dev; bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector; bio_put(bio); @@ -2481,58 +2507,12 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) } rdev_dec_pending(rdev, conf->mddev); + allow_barrier(conf, r1_bio->sector); + bio = r1_bio->master_bio; -read_more: - disk = read_balance(conf, r1_bio, &max_sectors); - if (disk == -1) { - pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", - mdname(mddev), b, (unsigned long long)r1_bio->sector); 
- raid_end_bio_io(r1_bio); - } else { - const unsigned long do_sync - = r1_bio->master_bio->bi_opf & REQ_SYNC; - r1_bio->read_disk = disk; - bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, - mddev->bio_set); - bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - r1_bio->bios[r1_bio->read_disk] = bio; - rdev = conf->mirrors[disk].rdev; - pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", - mdname(mddev), - (unsigned long long)r1_bio->sector, - bdevname(rdev->bdev, b)); - bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_end_io = raid1_end_read_request; - bio_set_op_attrs(bio, REQ_OP_READ, do_sync); - if (test_bit(FailFast, &rdev->flags) && - test_bit(R1BIO_FailFast, &r1_bio->state)) - bio->bi_opf |= MD_FAILFAST; - bio->bi_private = r1_bio; - if (max_sectors < r1_bio->sectors) { - /* Drat - have to split this up more */ - struct bio *mbio = r1_bio->master_bio; - int sectors_handled = (r1_bio->sector + max_sectors - - mbio->bi_iter.bi_sector); - r1_bio->sectors = max_sectors; - bio_inc_remaining(mbio); - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, bio_dev, bio_sector); - generic_make_request(bio); - bio = NULL; - - r1_bio = alloc_r1bio(mddev, mbio, sectors_handled); - set_bit(R1BIO_ReadError, &r1_bio->state); - inc_pending(conf, r1_bio->sector); - - goto read_more; - } else { - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, bio_dev, bio_sector); - generic_make_request(bio); - } - } + /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */ + r1_bio->state = 0; + raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio); } static void raid1d(struct md_thread *thread) From 673ca68d93879b9ffbbed874c9e70ca6e37cab15 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:51 +1000 Subject: [PATCH 68/84] md/raid1: factor out flush_bio_list() flush_pending_writes() and raid1_unplug() each contain identical copies of a fairly large slab of code. So factor that out into new flush_bio_list() to simplify maintenance. 
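Reduced to a standalone sketch with invented names, the refactor is simply two callers sharing one copy of the submit loop.

/*
 * Userspace sketch of factoring a duplicated loop into one helper.
 */
#include <stdio.h>

struct node { int id; struct node *next; };

static void flush_list(struct node *n)
{
        /* the single copy of the loop both callers now share */
        for (; n; n = n->next)
                printf("submit %d\n", n->id);
}

static void flush_pending(struct node *pending)
{
        /* called from the daemon thread */
        flush_list(pending);
}

static void unplug(struct node *plugged)
{
        /* called when a block plug is released */
        flush_list(plugged);
}

int main(void)
{
        struct node b = { 2, NULL }, a = { 1, &b };

        flush_pending(&a);
        unplug(&b);
        return 0;
}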
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 66 ++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 29a9aa9254c3..57611f43ed6c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -787,6 +787,30 @@ static int raid1_congested(struct mddev *mddev, int bits) return ret; } +static void flush_bio_list(struct r1conf *conf, struct bio *bio) +{ + /* flush any pending bitmap writes to disk before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + struct md_rdev *rdev = (void*)bio->bi_bdev; + bio->bi_next = NULL; + bio->bi_bdev = rdev->bdev; + if (test_bit(Faulty, &rdev->flags)) { + bio->bi_error = -EIO; + bio_endio(bio); + } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio); + else + generic_make_request(bio); + bio = next; + } +} + static void flush_pending_writes(struct r1conf *conf) { /* Any writes that have been queued but are awaiting @@ -799,27 +823,7 @@ static void flush_pending_writes(struct r1conf *conf) bio = bio_list_get(&conf->pending_bio_list); conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); - /* flush any pending bitmap writes to - * disk before proceeding w/ I/O */ - bitmap_unplug(conf->mddev->bitmap); - wake_up(&conf->wait_barrier); - - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; - bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; - if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; - bio_endio(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) - /* Just ignore it */ - bio_endio(bio); - else - generic_make_request(bio); - bio = next; - } + flush_bio_list(conf, bio); } else spin_unlock_irq(&conf->device_lock); } @@ -1152,25 +1156,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. */ bio = bio_list_get(&plug->pending); - bitmap_unplug(mddev->bitmap); - wake_up(&conf->wait_barrier); - - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; - bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; - if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; - bio_endio(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) - /* Just ignore it */ - bio_endio(bio); - else - generic_make_request(bio); - bio = next; - } + flush_bio_list(conf, bio); kfree(plug); } From fc9977dd069e4f82fcacb262652117c488647319 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:51 +1000 Subject: [PATCH 69/84] md/raid10: simplify the splitting of requests. raid10 splits requests in two different ways for two different reasons. First, bio_split() is used to ensure the bio fits with a chunk. Second, multiple r10bio structures are allocated to represent the different sections that need to go to different devices, to avoid known bad blocks. This can be simplified to just use bio_split() once, and not to use multiple r10bios. 
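The chunk-boundary cap that makes a single split sufficient is just alignment arithmetic; a small standalone example with made-up numbers:

/*
 * Userspace sketch of limiting a request to the sectors left in its chunk.
 */
#include <stdio.h>

int main(void)
{
        long chunk_sects = 128;                /* chunk size in sectors, power of two */
        long chunk_mask  = chunk_sects - 1;
        long start       = 1000, len = 300;    /* request start and length            */

        /* sectors from 'start' up to the next chunk boundary */
        long to_boundary = chunk_sects - (start & chunk_mask);

        long issue = len > to_boundary ? to_boundary : len;
        printf("issue %ld sectors now, chain the remaining %ld\n",
               issue, len - issue);
        return 0;
}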
We delay the split until we know a maximum bio size that can be handled with a single r10bio, and then split the bio and queue the remainder for later handling. As with raid1, we allocate a new bio_set to help with the splitting. It is not correct to use fs_bio_set in a device driver. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 164 ++++++++++++++------------------------------ drivers/md/raid10.h | 1 + 2 files changed, 51 insertions(+), 114 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e055ec94b9a8..41845bae67be 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1127,7 +1127,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct bio *read_bio; const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); - int sectors_handled; int max_sectors; sector_t sectors; struct md_rdev *rdev; @@ -1140,7 +1139,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, */ wait_barrier(conf); - sectors = bio_sectors(bio); + sectors = r10_bio->sectors; while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { @@ -1157,17 +1156,23 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, wait_barrier(conf); } -read_again: rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { raid_end_bio_io(r10_bio); return; } + if (max_sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, max_sectors, + GFP_NOIO, conf->bio_split); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + r10_bio->master_bio = bio; + r10_bio->sectors = max_sectors; + } slot = r10_bio->read_slot; read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; @@ -1186,40 +1191,13 @@ read_again: trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), read_bio, disk_devt(mddev->gendisk), r10_bio->sector); - if (max_sectors < r10_bio->sectors) { - /* - * Could not read all from this device, so we will need another - * r10_bio. - */ - sectors_handled = (r10_bio->sector + max_sectors - - bio->bi_iter.bi_sector); - r10_bio->sectors = max_sectors; - inc_pending(conf); - bio_inc_remaining(bio); - /* - * Cannot call generic_make_request directly as that will be - * queued in __generic_make_request and subsequent - * mempool_alloc might block waiting for it. so hand bio over - * to raid10d. 
- */ - reschedule_retry(r10_bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio) - sectors_handled; - r10_bio->state = 0; - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); + generic_make_request(read_bio); return; } static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, struct bio *bio, bool replacement, - int n_copy, int max_sectors) + int n_copy) { const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); @@ -1243,7 +1221,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, rdev = conf->mirrors[devnum].rdev; mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); if (replacement) r10_bio->devs[n_copy].repl_bio = mbio; else @@ -1294,7 +1271,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int i; struct md_rdev *blocked_rdev; sector_t sectors; - int sectors_handled; int max_sectors; md_write_start(mddev, bio); @@ -1306,7 +1282,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, */ wait_barrier(conf); - sectors = bio_sectors(bio); + sectors = r10_bio->sectors; while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { @@ -1476,44 +1452,29 @@ retry_write: if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; - sectors_handled = r10_bio->sector + max_sectors - - bio->bi_iter.bi_sector; + + if (r10_bio->sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, r10_bio->sectors, + GFP_NOIO, conf->bio_split); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + r10_bio->master_bio = bio; + } atomic_set(&r10_bio->remaining, 1); bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); for (i = 0; i < conf->copies; i++) { if (r10_bio->devs[i].bio) - raid10_write_one_disk(mddev, r10_bio, bio, false, - i, max_sectors); + raid10_write_one_disk(mddev, r10_bio, bio, false, i); if (r10_bio->devs[i].repl_bio) - raid10_write_one_disk(mddev, r10_bio, bio, true, - i, max_sectors); - } - - /* Don't remove the bias on 'remaining' (one_write_done) until - * after checking if we need to go around again. 
- */ - - if (sectors_handled < bio_sectors(bio)) { - /* We need another r10_bio and it needs to be counted */ - inc_pending(conf); - bio_inc_remaining(bio); - one_write_done(r10_bio); - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio) - sectors_handled; - - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; - r10_bio->state = 0; - goto retry_write; + raid10_write_one_disk(mddev, r10_bio, bio, true, i); } one_write_done(r10_bio); } -static void __make_request(struct mddev *mddev, struct bio *bio) +static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; @@ -1521,7 +1482,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio) r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio); + r10_bio->sectors = sectors; r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; @@ -1538,54 +1499,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) struct r10conf *conf = mddev->private; sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); int chunk_sects = chunk_mask + 1; - - struct bio *split; + int sectors = bio_sectors(bio); if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } - do { - - /* - * If this request crosses a chunk boundary, we need to split - * it. - */ - if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + - bio_sectors(bio) > chunk_sects - && (conf->geo.near_copies < conf->geo.raid_disks - || conf->prev.near_copies < - conf->prev.raid_disks))) { - split = bio_split(bio, chunk_sects - - (bio->bi_iter.bi_sector & - (chunk_sects - 1)), - GFP_NOIO, fs_bio_set); - bio_chain(split, bio); - } else { - split = bio; - } - - /* - * If a bio is splitted, the first part of bio will pass - * barrier but the bio is queued in current->bio_list (see - * generic_make_request). If there is a raise_barrier() called - * here, the second part of bio can't pass barrier. But since - * the first part bio isn't dispatched to underlaying disks - * yet, the barrier is never released, hence raise_barrier will - * alays wait. We have a deadlock. - * Note, this only happens in read path. For write path, the - * first part of bio is dispatched in a schedule() call - * (because of blk plug) or offloaded to raid10d. - * Quitting from the function immediately can change the bio - * order queued in bio_list and avoid the deadlock. - */ - __make_request(mddev, split); - if (split != bio && bio_data_dir(bio) == READ) { - generic_make_request(bio); - break; - } - } while (split != bio); + /* + * If this request crosses a chunk boundary, we need to split + * it. 
+ */ + if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + + sectors > chunk_sects + && (conf->geo.near_copies < conf->geo.raid_disks + || conf->prev.near_copies < + conf->prev.raid_disks))) + sectors = chunk_sects - + (bio->bi_iter.bi_sector & + (chunk_sects - 1)); + __make_request(mddev, bio, sectors); /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); @@ -2873,13 +2806,8 @@ static void raid10d(struct md_thread *thread) recovery_request_write(mddev, r10_bio); else if (test_bit(R10BIO_ReadError, &r10_bio->state)) handle_read_error(mddev, r10_bio); - else { - /* just a partial read to be scheduled from a - * separate context - */ - int slot = r10_bio->read_slot; - generic_make_request(r10_bio->devs[slot].bio); - } + else + WARN_ON_ONCE(1); cond_resched(); if (mddev->sb_flags & ~(1<r10bio_pool) goto out; + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!conf->bio_split) + goto out; + calc_sectors(conf, mddev->dev_sectors); if (mddev->reshape_position == MaxSector) { conf->prev = conf->geo; @@ -3689,6 +3621,8 @@ static struct r10conf *setup_conf(struct mddev *mddev) mempool_destroy(conf->r10bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf); } return ERR_PTR(err); @@ -3899,6 +3833,8 @@ static void raid10_free(struct mddev *mddev, void *priv) kfree(conf->mirrors); kfree(conf->mirrors_old); kfree(conf->mirrors_new); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf); } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3162615e57bd..735ce1a3d260 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -82,6 +82,7 @@ struct r10conf { mempool_t *r10bio_pool; mempool_t *r10buf_pool; struct page *tmppage; + struct bio_set *bio_split; /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. From 545250f2480911f053b092d4229d9f83a9dff222 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:51 +1000 Subject: [PATCH 70/84] md/raid10: simplify handle_read_error() handle_read_error() duplicates a lot of the work that raid10_read_request() does, so it makes sense to just use that function. handle_read_error() relies on the same r10bio being re-used so that, in the case of a read-only array, setting IO_BLOCKED in r1bio->devs[].bio ensures read_balance() won't re-use that device. So when called from raid10_make_request() we clear that array, but not when called from handle_read_error(). Two parts of handle_read_error() that need to be preserved are the warning message it prints, so they are conditionally added to raid10_read_request(). If the failing rdev can be found, messages are printed. Otherwise they aren't. Not that as rdev_dec_pending() has already been called on the failing rdev, we need to use rcu_read_lock() to get a new reference from the conf. We only use this to get the name of the failing block device. With this change, we no longer need inc_pending(). 
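For reference, the splitting convention that raid10_read_request() follows, and that handle_read_error() now inherits by calling it, can be condensed into the sketch below. This is an illustrative summary rather than code from the patch; split_to_fit() is an invented helper name, while conf->bio_split is the private bio_set added above.

/*
 * Take at most one split per call: chain the head to the original bio
 * so completions propagate correctly, hand the tail back to
 * generic_make_request() so it re-enters the make_request path later,
 * and carry on with the head only.
 */
static struct bio *split_to_fit(struct r10conf *conf, struct bio *bio,
				int max_sectors)
{
	if (max_sectors < bio_sectors(bio)) {
		struct bio *split = bio_split(bio, max_sectors,
					      GFP_NOIO, conf->bio_split);

		bio_chain(split, bio);
		generic_make_request(bio);	/* defer the remainder */
		bio = split;
	}
	return bio;
}

Because a request never holds more than one allocation from the private bio_set before resubmitting the tail, this is the behaviour the commit messages in this series describe as avoiding the theoretical deadlock of looping on one bio_set.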
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 122 +++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 75 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 41845bae67be..4167091eff9a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1008,15 +1008,6 @@ static void wait_barrier(struct r10conf *conf) spin_unlock_irq(&conf->resync_lock); } -static void inc_pending(struct r10conf *conf) -{ - /* The current request requires multiple r10_bio, so - * we need to increment the pending count. - */ - WARN_ON(!atomic_read(&conf->nr_pending)); - atomic_inc(&conf->nr_pending); -} - static void allow_barrier(struct r10conf *conf) { if ((atomic_dec_and_test(&conf->nr_pending)) || @@ -1130,8 +1121,38 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int max_sectors; sector_t sectors; struct md_rdev *rdev; - int slot; + char b[BDEVNAME_SIZE]; + int slot = r10_bio->read_slot; + struct md_rdev *err_rdev = NULL; + gfp_t gfp = GFP_NOIO; + if (r10_bio->devs[slot].rdev) { + /* + * This is an error retry, but we cannot + * safely dereference the rdev in the r10_bio, + * we must use the one in conf. + * If it has already been disconnected (unlikely) + * we lose the device name in error messages. + */ + int disk; + /* + * As we are blocking raid10, it is a little safer to + * use __GFP_HIGH. + */ + gfp = GFP_NOIO | __GFP_HIGH; + + rcu_read_lock(); + disk = r10_bio->devs[slot].devnum; + err_rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (err_rdev) + bdevname(err_rdev->bdev, b); + else { + strcpy(b, "???"); + /* This never gets dereferenced */ + err_rdev = r10_bio->devs[slot].rdev; + } + rcu_read_unlock(); + } /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. 
@@ -1158,12 +1179,22 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { + if (err_rdev) { + pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", + mdname(mddev), b, + (unsigned long long)r10_bio->sector); + } raid_end_bio_io(r10_bio); return; } + if (err_rdev) + pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, conf->bio_split); + gfp, conf->bio_split); bio_chain(split, bio); generic_make_request(bio); bio = split; @@ -1172,7 +1203,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; - read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + read_bio = bio_clone_fast(bio, gfp, mddev->bio_set); r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].rdev = rdev; @@ -1487,6 +1518,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); @@ -2556,9 +2588,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - char b[BDEVNAME_SIZE]; - unsigned long do_sync; - int max_sectors; dev_t bio_dev; sector_t bio_last_sector; @@ -2571,7 +2600,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. 
*/ bio = r10_bio->devs[slot].bio; - bdevname(bio->bi_bdev, b); bio_dev = bio->bi_bdev->bd_dev; bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); @@ -2587,65 +2615,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) md_error(mddev, rdev); rdev_dec_pending(rdev, mddev); - -read_more: - rdev = read_balance(conf, r10_bio, &max_sectors); - if (rdev == NULL) { - pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", - mdname(mddev), b, - (unsigned long long)r10_bio->sector); - raid_end_bio_io(r10_bio); - return; - } - - do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); - slot = r10_bio->read_slot; - pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", - mdname(mddev), - bdevname(rdev->bdev, b), - (unsigned long long)r10_bio->sector); - bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set); - bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); - r10_bio->devs[slot].bio = bio; - r10_bio->devs[slot].rdev = rdev; - bio->bi_iter.bi_sector = r10_bio->devs[slot].addr - + choose_data_offset(r10_bio, rdev); - bio->bi_bdev = rdev->bdev; - bio_set_op_attrs(bio, REQ_OP_READ, do_sync); - if (test_bit(FailFast, &rdev->flags) && - test_bit(R10BIO_FailFast, &r10_bio->state)) - bio->bi_opf |= MD_FAILFAST; - bio->bi_private = r10_bio; - bio->bi_end_io = raid10_end_read_request; - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, bio_dev, - bio_last_sector - r10_bio->sectors); - - if (max_sectors < r10_bio->sectors) { - /* Drat - have to split this up more */ - struct bio *mbio = r10_bio->master_bio; - int sectors_handled = - r10_bio->sector + max_sectors - - mbio->bi_iter.bi_sector; - r10_bio->sectors = max_sectors; - bio_inc_remaining(mbio); - inc_pending(conf); - generic_make_request(bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, - GFP_NOIO); - r10_bio->master_bio = mbio; - r10_bio->sectors = bio_sectors(mbio) - sectors_handled; - r10_bio->state = 0; - set_bit(R10BIO_ReadError, - &r10_bio->state); - r10_bio->mddev = mddev; - r10_bio->sector = mbio->bi_iter.bi_sector - + sectors_handled; - - goto read_more; - } else - generic_make_request(bio); + allow_barrier(conf); + r10_bio->state = 0; + raid10_read_request(mddev, r10_bio->master_bio, r10_bio); } static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) From dd7a8f5dee81ffb1794df1103f07c63fd4f1d766 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:51 +1000 Subject: [PATCH 71/84] md/raid5: make chunk_aligned_read() split bios more cleanly. chunk_aligned_read() currently uses fs_bio_set - which is meant for filesystems to use - and loops if multiple splits are needed, which is not best practice. As this is only used for READ requests, not writes, it is unlikely to cause a problem. However it is best to be consistent in how we split bios, and to follow the pattern used in raid1/raid10. So create a private bioset, bio_split, and use it to perform a single split, submitting the remainder to generic_make_request() for later processing. 
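The chunk-alignment arithmetic used in the raid10 hunk above and in the chunk_aligned_read() hunk that follows deserves a note. A minimal illustrative helper (the function name is invented; the formula is the one in the patches and assumes a power-of-two chunk size):

/*
 * Sectors remaining from 'sector' to the end of its chunk when
 * chunk_sects is a power of two.  Example: chunk_sects = 128 and
 * sector = 300 gives 300 & 127 = 44, so 128 - 44 = 84 sectors fit
 * before the next chunk boundary.
 */
static inline unsigned int sectors_to_chunk_end(sector_t sector,
						unsigned int chunk_sects)
{
	return chunk_sects - (sector & (chunk_sects - 1));
}

If the computed value is smaller than bio_sectors(bio), the request straddles a boundary and gets the single split-and-chain treatment described above.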
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 33 +++++++++++++++++---------------- drivers/md/raid5.h | 1 + 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f3692ff4262b..356cd9c7c753 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5246,24 +5246,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) { struct bio *split; + sector_t sector = raid_bio->bi_iter.bi_sector; + unsigned chunk_sects = mddev->chunk_sectors; + unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); - do { - sector_t sector = raid_bio->bi_iter.bi_sector; - unsigned chunk_sects = mddev->chunk_sectors; - unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); + if (sectors < bio_sectors(raid_bio)) { + struct r5conf *conf = mddev->private; + split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split); + bio_chain(split, raid_bio); + generic_make_request(raid_bio); + raid_bio = split; + } - if (sectors < bio_sectors(raid_bio)) { - split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); - bio_chain(split, raid_bio); - } else - split = raid_bio; - - if (!raid5_read_one_chunk(mddev, split)) { - if (split != raid_bio) - generic_make_request(raid_bio); - return split; - } - } while (split != raid_bio); + if (!raid5_read_one_chunk(mddev, raid_bio)) + return raid_bio; return NULL; } @@ -6747,6 +6743,8 @@ static void free_conf(struct r5conf *conf) if (conf->disks[i].extra_page) put_page(conf->disks[i].extra_page); kfree(conf->disks); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf->stripe_hashtbl); kfree(conf->pending_data); kfree(conf); @@ -6922,6 +6920,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; } + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!conf->bio_split) + goto abort; conf->mddev = mddev; if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index cdc7f92e1806..625c7f16fd6b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -646,6 +646,7 @@ struct r5conf { int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; + struct bio_set *bio_split; /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. From 868f604b1de2ddbdcf194d7792a0d91d7da719c1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:51 +1000 Subject: [PATCH 72/84] md/linear: improve bio splitting. linear_make_request() uses fs_bio_set, which is meant for filesystems to use, and loops, possible allocating from the same bio set multiple times. These behaviors can theoretically cause deadlocks, though as linear requests are hardly ever split, it is unlikely in practice. Change to use mddev->bio_set - otherwise unused for linear, and submit the tail of a split request to generic_make_request() for it to handle. 
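Once the bio has been clamped to a single member device, the remapping linear_make_request() performs (see the hunk that follows) is plain arithmetic. A worked illustration with invented numbers and an invented helper name:

/*
 * Array sector -> member-device sector for a linear array.  Example:
 * if the member holding this I/O begins at array sector 2048
 * (start_sector = tmp_dev->end_sector - rdev->sectors) and its data
 * area starts 256 sectors into the device (data_offset), then array
 * sector 3000 maps to device sector 3000 - 2048 + 256 = 1208.
 */
static inline sector_t linear_map_sector(sector_t bio_sector,
					 sector_t start_sector,
					 sector_t data_offset)
{
	return bio_sector - start_sector + data_offset;
}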
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/linear.c | 67 +++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3e38e0207a3e..f16316fbf658 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -249,53 +249,48 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) { char b[BDEVNAME_SIZE]; struct dev_info *tmp_dev; - struct bio *split; sector_t start_sector, end_sector, data_offset; + sector_t bio_sector = bio->bi_iter.bi_sector; if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } - do { - sector_t bio_sector = bio->bi_iter.bi_sector; - tmp_dev = which_dev(mddev, bio_sector); - start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; - end_sector = tmp_dev->end_sector; - data_offset = tmp_dev->rdev->data_offset; - bio->bi_bdev = tmp_dev->rdev->bdev; + tmp_dev = which_dev(mddev, bio_sector); + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + end_sector = tmp_dev->end_sector; + data_offset = tmp_dev->rdev->data_offset; - if (unlikely(bio_sector >= end_sector || - bio_sector < start_sector)) - goto out_of_bounds; + if (unlikely(bio_sector >= end_sector || + bio_sector < start_sector)) + goto out_of_bounds; - if (unlikely(bio_end_sector(bio) > end_sector)) { - /* This bio crosses a device boundary, so we have to - * split it. - */ - split = bio_split(bio, end_sector - bio_sector, - GFP_NOIO, fs_bio_set); - bio_chain(split, bio); - } else { - split = bio; - } + if (unlikely(bio_end_sector(bio) > end_sector)) { + /* This bio crosses a device boundary, so we have to split it */ + struct bio *split = bio_split(bio, end_sector - bio_sector, + GFP_NOIO, mddev->bio_set); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + } - split->bi_iter.bi_sector = split->bi_iter.bi_sector - - start_sector + data_offset; + bio->bi_bdev = tmp_dev->rdev->bdev; + bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - + start_sector + data_offset; - if (unlikely((bio_op(split) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { - /* Just ignore it */ - bio_endio(split); - } else { - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(split->bi_bdev), - split, disk_devt(mddev->gendisk), - bio_sector); - mddev_check_writesame(mddev, split); - generic_make_request(split); - } - } while (split != bio); + if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio); + } else { + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_writesame(mddev, bio); + generic_make_request(bio); + } return; out_of_bounds: From f00d7c85be9e39752ef87047a019dfc4cefbd299 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:51 +1000 Subject: [PATCH 73/84] md/raid0: fix up bio splitting. raid0_make_request() should use a private bio_set rather than the shared fs_bio_set, which is only meant for filesystems to use. raid0_make_request() shouldn't loop around using the bio_set multiple times as that can deadlock. So use mddev->bio_set and pass the tail to generic_make_request() instead of looping on it. 
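raid0 additionally has to cope with chunk sizes that are not powers of two, which is why the hunk below goes through sector_div(). An illustrative condensation (the helper name is invented):

/*
 * Sectors left in the current chunk.  sector_div() divides its first
 * argument in place and returns the remainder, which is why the real
 * code restores 'sector' from bio_sector afterwards; here only a
 * local copy is modified.
 */
static unsigned int raid0_sectors_to_chunk_end(sector_t sector,
					       unsigned int chunk_sects)
{
	if (likely(is_power_of_2(chunk_sects)))
		return chunk_sects - (sector & (chunk_sects - 1));
	return chunk_sects - sector_div(sector, chunk_sects);
}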
Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid0.c | 71 +++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 56f70c3ad37c..e777e48f55f6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -462,52 +462,53 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) { struct strip_zone *zone; struct md_rdev *tmp_dev; - struct bio *split; + sector_t bio_sector; + sector_t sector; + unsigned chunk_sects; + unsigned sectors; if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); return; } - do { - sector_t bio_sector = bio->bi_iter.bi_sector; - sector_t sector = bio_sector; - unsigned chunk_sects = mddev->chunk_sectors; + bio_sector = bio->bi_iter.bi_sector; + sector = bio_sector; + chunk_sects = mddev->chunk_sectors; - unsigned sectors = chunk_sects - - (likely(is_power_of_2(chunk_sects)) - ? (sector & (chunk_sects-1)) - : sector_div(sector, chunk_sects)); + sectors = chunk_sects - + (likely(is_power_of_2(chunk_sects)) + ? (sector & (chunk_sects-1)) + : sector_div(sector, chunk_sects)); - /* Restore due to sector_div */ - sector = bio_sector; + /* Restore due to sector_div */ + sector = bio_sector; - if (sectors < bio_sectors(bio)) { - split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); - bio_chain(split, bio); - } else { - split = bio; - } + if (sectors < bio_sectors(bio)) { + struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + } - zone = find_zone(mddev->private, §or); - tmp_dev = map_sector(mddev, zone, sector, §or); - split->bi_bdev = tmp_dev->bdev; - split->bi_iter.bi_sector = sector + zone->dev_start + - tmp_dev->data_offset; + zone = find_zone(mddev->private, §or); + tmp_dev = map_sector(mddev, zone, sector, §or); + bio->bi_bdev = tmp_dev->bdev; + bio->bi_iter.bi_sector = sector + zone->dev_start + + tmp_dev->data_offset; - if (unlikely((bio_op(split) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { - /* Just ignore it */ - bio_endio(split); - } else { - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(split->bi_bdev), - split, disk_devt(mddev->gendisk), - bio_sector); - mddev_check_writesame(mddev, split); - generic_make_request(split); - } - } while (split != bio); + if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio); + } else { + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_writesame(mddev, bio); + generic_make_request(bio); + } } static void raid0_status(struct seq_file *seq, struct mddev *mddev) From fcd403aff643a120dbb3f1851337df5353ac8970 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Tue, 11 Apr 2017 20:50:51 +0200 Subject: [PATCH 74/84] raid5-ppl: use a single mempool for ppl_io_unit and header_page Allocate both struct ppl_io_unit and its header_page from a shared mempool to avoid a possible deadlock. Implement allocate and free functions for the mempool, remove the second pool for allocating header_page. The header_pages are now freed with their io_units, not when the ppl bio completes. Also, use GFP_NOWAIT instead of GFP_ATOMIC for allocating ppl_io_unit because we can handle failed allocations and there is no reason to utilize emergency reserves. 
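A short sketch of how the combined pool is created and used may help; apart from ppl_io_pool_alloc()/ppl_io_pool_free(), which this patch adds, the snippet below is illustrative and condensed from the hunks that follow.

/*
 * One slab cache for the element struct, one mempool whose callbacks
 * allocate the struct and its header page as a unit, so
 * mempool_alloc() returns either a fully usable element or NULL,
 * never a half-initialised one.
 */
ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc,
				   ppl_io_pool_free, ppl_conf->io_kc);

/*
 * Allocation uses GFP_NOWAIT: a failure is tolerable, so there is no
 * reason to dip into the GFP_ATOMIC emergency reserves.  The
 * pre-allocated header_page must survive the memset() that
 * re-initialises the element.
 */
io = mempool_alloc(ppl_conf->io_pool, GFP_NOWAIT);
if (io) {
	struct page *header_page = io->header_page;

	memset(io, 0, sizeof(*io));
	io->header_page = header_page;
}

Freeing goes back through mempool_free(), whose ppl_io_pool_free() callback releases both the page and the struct, which is why ppl_log_endio() no longer frees the header page itself.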
Suggested-by: NeilBrown Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 53 +++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 4eb0ebcf9c29..5d25bebf3328 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -102,7 +102,6 @@ struct ppl_conf { struct kmem_cache *io_kc; mempool_t *io_pool; struct bio_set *bs; - mempool_t *meta_pool; /* used only for recovery */ int recovered_entries; @@ -197,25 +196,55 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, return tx; } +static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data) +{ + struct kmem_cache *kc = pool_data; + struct ppl_io_unit *io; + + io = kmem_cache_alloc(kc, gfp_mask); + if (!io) + return NULL; + + io->header_page = alloc_page(gfp_mask); + if (!io->header_page) { + kmem_cache_free(kc, io); + return NULL; + } + + return io; +} + +static void ppl_io_pool_free(void *element, void *pool_data) +{ + struct kmem_cache *kc = pool_data; + struct ppl_io_unit *io = element; + + __free_page(io->header_page); + kmem_cache_free(kc, io); +} + static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, struct stripe_head *sh) { struct ppl_conf *ppl_conf = log->ppl_conf; struct ppl_io_unit *io; struct ppl_header *pplhdr; + struct page *header_page; - io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC); + io = mempool_alloc(ppl_conf->io_pool, GFP_NOWAIT); if (!io) return NULL; + header_page = io->header_page; memset(io, 0, sizeof(*io)); + io->header_page = header_page; + io->log = log; INIT_LIST_HEAD(&io->log_sibling); INIT_LIST_HEAD(&io->stripe_list); atomic_set(&io->pending_stripes, 0); bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS); - io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO); pplhdr = page_address(io->header_page); clear_page(pplhdr); memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); @@ -371,8 +400,6 @@ static void ppl_log_endio(struct bio *bio) if (bio->bi_error) md_error(ppl_conf->mddev, log->rdev); - mempool_free(io->header_page, ppl_conf->meta_pool); - list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { list_del_init(&sh->log_list); @@ -1007,7 +1034,6 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf) kfree(ppl_conf->child_logs); - mempool_destroy(ppl_conf->meta_pool); if (ppl_conf->bs) bioset_free(ppl_conf->bs); mempool_destroy(ppl_conf->io_pool); @@ -1113,25 +1139,20 @@ int ppl_init_log(struct r5conf *conf) ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0); if (!ppl_conf->io_kc) { - ret = -EINVAL; + ret = -ENOMEM; goto err; } - ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc); + ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc, + ppl_io_pool_free, ppl_conf->io_kc); if (!ppl_conf->io_pool) { - ret = -EINVAL; + ret = -ENOMEM; goto err; } ppl_conf->bs = bioset_create(conf->raid_disks, 0); if (!ppl_conf->bs) { - ret = -EINVAL; - goto err; - } - - ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0); - if (!ppl_conf->meta_pool) { - ret = -EINVAL; + ret = -ENOMEM; goto err; } From 039b7225e6e98783a7a7e79c52b29c437f29967d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 12 Apr 2017 16:26:13 +1000 Subject: [PATCH 75/84] md: allow creation of mdNNN arrays via md_mod/parameters/new_array The intention when creating the "new_array" parameter and the possibility of having array names line "md_HOME" was to transition away from the old way of creating 
arrays and to eventually only use this new way. The "old" way of creating array is to create a device node in /dev and then open it. The act of opening creates the array. This is problematic because sometimes the device node can be opened when we don't want to create an array. This can easily happen when some rule triggered by udev looks at a device as it is being destroyed. The node in /dev continues to exist for a short period after an array is stopped, and opening it during this time recreates the array (as an inactive array). Unfortunately no clear plan for the transition was created. It is now time to fix that. This patch allows devices with numeric names, like "md999" to be created by writing to "new_array". This will only work if the minor number given is not already in use. This will allow mdadm to support the creation of arrays with numbers > 511 (currently not possible) by writing to new_array. mdadm can, at some point, use this approach to create *all* arrays, which will allow the transition to only using the new-way. Signed-off-by: NeilBrown Acted-by: Coly Li Signed-off-by: Shaohua Li --- drivers/md/md.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5022be183390..554bf213c043 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5162,6 +5162,15 @@ static void no_op(struct percpu_ref *r) {} static int md_alloc(dev_t dev, char *name) { + /* + * If dev is zero, name is the name of a device to allocate with + * an arbitrary minor number. It will be "md_???" + * If dev is non-zero it must be a device number with a MAJOR of + * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then + * the device is being created by opening a node in /dev. + * If "name" is not NULL, the device is being created by + * writing to /sys/module/md_mod/parameters/new_array. + */ static DEFINE_MUTEX(disks_mutex); struct mddev *mddev = mddev_find(dev); struct gendisk *disk; @@ -5187,7 +5196,7 @@ static int md_alloc(dev_t dev, char *name) if (mddev->gendisk) goto abort; - if (name) { + if (name && !dev) { /* Need to ensure that 'name' is not a duplicate. */ struct mddev *mddev2; @@ -5201,6 +5210,11 @@ static int md_alloc(dev_t dev, char *name) } spin_unlock(&all_mddevs_lock); } + if (name && dev) + /* + * Creating /dev/mdNNN via "newarray", so adjust hold_active. + */ + mddev->hold_active = UNTIL_STOP; error = -ENOMEM; mddev->queue = blk_alloc_queue(GFP_KERNEL); @@ -5277,21 +5291,31 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) static int add_named_array(const char *val, struct kernel_param *kp) { - /* val must be "md_*" where * is not all digits. - * We allocate an array with a large free minor number, and + /* + * val must be "md_*" or "mdNNN". + * For "md_*" we allocate an array with a large free minor number, and * set the name to val. val must not already be an active name. + * For "mdNNN" we allocate an array with the minor number NNN + * which must not already be in use. 
*/ int len = strlen(val); char buf[DISK_NAME_LEN]; + unsigned long devnum; while (len && val[len-1] == '\n') len--; if (len >= DISK_NAME_LEN) return -E2BIG; strlcpy(buf, val, len+1); - if (strncmp(buf, "md_", 3) != 0) - return -EINVAL; - return md_alloc(0, buf); + if (strncmp(buf, "md_", 3) == 0) + return md_alloc(0, buf); + if (strncmp(buf, "md", 2) == 0 && + isdigit(buf[2]) && + kstrtoul(buf+2, 10, &devnum) == 0 && + devnum <= MINORMASK) + return md_alloc(MKDEV(MD_MAJOR, devnum), NULL); + + return -EINVAL; } static void md_safemode_timeout(unsigned long data) From 78b6350dcaadb03b4a2970b16387227ba6744876 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 12 Apr 2017 16:26:13 +1000 Subject: [PATCH 76/84] md: support disabling of create-on-open semantics. md allows a new array device to be created by simply opening a device file. This make it difficult to remove the device and udev is likely to open the device file as part of processing the REMOVE event. There is an alternate mechanism for creating arrays by writing to the new_array module parameter. When using tools that work with this parameter, it is best to disable the old semantics. This new module parameter allows that. Signed-off-by: NeilBrown Acted-by: Coly Li Signed-off-by: Shaohua Li --- drivers/md/md.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 554bf213c043..6cc6dd74c153 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -174,6 +174,16 @@ static const struct block_device_operations md_fops; static int start_readonly; +/* + * The original mechanism for creating an md device is to create + * a device node in /dev and to open it. This causes races with device-close. + * The preferred method is to write to the "new_array" module parameter. + * This can avoid races. + * Setting create_on_open to false disables the original mechanism + * so all the races disappear. + */ +static bool create_on_open = true; + /* bio_clone_mddev * like bio_clone, but with a local bio set */ @@ -5285,7 +5295,8 @@ static int md_alloc(dev_t dev, char *name) static struct kobject *md_probe(dev_t dev, int *part, void *data) { - md_alloc(dev, NULL); + if (create_on_open) + md_alloc(dev, NULL); return NULL; } @@ -9200,6 +9211,7 @@ static int set_ro(const char *val, struct kernel_param *kp) module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); +module_param(create_on_open, bool, S_IRUSR|S_IWUSR); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); From 835d89e92fa77b6a1effea3db80202e1ffec413a Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 14 Apr 2017 08:08:03 +0200 Subject: [PATCH 77/84] md-cluster: Fix a memleak in an error handling path We know that 'bm_lockres' is NULL here, so 'lockres_free(bm_lockres)' is a no-op. According to resource handling in case of error a few lines below, it is likely that 'bitmap_free(bitmap)' was expected instead. 
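In context, the corrected error path reads as in the sketch below, condensed from cluster_check_sync_size(); the bitmap was obtained by an earlier call that is not shown here.

bm_lockres = lockres_init(mddev, str, NULL, 1);
if (!bm_lockres) {
	pr_err("md-cluster: Cannot initialize %s\n", str);
	bitmap_free(bitmap);	/* release what was actually allocated */
	return -1;		/* lockres_free(NULL) would have been a no-op */
}
bm_lockres->flags |= DLM_LKF_NOQUEUE;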
Fixes: b98938d16a10 ("md-cluster: introduce cluster_check_sync_size") Signed-off-by: Christophe JAILLET Reviewed-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index b21ef58819f6..7299ce2f08a8 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1127,7 +1127,7 @@ int cluster_check_sync_size(struct mddev *mddev) bm_lockres = lockres_init(mddev, str, NULL, 1); if (!bm_lockres) { pr_err("md-cluster: Cannot initialize %s\n", str); - lockres_free(bm_lockres); + bitmap_free(bitmap); return -1; } bm_lockres->flags |= DLM_LKF_NOQUEUE; From fc6d2a3ca59d5656d5b0ac3b25ecf493e4614abd Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Wed, 19 Apr 2017 10:48:06 +0200 Subject: [PATCH 78/84] uapi: fix linux/raid/md_p.h userspace compilation error Use __le32 and __le64 instead of u32 and u64. This fixes klibc build error: In file included from /klibc/usr/klibc/../include/sys/md.h:30:0, from /klibc/usr/kinit/do_mounts_md.c:19: /linux-next/usr/include/linux/raid/md_p.h:414:51: error: 'u32' undeclared here (not in a function) (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(u32) - sizeof(u64)) Reported-by: Greg Thelen Reported-by: Nigel Croxon Tested-by: Greg Thelen Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- include/uapi/linux/raid/md_p.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index d9a1ead867b9..d500bd224979 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -411,7 +411,7 @@ struct ppl_header_entry { #define PPL_HEADER_SIZE 4096 #define PPL_HDR_RESERVED 512 #define PPL_HDR_ENTRY_SPACE \ - (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(u32) - sizeof(u64)) + (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(__le32) - sizeof(__le64)) #define PPL_HDR_MAX_ENTRIES \ (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry)) From cf25ae78fc50010f66b9be945017796da34c434d Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 17 Apr 2017 17:11:05 +0800 Subject: [PATCH 79/84] md/raid10: wait up frozen array in handle_write_completed Since nr_queued is changed, we need to call wake_up here if the array is already frozen and waiting for condition "nr_pending == nr_queued + extra" to be true. And commit 824e47daddbf ("RAID1: avoid unnecessary spin locks in I/O barrier code") which has already added the wake_up for raid1. Signed-off-by: Guoqing Jiang Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4167091eff9a..acb3f46f522f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2704,6 +2704,11 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) list_add(&r10_bio->retry_list, &conf->bio_end_io_list); conf->nr_queued++; spin_unlock_irq(&conf->device_lock); + /* + * In case freeze_array() is waiting for condition + * nr_pending == nr_queued + extra to be true. + */ + wake_up(&conf->wait_barrier); md_wakeup_thread(conf->mddev->thread); } else { if (test_bit(R10BIO_WriteError, From 97b20ef784388753becb41b27876a048e905fe9b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Apr 2017 08:53:48 +1000 Subject: [PATCH 80/84] md: handle read-only member devices better. 
1/ If an array has any read-only devices when it is started, the array itself must be read-only 2/ A read-only device cannot be added to an array after it is started. 3/ Setting an array to read-write should not succeed if any member devices are read-only Reported-and-Tested-by: Nanda Kishore Chinnaram Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 6cc6dd74c153..82f798be964f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2103,6 +2103,10 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; + if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) && + mddev->pers) + return -EROFS; + /* make sure rdev->sectors exceeds mddev->dev_sectors */ if (!test_bit(Journal, &rdev->flags) && rdev->sectors && @@ -5382,6 +5386,13 @@ int md_run(struct mddev *mddev) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); + if (mddev->ro != 1 && + (bdev_read_only(rdev->bdev) || + bdev_read_only(rdev->meta_bdev))) { + mddev->ro = 1; + if (mddev->gendisk) + set_disk_ro(mddev->gendisk, 1); + } /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, @@ -5606,6 +5617,9 @@ out: static int restart_array(struct mddev *mddev) { struct gendisk *disk = mddev->gendisk; + struct md_rdev *rdev; + bool has_journal = false; + bool has_readonly = false; /* Complain if it has no devices */ if (list_empty(&mddev->disks)) @@ -5614,24 +5628,21 @@ static int restart_array(struct mddev *mddev) return -EINVAL; if (!mddev->ro) return -EBUSY; - if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { - struct md_rdev *rdev; - bool has_journal = false; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - if (test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) { - has_journal = true; - break; - } - } - rcu_read_unlock(); - - /* Don't restart rw with journal missing/faulty */ - if (!has_journal) - return -EINVAL; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + has_journal = true; + if (bdev_read_only(rdev->bdev)) + has_readonly = true; } + rcu_read_unlock(); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) + /* Don't restart rw with journal missing/faulty */ + return -EINVAL; + if (has_readonly) + return -EROFS; mddev->safemode = 0; mddev->ro = 0; From 296617581eac713b3fda588216ae6d16d1e76dd5 Mon Sep 17 00:00:00 2001 From: Lidong Zhong Date: Fri, 21 Apr 2017 15:21:38 +0800 Subject: [PATCH 81/84] md/raid1/10: remove unused queue A queue is declared and get from the disk of the array, but it's not used anywhere. So removing it from the source. 
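For readers skimming the series, the read-only policy introduced in the previous patch ("md: handle read-only member devices better") condenses to the check below; the function name is invented and the logic is lifted from the restart_array() hunk above.

/*
 * An array may only go (or stay) read-write if no member device is
 * read-only, and, when the array has a journal, the journal is
 * present and not faulty.
 */
static int can_go_readwrite(struct mddev *mddev)
{
	struct md_rdev *rdev;
	bool has_journal = false, has_readonly = false;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		if (test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			has_journal = true;
		if (bdev_read_only(rdev->bdev))
			has_readonly = true;
	}
	rcu_read_unlock();

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
		return -EINVAL;	/* don't go rw with the journal missing/faulty */
	if (has_readonly)
		return -EROFS;
	return 0;
}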
Signed-off-by: Lidong Zhong Acted-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 3 --- drivers/md/raid10.c | 2 -- 2 files changed, 5 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 57611f43ed6c..14a9d36b25b8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2961,7 +2961,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); rdev_for_each(rdev, mddev) { - struct request_queue *q; int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -2974,8 +2973,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (disk->rdev) goto abort; disk->rdev = rdev; - q = bdev_get_queue(rdev->bdev); - disk->head_position = 0; disk->seq_start = MaxSector; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index acb3f46f522f..5de951bcd24c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3644,7 +3644,6 @@ static int raid10_run(struct mddev *mddev) rdev_for_each(rdev, mddev) { long long diff; - struct request_queue *q; disk_idx = rdev->raid_disk; if (disk_idx < 0) @@ -3663,7 +3662,6 @@ static int raid10_run(struct mddev *mddev) goto out_free_conf; disk->rdev = rdev; } - q = bdev_get_queue(rdev->bdev); diff = (rdev->new_data_offset - rdev->data_offset); if (!mddev->reshape_backwards) diff = -diff; From e5bc9c3c5432f5531a58e6fdd9f6c6587f2137b3 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 24 Apr 2017 15:58:04 +0800 Subject: [PATCH 82/84] md: clear WantReplacement once disk is removed We can clear 'WantReplacement' flag directly no matter it's replacement existed or not since the semantic is same as before. Also since the disk is removed from array, then it is straightforward to remove 'WantReplacement' flag and the comments in raid10/5 can be removed as well. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 8 ++------ drivers/md/raid5.c | 9 +++------ 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 14a9d36b25b8..70a596c10306 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1831,9 +1831,9 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) p->rdev = repl; conf->mirrors[conf->raid_disks + number].rdev = NULL; unfreeze_array(conf); - clear_bit(WantReplacement, &rdev->flags); - } else - clear_bit(WantReplacement, &rdev->flags); + } + + clear_bit(WantReplacement, &rdev->flags); err = md_integrity_register(mddev); } abort: diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5de951bcd24c..2883b720a265 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1874,13 +1874,9 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * but will never see neither -- if they are careful. 
*/ p->replacement = NULL; - clear_bit(WantReplacement, &rdev->flags); - } else - /* We might have just remove the Replacement as faulty - * Clear the flag just in case - */ - clear_bit(WantReplacement, &rdev->flags); + } + clear_bit(WantReplacement, &rdev->flags); err = md_integrity_register(mddev); abort: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 356cd9c7c753..3d971e5a1b0e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7603,15 +7603,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * but will never see neither - if they are careful */ p->replacement = NULL; - clear_bit(WantReplacement, &rdev->flags); if (!err) err = log_modify(conf, p->rdev, true); - } else - /* We might have just removed the Replacement as faulty- - * clear the bit just in case - */ - clear_bit(WantReplacement, &rdev->flags); + } + + clear_bit(WantReplacement, &rdev->flags); abort: print_raid5_conf(conf); From 43ac9b84a399bc10210a2d9f4e0778b7c6059c07 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 27 Apr 2017 16:28:49 +0800 Subject: [PATCH 83/84] md/raid1: Use a new variable to count flighting sync requests In new barrier codes, raise_barrier waits if conf->nr_pending[idx] is not zero. After all the conditions are true, the resync request can go on be handled. But it adds conf->nr_pending[idx] again. The next resync request hit the same bucket idx need to wait the resync request which is submitted before. The performance of resync/recovery is degraded. So we should use a new variable to count sync requests which are in flight. I did a simple test: 1. Without the patch, create a raid1 with two disks. The resync speed: Device: rrqm/s wrqm/s r/s w/s rMB/s wMB/s avgrq-sz avgqu-sz await r_await w_await svctm %util sdb 0.00 0.00 166.00 0.00 10.38 0.00 128.00 0.03 0.20 0.20 0.00 0.19 3.20 sdc 0.00 0.00 0.00 166.00 0.00 10.38 128.00 0.96 5.77 0.00 5.77 5.75 95.50 2. 
With the patch, the result is: sdb 2214.00 0.00 766.00 0.00 185.69 0.00 496.46 2.80 3.66 3.66 0.00 1.03 79.10 sdc 0.00 2205.00 0.00 769.00 0.00 186.44 496.52 5.25 6.84 0.00 6.84 1.30 100.10 Suggested-by: Shaohua Li Signed-off-by: Xiao Ni Acked-by: Coly Li Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 7 ++++--- drivers/md/raid1.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 70a596c10306..5d19c1a7d90a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -885,7 +885,7 @@ static void raise_barrier(struct r1conf *conf, sector_t sector_nr) atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, conf->resync_lock); - atomic_inc(&conf->nr_pending[idx]); + atomic_inc(&conf->nr_sync_pending); spin_unlock_irq(&conf->resync_lock); } @@ -896,7 +896,7 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr) BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); atomic_dec(&conf->barrier[idx]); - atomic_dec(&conf->nr_sync_pending); wake_up(&conf->wait_barrier); } @@ -1033,7 +1033,8 @@ static int get_unqueued_pending(struct r1conf *conf) { int idx, ret; - for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + ret = atomic_read(&conf->nr_sync_pending); + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) ret += atomic_read(&conf->nr_pending[idx]) - atomic_read(&conf->nr_queued[idx]); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index b0ab0da6e39e..c8894ef1e9d2 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -84,6 +84,7 @@ struct r1conf { */ wait_queue_head_t wait_barrier; spinlock_t resync_lock; + atomic_t nr_sync_pending; atomic_t *nr_pending; atomic_t *nr_waiting; atomic_t *nr_queued; From b506335e5d2b4ec687dde392a3bdbf7601778f1d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 1 May 2017 12:15:07 -0700 Subject: [PATCH 84/84] md/raid10: skip spare disk as 'first' disk Commit 6f287ca ("md/raid10: reset the 'first' at the end of loop") ignores a case in reshape: the first rdev could be a spare disk, which shouldn't be counted as the first disk since it doesn't include the offset info. Fixes: 6f287ca ("md/raid10: reset the 'first' at the end of loop") Cc: Guoqing Jiang Cc: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2883b720a265..cce23be9cc93 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4079,8 +4079,8 @@ static int raid10_start_reshape(struct mddev *mddev) diff = 0; if (first || diff < min_offset_diff) min_offset_diff = diff; + first = 0; } - first = 0; } if (max(before_length, after_length) > min_offset_diff