From 41a9a0dcf8954654467f979838938e39ef4da590 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:33:08 -0400
Subject: [PATCH 01/19] md-cluster: change resync lock from asynchronous to
 synchronous

If multiple nodes choose to attempt do resync at the same time
they need to be serialized so they don't duplicate effort. This
serialization is done by locking the 'resync' DLM lock.

Currently if a node cannot get the lock immediately it doesn't
request notification when the lock becomes available (i.e.
DLM_LKF_NOQUEUE is set), so it may not reliably find out when it
is safe to try again.

Rather than trying to arrange an async wake-up when the lock
becomes available, switch to using synchronous locking - this is
a lot easier to think about.  As it is not permitted to block in
the 'raid1d' thread, move the locking to the resync thread.  So
the rsync thread is forked immediately, but it blocks until the
resync lock is available. Once the lock is locked it checks again
if any resync action is needed.

A particular symptom of the current problem is that a node can
get stuck with "resync=pending" indefinitely.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c |  2 --
 drivers/md/md.c         | 23 ++++++++++++++---------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index dd97d4245822..12fbfeced238 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -937,7 +937,6 @@ static void metadata_update_cancel(struct mddev *mddev)
 static int resync_start(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
 	return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
 }
 
@@ -967,7 +966,6 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 static int resync_finish(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
 	dlm_unlock_sync(cinfo->resync_lockres);
 	return resync_info_update(mddev, 0, 0);
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 14d3b37944df..4fd7d7757f2d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7786,6 +7786,7 @@ void md_do_sync(struct md_thread *thread)
 	char *desc, *action = NULL;
 	struct blk_plug plug;
 	bool cluster_resync_finished = false;
+	int ret;
 
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7795,6 +7796,19 @@ void md_do_sync(struct md_thread *thread)
 		return;
 	}
 
+	if (mddev_is_clustered(mddev)) {
+		ret = md_cluster_ops->resync_start(mddev);
+		if (ret)
+			goto skip;
+
+		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
+			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+		     && ((unsigned long long)mddev->curr_resync_completed
+			 < (unsigned long long)mddev->resync_max_sectors))
+			goto skip;
+	}
+
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
 			desc = "data-check";
@@ -8226,18 +8240,9 @@ static void md_start_sync(struct work_struct *ws)
 	struct mddev *mddev = container_of(ws, struct mddev, del_work);
 	int ret = 0;
 
-	if (mddev_is_clustered(mddev)) {
-		ret = md_cluster_ops->resync_start(mddev);
-		if (ret) {
-			mddev->sync_thread = NULL;
-			goto out;
-		}
-	}
-
 	mddev->sync_thread = md_register_thread(md_do_sync,
 						mddev,
 						"resync");
-out:
 	if (!mddev->sync_thread) {
 		if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
 			printk(KERN_ERR "%s: could not start resync"

From 2c97cf138527a0f0283fcca9acf4a06216bec7da Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:33:09 -0400
Subject: [PATCH 02/19] md-cluser: make resync_finish only called after
 pers->sync_request

It is not reasonable that cluster raid to release resync
lock before the last pers->sync_request has finished.

As the metadata will be changed when node performs resync,
we need to inform other nodes to update metadata, so the
MD_CHANGE_PENDING flag is set before finish resync.

Then metadata_update_finish is move ahead to ensure that
METADATA_UPDATED msg is sent before finish resync, and
metadata_update_start need to be run after "repeat:" label
accordingly.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4fd7d7757f2d..dd83a50d892c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2291,6 +2291,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
 		return;
 	}
 
+repeat:
 	if (mddev_is_clustered(mddev)) {
 		if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
 			force_change = 1;
@@ -2303,7 +2304,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
 			return;
 		}
 	}
-repeat:
+
 	/* First make sure individual recovery_offsets are correct */
 	rdev_for_each(rdev, mddev) {
 		if (rdev->raid_disk >= 0 &&
@@ -2430,6 +2431,9 @@ repeat:
 	md_super_wait(mddev);
 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
 
+	if (mddev_is_clustered(mddev) && ret == 0)
+		md_cluster_ops->metadata_update_finish(mddev);
+
 	spin_lock(&mddev->lock);
 	if (mddev->in_sync != sync_req ||
 	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
@@ -2452,9 +2456,6 @@ repeat:
 		clear_bit(BlockedBadBlocks, &rdev->flags);
 		wake_up(&rdev->blocked_wait);
 	}
-
-	if (mddev_is_clustered(mddev) && ret == 0)
-		md_cluster_ops->metadata_update_finish(mddev);
 }
 EXPORT_SYMBOL(md_update_sb);
 
@@ -7785,7 +7786,6 @@ void md_do_sync(struct md_thread *thread)
 	struct md_rdev *rdev;
 	char *desc, *action = NULL;
 	struct blk_plug plug;
-	bool cluster_resync_finished = false;
 	int ret;
 
 	/* just incase thread restarts... */
@@ -8103,11 +8103,6 @@ void md_do_sync(struct md_thread *thread)
 		mddev->curr_resync_completed = mddev->curr_resync;
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
-	/* tell personality and other nodes that we are finished */
-	if (mddev_is_clustered(mddev)) {
-		md_cluster_ops->resync_finish(mddev);
-		cluster_resync_finished = true;
-	}
 	mddev->pers->sync_request(mddev, max_sectors, &skipped);
 
 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
@@ -8147,9 +8142,15 @@ void md_do_sync(struct md_thread *thread)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
 	if (mddev_is_clustered(mddev) &&
-	    test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
-	    !cluster_resync_finished)
+	    ret == 0) {
+		/* set CHANGE_PENDING here since maybe another
+		 * update is needed, so other nodes are informed */
+		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		md_wakeup_thread(mddev->thread);
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 		md_cluster_ops->resync_finish(mddev);
+	}
 
 	spin_lock(&mddev->lock);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

From eb315cd0933b903126996e79c12cb1eae617a036 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:33:10 -0400
Subject: [PATCH 03/19] md-cluster: wake up thread to continue recovery

In recovery case, we need to set MD_RECOVERY_NEEDED
and wake up thread only if recover is not finished.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 12fbfeced238..0d4ddf85d607 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -284,11 +284,14 @@ static void recover_bitmaps(struct md_thread *thread)
 			goto dlm_unlock;
 		}
 		if (hi > 0) {
-			/* TODO:Wait for current resync to get over */
-			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			if (lo < mddev->recovery_cp)
 				mddev->recovery_cp = lo;
-			md_check_recovery(mddev);
+			/* wake up thread to continue resync in case resync
+			 * is not finished */
+			if (mddev->recovery_cp != MaxSector) {
+			    set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			    md_wakeup_thread(mddev->thread);
+			}
 		}
 dlm_unlock:
 		dlm_unlock_sync(bm_lockres);

From 5b0fb33e8aa1c8a94f763d1025445146412ca766 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:33:11 -0400
Subject: [PATCH 04/19] md-cluster: unregister thread if err happened

The two threads need to be unregistered if a node
can't join cluster successfully.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 0d4ddf85d607..76f88f731aa1 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -818,6 +818,8 @@ static int join(struct mddev *mddev, int nodes)
 
 	return 0;
 err:
+	md_unregister_thread(&cinfo->recovery_thread);
+	md_unregister_thread(&cinfo->recv_thread);
 	lockres_free(cinfo->message_lockres);
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);

From 1535212c542285e430d44a75bfc0a99df610f598 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:33:12 -0400
Subject: [PATCH 05/19] md-cluster: fix locking when node joins cluster during
 message broadcast

If a node joins the cluster while a message broadcast
is under way, a lock issue could happen as follows.

For a cluster which included two nodes, if node A is
calling __sendmsg before up-convert CR to EX on ack,
and node B released CR on ack. But if a new node C
joins the cluster and it doesn't receive the message
which A sent before, so it could hold CR on ack before
A up-convert CR to EX on ack.

So a node joining the cluster should get an EX lock on
the "token" first to ensure no broadcast is ongoing,
then release it after held CR on ack.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 76f88f731aa1..30f1160142c1 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -781,17 +781,24 @@ static int join(struct mddev *mddev, int nodes)
 	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
 	if (!cinfo->token_lockres)
 		goto err;
-	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
-	if (!cinfo->ack_lockres)
-		goto err;
 	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
 	if (!cinfo->no_new_dev_lockres)
 		goto err;
 
+	ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+	if (ret) {
+		ret = -EAGAIN;
+		pr_err("md-cluster: can't join cluster to avoid lock issue\n");
+		goto err;
+	}
+	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
+	if (!cinfo->ack_lockres)
+		goto err;
 	/* get sync CR lock on ACK. */
 	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
 		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
 				ret);
+	dlm_unlock_sync(cinfo->token_lockres);
 	/* get sync CR lock on no-new-dev. */
 	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
 		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);

From ab5a98b132fd1a08ca35e95498fb45f4a8f3b0c4 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:33:13 -0400
Subject: [PATCH 06/19] md-cluster: change array_sectors and update size are
 not supported

Currently, some features are not supported yet,
such as change array_sectors and update size, so
return EINVAL for them and listed it in document.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 Documentation/md-cluster.txt | 6 ++++++
 drivers/md/md.c              | 8 ++++++++
 2 files changed, 14 insertions(+)

diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt
index c100c7163507..38883276d31c 100644
--- a/Documentation/md-cluster.txt
+++ b/Documentation/md-cluster.txt
@@ -316,3 +316,9 @@ The algorithm is:
  nodes are using the raid which is achieved by lock all bitmap
  locks within the cluster, and also those locks are unlocked
  accordingly.
+
+7. Unsupported features
+
+There are somethings which are not supported by cluster MD yet.
+
+- update size and change array_sectors.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dd83a50d892c..8cc4bbcf9bf8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4817,6 +4817,10 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err)
 		return err;
 
+	/* cluster raid doesn't support change array_sectors */
+	if (mddev_is_clustered(mddev))
+		return -EINVAL;
+
 	if (strncmp(buf, "default", 7) == 0) {
 		if (mddev->pers)
 			sectors = mddev->pers->size(mddev, 0, 0);
@@ -6438,6 +6442,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 	int rv;
 	int fit = (num_sectors == 0);
 
+	/* cluster raid doesn't support update size */
+	if (mddev_is_clustered(mddev))
+		return -EINVAL;
+
 	if (mddev->pers->resize == NULL)
 		return -EINVAL;
 	/* The "num_sectors" is the number of sectors of each device that

From a578183ed9dff915878ec6c2b3bf729bf72b9bd1 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:33:14 -0400
Subject: [PATCH 07/19] md-cluster: wakeup thread if activated a spare disk

When a device is re-added, it will ultimately need
to be activated and that happens in md_check_recovery,
so we need to set MD_RECOVERY_NEEDED right after
remove_and_add_spares.

A specifical issue without the change is that when
one node perform fail/remove/readd on a disk, but
slave nodes could not add the disk back to array as
expected (added as missed instead of in sync). So
give slave nodes a chance to do resync.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8cc4bbcf9bf8..06f6e81f1516 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8694,6 +8694,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 				ret = remove_and_add_spares(mddev, rdev2);
 				pr_info("Activated spare: %s\n",
 						bdevname(rdev2->bdev,b));
+				/* wakeup mddev->thread here, so array could
+				 * perform resync with the new activated disk */
+				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+				md_wakeup_thread(mddev->thread);
+
 			}
 			/* device faulty
 			 * We just want to do the minimum to mark the disk

From c9d65032282943d11b2773ed6f0279ba4820fed1 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:50:11 -0400
Subject: [PATCH 08/19] md-cluster: always setup in-memory bitmap

The in-memory bitmap for raid is allocated on demand,
then for cluster scenario, it is possible that slave
node which received RESYNCING message doesn't have the
in-memory bitmap when master node is perform resyncing,
so we can't make bitmap is match up well among each
nodes.

So for cluster scenario, we need always preserve the
bitmap, and ensure the page will not be freed. And a
no_hijack flag is introduced to both bitmap_checkpage
and bitmap_get_counter, which makes cluster raid returns
fail once allocate failed.

And the next patch is relied on this change since it
keeps sync bitmap among each nodes during resyncing
stage.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/bitmap.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 3fe86b54d50b..431da21cb488 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -46,7 +46,7 @@ static inline char *bmname(struct bitmap *bitmap)
  * allocated while we're using it
  */
 static int bitmap_checkpage(struct bitmap_counts *bitmap,
-			    unsigned long page, int create)
+			    unsigned long page, int create, int no_hijack)
 __releases(bitmap->lock)
 __acquires(bitmap->lock)
 {
@@ -90,6 +90,9 @@ __acquires(bitmap->lock)
 
 	if (mappage == NULL) {
 		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
+		/* We don't support hijack for cluster raid */
+		if (no_hijack)
+			return -ENOMEM;
 		/* failed - set the hijacked flag so that we can use the
 		 * pointer as a counter */
 		if (!bitmap->bp[page].map)
@@ -1321,7 +1324,7 @@ __acquires(bitmap->lock)
 	sector_t csize;
 	int err;
 
-	err = bitmap_checkpage(bitmap, page, create);
+	err = bitmap_checkpage(bitmap, page, create, 0);
 
 	if (bitmap->bp[page].hijacked ||
 	    bitmap->bp[page].map == NULL)
@@ -2032,6 +2035,36 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		     chunks << chunkshift);
 
 	spin_lock_irq(&bitmap->counts.lock);
+	/* For cluster raid, need to pre-allocate bitmap */
+	if (mddev_is_clustered(bitmap->mddev)) {
+		unsigned long page;
+		for (page = 0; page < pages; page++) {
+			ret = bitmap_checkpage(&bitmap->counts, page, 1, 1);
+			if (ret) {
+				unsigned long k;
+
+				/* deallocate the page memory */
+				for (k = 0; k < page; k++) {
+					if (new_bp[k].map)
+						kfree(new_bp[k].map);
+				}
+
+				/* restore some fields from old_counts */
+				bitmap->counts.bp = old_counts.bp;
+				bitmap->counts.pages = old_counts.pages;
+				bitmap->counts.missing_pages = old_counts.pages;
+				bitmap->counts.chunkshift = old_counts.chunkshift;
+				bitmap->counts.chunks = old_counts.chunks;
+				bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
+									     BITMAP_BLOCK_SHIFT);
+				blocks = old_counts.chunks << old_counts.chunkshift;
+				pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n");
+				break;
+			} else
+				bitmap->counts.bp[page].count += 1;
+		}
+	}
+
 	for (block = 0; block < blocks; ) {
 		bitmap_counter_t *bmc_old, *bmc_new;
 		int set;

From 18c9ff7f487efa8e88886bee21bd3516dde05bc5 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:50:12 -0400
Subject: [PATCH 09/19] md-cluster: sync bitmap when node received RESYNCING
 msg

If the node received RESYNCING message which means
another node will perform resync with the area, then
we don't want to do it again in another node.

Let's set RESYNC_MASK and clear NEEDED_MASK for the
region from old-low to new-low which has finished
syncing, and the region from old-hi to new-hi is about
to syncing, bitmap_sync_with_cluste is introduced for
the purpose.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/bitmap.c     | 21 +++++++++++++++++++++
 drivers/md/bitmap.h     |  3 +++
 drivers/md/md-cluster.c | 27 +++++++++++++++++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 431da21cb488..ac93d874578a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1597,6 +1597,27 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
 }
 EXPORT_SYMBOL(bitmap_cond_end_sync);
 
+void bitmap_sync_with_cluster(struct mddev *mddev,
+			      sector_t old_lo, sector_t old_hi,
+			      sector_t new_lo, sector_t new_hi)
+{
+	struct bitmap *bitmap = mddev->bitmap;
+	sector_t sector, blocks = 0;
+
+	for (sector = old_lo; sector < new_lo; ) {
+		bitmap_end_sync(bitmap, sector, &blocks, 0);
+		sector += blocks;
+	}
+	WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n");
+
+	for (sector = old_hi; sector < new_hi; ) {
+		bitmap_start_sync(bitmap, sector, &blocks, 0);
+		sector += blocks;
+	}
+	WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n");
+}
+EXPORT_SYMBOL(bitmap_sync_with_cluster);
+
 static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
 {
 	/* For each chunk covered by any of these sectors, set the
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 5e3fcd6ecf77..5b6dd63dda91 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -258,6 +258,9 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
 void bitmap_close_sync(struct bitmap *bitmap);
 void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
+void bitmap_sync_with_cluster(struct mddev *mddev,
+			      sector_t old_lo, sector_t old_hi,
+			      sector_t new_lo, sector_t new_hi);
 
 void bitmap_unplug(struct bitmap *bitmap);
 void bitmap_daemon_work(struct mddev *mddev);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 30f1160142c1..a55b5f4d0dbe 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -85,6 +85,9 @@ struct md_cluster_info {
 	struct completion newdisk_completion;
 	wait_queue_head_t wait;
 	unsigned long state;
+	/* record the region in RESYNCING message */
+	sector_t sync_low;
+	sector_t sync_hi;
 };
 
 enum msg_type {
@@ -411,6 +414,30 @@ static void process_suspend_info(struct mddev *mddev,
 		md_wakeup_thread(mddev->thread);
 		return;
 	}
+
+	/*
+	 * The bitmaps are not same for different nodes
+	 * if RESYNCING is happening in one node, then
+	 * the node which received the RESYNCING message
+	 * probably will perform resync with the region
+	 * [lo, hi] again, so we could reduce resync time
+	 * a lot if we can ensure that the bitmaps among
+	 * different nodes are match up well.
+	 *
+	 * sync_low/hi is used to record the region which
+	 * arrived in the previous RESYNCING message,
+	 *
+	 * Call bitmap_sync_with_cluster to clear
+	 * NEEDED_MASK and set RESYNC_MASK since
+	 * resync thread is running in another node,
+	 * so we don't need to do the resync again
+	 * with the same section */
+	bitmap_sync_with_cluster(mddev, cinfo->sync_low,
+					cinfo->sync_hi,
+					lo, hi);
+	cinfo->sync_low = lo;
+	cinfo->sync_hi = hi;
+
 	s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
 	if (!s)
 		return;

From 7f86ffed9b5f3f0b1a29108c5e965942b935ec98 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:50:13 -0400
Subject: [PATCH 10/19] md-cluster/bitmap: fix wrong calcuation of offset

The offset is wrong in bitmap_storage_alloc, we should
set it like below in bitmap_init_from_disk().

node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));

Because 'offset' is only assigned to 'page->index' and
that is usually over-written by read_sb_page. So it does
not cause problem in general, but it still need to be fixed.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ac93d874578a..cf93bb80aff7 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -759,7 +759,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		bytes += sizeof(bitmap_super_t);
 
 	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
-	offset = slot_number * (num_pages - 1);
+	offset = slot_number * num_pages;
 
 	store->filemap = kmalloc(sizeof(struct page *)
 				 * num_pages, GFP_KERNEL);

From 23cea66a37c76dc6554b862b179a654db24fa83d Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:50:14 -0400
Subject: [PATCH 11/19] md-cluster/bitmap: fix wrong page num in
 bitmap_file_clear_bit and bitmap_file_set_bit

The pnum passed to set_page_attr and test_page_attr should from
0 to storage.file_pages - 1, but bitmap_file_set_bit and
bitmap_file_clear_bit call set_page_attr and test_page_attr with
page->index parameter while page->index has already added node_offset
before.

So we need to minus node_offset in both bitmap_file_clear_bit
and bitmap_file_set_bit.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/bitmap.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cf93bb80aff7..de28c8095df8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -903,6 +903,11 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	struct page *page;
 	void *kaddr;
 	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	struct bitmap_storage *store = &bitmap->storage;
+	unsigned long node_offset = 0;
+
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * store->file_pages;
 
 	page = filemap_get_page(&bitmap->storage, chunk);
 	if (!page)
@@ -918,7 +923,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
 	kunmap_atomic(kaddr);
 	pr_debug("set file bit %lu page %lu\n", bit, page->index);
 	/* record page number so it gets flushed to disk when unplug occurs */
-	set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY);
+	set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY);
 }
 
 static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
@@ -927,6 +932,11 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	struct page *page;
 	void *paddr;
 	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	struct bitmap_storage *store = &bitmap->storage;
+	unsigned long node_offset = 0;
+
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * store->file_pages;
 
 	page = filemap_get_page(&bitmap->storage, chunk);
 	if (!page)
@@ -938,8 +948,8 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	else
 		clear_bit_le(bit, paddr);
 	kunmap_atomic(paddr);
-	if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
-		set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
+	if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
+		set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING);
 		bitmap->allclean = 0;
 	}
 }

From c84400c89f0f98ae4a73ed89886239b67d1dcd31 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 2 May 2016 11:50:15 -0400
Subject: [PATCH 12/19] md-cluster/bitmap: unplug bitmap to sync dirty pages to
 disk

This patch is doing two distinct but related things.

1. It adds bitmap_unplug() for the main bitmap (mddev->bitmap).  As bit
have been set, BITMAP_PAGE_DIRTY is set so bitmap_deamon_work() will
not write those pages out in its regular scans, only bitmap_unplug()
will.  If there are no writes to the array, bitmap_unplug() won't be
called, so we need to call it explicitly here.

2. bitmap_write_all() is a bit of a confusing interface as it doesn't
actually write anything.  The current code for writing "bitmap" works
but this change makes it a bit clearer.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/bitmap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index de28c8095df8..4a05bacd6601 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1924,14 +1924,14 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 
 	if (clear_bits) {
 		bitmap_update_sb(bitmap);
-		/* Setting this for the ev_page should be enough.
-		 * And we do not require both write_all and PAGE_DIRT either
-		 */
+		/* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs
+		 * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */
 		for (i = 0; i < bitmap->storage.file_pages; i++)
-			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
-		bitmap_write_all(bitmap);
+			if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING))
+				set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
 		bitmap_unplug(bitmap);
 	}
+	bitmap_unplug(mddev->bitmap);
 	*low = lo;
 	*high = hi;
 err:

From bc47e84258be3e49c14be65a111f2117ecc986f6 Mon Sep 17 00:00:00 2001
From: kbuild test robot <lkp@intel.com>
Date: Mon, 2 May 2016 11:50:16 -0400
Subject: [PATCH 13/19] md-cluster: fix ifnullfree.cocci warnings

drivers/md/bitmap.c:2049:6-11: WARNING: NULL check before freeing functions like kfree, debugfs_remove, debugfs_remove_recursive or usb_free_urb is not needed. Maybe consider reorganizing relevant code to avoid passing NULL values.

 NULL check before some freeing functions is not needed.

 Based on checkpatch warning
 "kfree(NULL) is safe this check is probably not required"
 and kfreeaddr.cocci by Julia Lawall.

Generated by: scripts/coccinelle/free/ifnullfree.cocci

Acked-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/bitmap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 4a05bacd6601..ad5a85847004 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -2076,8 +2076,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 
 				/* deallocate the page memory */
 				for (k = 0; k < page; k++) {
-					if (new_bp[k].map)
-						kfree(new_bp[k].map);
+					kfree(new_bp[k].map);
 				}
 
 				/* restore some fields from old_counts */

From 092398dce8c2406bfb0c9eebc3e764ff2ddb62a8 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Tue, 3 May 2016 19:43:57 +0200
Subject: [PATCH 14/19] md: md.c: fix oops in mddev_suspend for raid0

Introduced by upstream commit 70d9798b95562abac005d4ba71d28820f9a201eb

The raid0 personality does not create mddev->thread as oposed to
other personalities leading to its unconditional access in
mddev_suspend() causing an oops.

Patch checks for mddev->thread in order to keep the
intention of aforementioned commit.

Fixes: 70d9798b9556 ("MD: warn for potential deadlock")
Cc: stable@vger.kernel.org (4.5+)
Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 06f6e81f1516..23c6d732a374 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -307,7 +307,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
  */
 void mddev_suspend(struct mddev *mddev)
 {
-	WARN_ON_ONCE(current == mddev->thread->tsk);
+	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
 	if (mddev->suspended++)
 		return;
 	synchronize_rcu();

From 859644f0fa2141b5f7db0e3744999ec3bb5719d3 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Tue, 3 May 2016 19:43:24 +0200
Subject: [PATCH 15/19] md: raid10: add prerequisite to run underneath dm-raid

In case md runs underneath the dm-raid target, the mddev does not have
a request queue or gendisk, thus avoid accesses to it.

This patch adds two missing conditionals to the raid10 personality.

Signed-of-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid10.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e3fd725d5c4d..84e24e648165 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3782,8 +3782,10 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
 			return ret;
 	}
 	md_set_array_sectors(mddev, size);
-	set_capacity(mddev->gendisk, mddev->array_sectors);
-	revalidate_disk(mddev->gendisk);
+	if (mddev->queue) {
+		set_capacity(mddev->gendisk, mddev->array_sectors);
+		revalidate_disk(mddev->gendisk);
+	}
 	if (sectors > mddev->dev_sectors &&
 	    mddev->recovery_cp > oldsize) {
 		mddev->recovery_cp = oldsize;
@@ -4593,8 +4595,10 @@ static void raid10_finish_reshape(struct mddev *mddev)
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		}
 		mddev->resync_max_sectors = size;
-		set_capacity(mddev->gendisk, mddev->array_sectors);
-		revalidate_disk(mddev->gendisk);
+		if (mddev->queue) {
+			set_capacity(mddev->gendisk, mddev->array_sectors);
+			revalidate_disk(mddev->gendisk);
+		}
 	} else {
 		int d;
 		for (d = conf->geo.raid_disks ;

From fe67d19a2d7b31f1c29efbe1819c921d4a9bb012 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Tue, 3 May 2016 22:15:31 +0200
Subject: [PATCH 16/19] md: raid5: add prerequisite to run underneath dm-raid

In case md runs underneath the dm-raid target, the mddev does not have
a request queue or gendisk, thus avoid accesses.

This patch adds a missing conditional to the raid5 personality.

Signed-of-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e48c262ce032..4d31b235a888 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7572,8 +7572,10 @@ static void raid5_finish_reshape(struct mddev *mddev)
 
 		if (mddev->delta_disks > 0) {
 			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
-			set_capacity(mddev->gendisk, mddev->array_sectors);
-			revalidate_disk(mddev->gendisk);
+			if (mddev->queue) {
+				set_capacity(mddev->gendisk, mddev->array_sectors);
+				revalidate_disk(mddev->gendisk);
+			}
 		} else {
 			int d;
 			spin_lock_irq(&conf->device_lock);

From 85ad1d13ee9b3db00615ea24b031c15e5ba14fd1 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Tue, 3 May 2016 22:22:13 -0400
Subject: [PATCH 17/19] md: set MD_CHANGE_PENDING in a atomic region

Some code waits for a metadata update by:

1. flagging that it is needed (MD_CHANGE_DEVS or MD_CHANGE_CLEAN)
2. setting MD_CHANGE_PENDING and waking the management thread
3. waiting for MD_CHANGE_PENDING to be cleared

If the first two are done without locking, the code in md_update_sb()
which checks if it needs to repeat might test if an update is needed
before step 1, then clear MD_CHANGE_PENDING after step 2, resulting
in the wait returning early.

So make sure all places that set MD_CHANGE_PENDING are atomicial, and
bit_clear_unless (suggested by Neil) is introduced for the purpose.

Cc: Martin Kepplinger <martink@posteo.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: <linux-kernel@vger.kernel.org>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c          | 27 ++++++++++++++-------------
 drivers/md/raid1.c       |  4 ++--
 drivers/md/raid10.c      |  8 ++++----
 drivers/md/raid5-cache.c |  4 ++--
 drivers/md/raid5.c       |  4 ++--
 include/linux/bitops.h   | 16 ++++++++++++++++
 6 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 23c6d732a374..a79462dcd5e1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2295,12 +2295,16 @@ repeat:
 	if (mddev_is_clustered(mddev)) {
 		if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
 			force_change = 1;
+		if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+			nospares = 1;
 		ret = md_cluster_ops->metadata_update_start(mddev);
 		/* Has someone else has updated the sb */
 		if (!does_sb_need_changing(mddev)) {
 			if (ret == 0)
 				md_cluster_ops->metadata_update_cancel(mddev);
-			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+			bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
+							 BIT(MD_CHANGE_DEVS) |
+							 BIT(MD_CHANGE_CLEAN));
 			return;
 		}
 	}
@@ -2434,15 +2438,11 @@ repeat:
 	if (mddev_is_clustered(mddev) && ret == 0)
 		md_cluster_ops->metadata_update_finish(mddev);
 
-	spin_lock(&mddev->lock);
 	if (mddev->in_sync != sync_req ||
-	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+	    !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
+			       BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN)))
 		/* have to write it out again */
-		spin_unlock(&mddev->lock);
 		goto repeat;
-	}
-	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
-	spin_unlock(&mddev->lock);
 	wake_up(&mddev->sb_wait);
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -8147,18 +8147,18 @@ void md_do_sync(struct md_thread *thread)
 		}
 	}
  skip:
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-
 	if (mddev_is_clustered(mddev) &&
 	    ret == 0) {
 		/* set CHANGE_PENDING here since maybe another
 		 * update is needed, so other nodes are informed */
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS));
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 		md_cluster_ops->resync_finish(mddev);
-	}
+	} else
+		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
 	spin_lock(&mddev->lock);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -8550,6 +8550,7 @@ EXPORT_SYMBOL(md_finish_reshape);
 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		       int is_new)
 {
+	struct mddev *mddev = rdev->mddev;
 	int rv;
 	if (is_new)
 		s += rdev->new_data_offset;
@@ -8559,8 +8560,8 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 	if (rv == 0) {
 		/* Make sure they get written out promptly */
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
-		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING));
 		md_wakeup_thread(rdev->mddev->thread);
 		return 1;
 	} else
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a7f2b9c9f8a0..c7c8cde0ab21 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1474,8 +1474,8 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
 	 * if recovery is running, make sure it aborts.
 	 */
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_CHANGE_PENDING, &mddev->flags);
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 	printk(KERN_ALERT
 	       "md/raid1:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid1:%s: Operation continuing on %d devices.\n",
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84e24e648165..c7de2a53e625 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1102,8 +1102,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 		bio->bi_iter.bi_sector < conf->reshape_progress))) {
 		/* Need to update reshape_position in metadata */
 		mddev->reshape_position = conf->reshape_progress;
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
@@ -1591,8 +1591,8 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_CHANGE_PENDING, &mddev->flags);
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 9531f5f05b93..ac51bc5ecb16 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -712,8 +712,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	 * in_teardown check workaround this issue.
 	 */
 	if (!log->in_teardown) {
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_mask_bits(&mddev->flags, 0,
+			      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4d31b235a888..8959e6dd31dd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2514,8 +2514,8 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	set_bit(MD_CHANGE_PENDING, &mddev->flags);
+	set_mask_bits(&mddev->flags, 0,
+		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
 	printk(KERN_ALERT
 	       "md/raid:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid:%s: Operation continuing on %d devices.\n",
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index defeaac0745f..299e76b59fe9 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -227,6 +227,22 @@ static inline unsigned long __ffs64(u64 word)
 })
 #endif
 
+#ifndef bit_clear_unless
+#define bit_clear_unless(ptr, _clear, _test)	\
+({								\
+	const typeof(*ptr) clear = (_clear), test = (_test);	\
+	typeof(*ptr) old, new;					\
+								\
+	do {							\
+		old = ACCESS_ONCE(*ptr);			\
+		new = old & ~clear;				\
+	} while (!(old & test) &&				\
+		 cmpxchg(ptr, old, new) != old);		\
+								\
+	!(old & test);						\
+})
+#endif
+
 #ifndef find_last_bit
 /**
  * find_last_bit - find the last set bit in a memory region

From 51e453aecb267b6a99b1d2853bccd5bba7340236 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Wed, 4 May 2016 02:17:09 -0400
Subject: [PATCH 18/19] md-cluster: gather resync infos and enable recv_thread
 after bitmap is ready

The in-memory bitmap is not ready when node joins cluster,
so it doesn't make sense to make gather_all_resync_info()
called so earlier, we need to call it after the node's
bitmap is setup. Also, recv_thread could be wake up after
node joins cluster, but it could cause problem if node
receives RESYNCING message without persionality since
mddev->pers->quiesce is called in process_suspend_info.

This commit introduces a new cluster interface load_bitmaps
to fix above problems, load_bitmaps is called in bitmap_load
where bitmap and persionality are ready, and load_bitmaps
does the following tasks:

1. call gather_all_resync_info to load all the node's
   bitmap info.
2. set MD_CLUSTER_ALREADY_IN_CLUSTER bit to recv_thread
   could be wake up, and wake up recv_thread if there is
   pending recv event.

Then ack_bast only wakes up recv_thread after IN_CLUSTER
bit is ready otherwise MD_CLUSTER_PENDING_RESYNC_EVENT is
set.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/bitmap.c     |  3 +++
 drivers/md/md-cluster.c | 30 ++++++++++++++++++++++++------
 drivers/md/md-cluster.h |  1 +
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ad5a85847004..d8129ec93ebd 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1848,6 +1848,9 @@ int bitmap_load(struct mddev *mddev)
 	if (!bitmap)
 		goto out;
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
+
 	/* Clear out old bitmap info first:  Either there is none, or we
 	 * are resuming after someone else has possibly changed things,
 	 * so we should forget old cached info.
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index a55b5f4d0dbe..bef6a47b443f 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -61,6 +61,10 @@ struct resync_info {
  * the lock.
  */
 #define		MD_CLUSTER_SEND_LOCKED_ALREADY		5
+/* We should receive message after node joined cluster and
+ * set up all the related infos such as bitmap and personality */
+#define		MD_CLUSTER_ALREADY_IN_CLUSTER		6
+#define		MD_CLUSTER_PENDING_RECV_EVENT		7
 
 
 struct md_cluster_info {
@@ -376,8 +380,12 @@ static void ack_bast(void *arg, int mode)
 	struct dlm_lock_resource *res = arg;
 	struct md_cluster_info *cinfo = res->mddev->cluster_info;
 
-	if (mode == DLM_LOCK_EX)
-		md_wakeup_thread(cinfo->recv_thread);
+	if (mode == DLM_LOCK_EX) {
+		if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state))
+			md_wakeup_thread(cinfo->recv_thread);
+		else
+			set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state);
+	}
 }
 
 static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
@@ -846,10 +854,6 @@ static int join(struct mddev *mddev, int nodes)
 	if (!cinfo->resync_lockres)
 		goto err;
 
-	ret = gather_all_resync_info(mddev, nodes);
-	if (ret)
-		goto err;
-
 	return 0;
 err:
 	md_unregister_thread(&cinfo->recovery_thread);
@@ -867,6 +871,19 @@ err:
 	return ret;
 }
 
+static void load_bitmaps(struct mddev *mddev, int total_slots)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	/* load all the node's bitmap info for resync */
+	if (gather_all_resync_info(mddev, total_slots))
+		pr_err("md-cluster: failed to gather all resyn infos\n");
+	set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state);
+	/* wake up recv thread in case something need to be handled */
+	if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state))
+		md_wakeup_thread(cinfo->recv_thread);
+}
+
 static void resync_bitmap(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1208,6 +1225,7 @@ static struct md_cluster_operations cluster_ops = {
 	.add_new_disk_cancel = add_new_disk_cancel,
 	.new_disk_ack = new_disk_ack,
 	.remove_disk = remove_disk,
+	.load_bitmaps = load_bitmaps,
 	.gather_bitmaps = gather_bitmaps,
 	.lock_all_bitmaps = lock_all_bitmaps,
 	.unlock_all_bitmaps = unlock_all_bitmaps,
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 45ce6c97d8bd..e765499ba591 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -23,6 +23,7 @@ struct md_cluster_operations {
 	void (*add_new_disk_cancel)(struct mddev *mddev);
 	int (*new_disk_ack)(struct mddev *mddev, bool ack);
 	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
+	void (*load_bitmaps)(struct mddev *mddev, int total_slots);
 	int (*gather_bitmaps)(struct md_rdev *rdev);
 	int (*lock_all_bitmaps)(struct mddev *mddev);
 	void (*unlock_all_bitmaps)(struct mddev *mddev);

From 1fa9a1ad0a9db3c745fe0c1bfa73fd87901fd7f3 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Tue, 3 May 2016 22:22:15 -0400
Subject: [PATCH 19/19] md-cluster: check the return value of process_recvd_msg

We don't need to run the full path of recv_daemon
if process_recvd_msg doesn't return 0.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index bef6a47b443f..41573f1f626f 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -520,11 +520,13 @@ static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
 			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
 }
 
-static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
+static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 {
+	int ret = 0;
+
 	if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
 		"node %d received it's own msg\n", le32_to_cpu(msg->slot)))
-		return;
+		return -1;
 	switch (le32_to_cpu(msg->type)) {
 	case METADATA_UPDATED:
 		process_metadata_update(mddev, msg);
@@ -547,9 +549,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 		__recover_slot(mddev, le32_to_cpu(msg->slot));
 		break;
 	default:
+		ret = -1;
 		pr_warn("%s:%d Received unknown message from %d\n",
 			__func__, __LINE__, msg->slot);
 	}
+	return ret;
 }
 
 /*
@@ -573,7 +577,9 @@ static void recv_daemon(struct md_thread *thread)
 
 	/* read lvb and wake up thread to process this message_lockres */
 	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
-	process_recvd_msg(thread->mddev, &msg);
+	ret = process_recvd_msg(thread->mddev, &msg);
+	if (ret)
+		goto out;
 
 	/*release CR on ack_lockres*/
 	ret = dlm_unlock_sync(ack_lockres);
@@ -587,6 +593,7 @@ static void recv_daemon(struct md_thread *thread)
 	ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
 	if (unlikely(ret != 0))
 		pr_info("lock CR on ack failed return %d\n", ret);
+out:
 	/*release CR on message_lockres*/
 	ret = dlm_unlock_sync(message_lockres);
 	if (unlikely(ret != 0))