From e566aef12a166732b7fd85897f8736ccf4fc7814 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 12 Aug 2016 13:42:34 +0800
Subject: [PATCH 01/20] md-cluster: call md_kick_rdev_from_array once ack
 failed

The new_disk_ack could return failure if WAITING_FOR_NEWDISK
is not set, so we need to kick the dev from array in case
failure happened.

And we missed to check err before call new_disk_ack othwise
we could kick a rdev which isn't in array, thanks for the
reminder from Shaohua.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 915e84d631a2..7eaf5496c8d9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6101,9 +6101,14 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 			export_rdev(rdev);
 
 		if (mddev_is_clustered(mddev)) {
-			if (info->state & (1 << MD_DISK_CANDIDATE))
-				md_cluster_ops->new_disk_ack(mddev, (err == 0));
-			else {
+			if (info->state & (1 << MD_DISK_CANDIDATE)) {
+				if (!err) {
+					err = md_cluster_ops->new_disk_ack(mddev,
+						err == 0);
+					if (err)
+						md_kick_rdev_from_array(rdev);
+				}
+			} else {
 				if (err)
 					md_cluster_ops->add_new_disk_cancel(mddev);
 				else

From 400cb454a4205ec1d7311bc3dd8104859c26ba46 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 12 Aug 2016 13:42:35 +0800
Subject: [PATCH 02/20] md-cluster: use FORCEUNLOCK in lockres_free

For dlm_unlock, we need to pass flag to dlm_unlock as the
third parameter instead of set res->flags.

Also, DLM_LKF_FORCEUNLOCK is more suitable for dlm_unlock
since it works even the lock is on waiting or convert queue.

Acked-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 34a840d9df76..ccd756fdc00e 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -194,25 +194,21 @@ out_err:
 
 static void lockres_free(struct dlm_lock_resource *res)
 {
-	int ret;
+	int ret = 0;
 
 	if (!res)
 		return;
 
-	/* cancel a lock request or a conversion request that is blocked */
-	res->flags |= DLM_LKF_CANCEL;
-retry:
-	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
-	if (unlikely(ret != 0)) {
-		pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
-
-		/* if a lock conversion is cancelled, then the lock is put
-		 * back to grant queue, need to ensure it is unlocked */
-		if (ret == -DLM_ECANCEL)
-			goto retry;
-	}
-	res->flags &= ~DLM_LKF_CANCEL;
-	wait_for_completion(&res->completion);
+	/*
+	 * use FORCEUNLOCK flag, so we can unlock even the lock is on the
+	 * waiting or convert queue
+	 */
+	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK,
+		&res->lksb, res);
+	if (unlikely(ret != 0))
+		pr_err("failed to unlock %s return %d\n", res->name, ret);
+	else
+		wait_for_completion(&res->completion);
 
 	kfree(res->name);
 	kfree(res->lksb.sb_lvbptr);

From e3f924d3dfc672dd1292bc7eb6f2a305c13981ec Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 12 Aug 2016 13:42:36 +0800
Subject: [PATCH 03/20] md-cluster: remove some unnecessary dlm_unlock_sync

Since DLM_LKF_FORCEUNLOCK is used in lockres_free,
we don't need to call dlm_unlock_sync before free
lock resource.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index ccd756fdc00e..67a735840639 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -284,7 +284,7 @@ static void recover_bitmaps(struct md_thread *thread)
 		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
 		if (ret) {
 			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
-			goto dlm_unlock;
+			goto clear_bit;
 		}
 		if (hi > 0) {
 			if (lo < mddev->recovery_cp)
@@ -296,8 +296,6 @@ static void recover_bitmaps(struct md_thread *thread)
 			    md_wakeup_thread(mddev->thread);
 			}
 		}
-dlm_unlock:
-		dlm_unlock_sync(bm_lockres);
 clear_bit:
 		lockres_free(bm_lockres);
 		clear_bit(slot, &cinfo->recovery_map);
@@ -766,7 +764,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 			md_check_recovery(mddev);
 		}
 
-		dlm_unlock_sync(bm_lockres);
 		lockres_free(bm_lockres);
 	}
 out:
@@ -1182,7 +1179,6 @@ static void unlock_all_bitmaps(struct mddev *mddev)
 	if (cinfo->other_bitmap_lockres) {
 		for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) {
 			if (cinfo->other_bitmap_lockres[i]) {
-				dlm_unlock_sync(cinfo->other_bitmap_lockres[i]);
 				lockres_free(cinfo->other_bitmap_lockres[i]);
 			}
 		}

From af8d8e6f031589ccf32b08eea91def53db8cfa95 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 12 Aug 2016 13:42:37 +0800
Subject: [PATCH 04/20] md: changes for MD_STILL_CLOSED flag

When stop clustered raid while it is pending on resync,
MD_STILL_CLOSED flag could be cleared since udev rule
is triggered to open the mddev. So obviously array can't
be stopped soon and returns EBUSY.

	mdadm -Ss          md-raid-arrays.rules
  set MD_STILL_CLOSED          md_open()
	... ... ...          clear MD_STILL_CLOSED
	do_md_stop

We make below changes to resolve this issue:

1. rename MD_STILL_CLOSED to MD_CLOSING since it is set
   when stop array and it means we are stopping array.
2. let md_open returns early if CLOSING is set, so no
   other threads will open array if one thread is trying
   to close it.
3. no need to clear CLOSING bit in md_open because 1 has
   ensure the bit is cleared, then we also don't need to
   test CLOSING bit in do_md_stop.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 14 ++++++++------
 drivers/md/md.h |  5 ++---
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7eaf5496c8d9..b6ad04b58766 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5573,8 +5573,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 	mutex_lock(&mddev->open_mutex);
 	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
 	    mddev->sync_thread ||
-	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
+	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		if (did_freeze) {
 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -5636,8 +5635,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
 	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
 	    mddev->sysfs_active ||
 	    mddev->sync_thread ||
-	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
+	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
 		printk("md: %s still in use.\n",mdname(mddev));
 		mutex_unlock(&mddev->open_mutex);
 		if (did_freeze) {
@@ -6826,7 +6824,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			err = -EBUSY;
 			goto out;
 		}
-		set_bit(MD_STILL_CLOSED, &mddev->flags);
+		set_bit(MD_CLOSING, &mddev->flags);
 		mutex_unlock(&mddev->open_mutex);
 		sync_blockdev(bdev);
 	}
@@ -7075,9 +7073,13 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
 		goto out;
 
+	if (test_bit(MD_CLOSING, &mddev->flags)) {
+		mutex_unlock(&mddev->open_mutex);
+		return -ENODEV;
+	}
+
 	err = 0;
 	atomic_inc(&mddev->openers);
-	clear_bit(MD_STILL_CLOSED, &mddev->flags);
 	mutex_unlock(&mddev->open_mutex);
 
 	check_disk_change(bdev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 20c667579ede..2b2041773e79 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -201,9 +201,8 @@ struct mddev {
 #define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
 #define MD_UPDATE_SB_FLAGS (1 | 2 | 4)	/* If these are set, md_update_sb needed */
 #define MD_ARRAY_FIRST_USE 3    /* First use of array, needs initialization */
-#define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
-				 * md_ioctl checked on it.
-				 */
+#define MD_CLOSING	4	/* If set, we are closing the array, do not open
+				 * it then */
 #define MD_JOURNAL_CLEAN 5	/* A raid with journal is already clean */
 #define MD_HAS_JOURNAL	6	/* The raid array has journal feature set */
 #define MD_RELOAD_SB	7	/* Reload the superblock because another node

From c20c33f0e2abdb8bab1ec755ed668d7894bf9336 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 12 Aug 2016 13:42:38 +0800
Subject: [PATCH 05/20] md-cluster: clean related infos of cluster

cluster_info and bitmap_info.nodes also need to be
cleared when array is stopped.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b6ad04b58766..cd6797b3cdf7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5454,12 +5454,14 @@ static void md_clean(struct mddev *mddev)
 	mddev->degraded = 0;
 	mddev->safemode = 0;
 	mddev->private = NULL;
+	mddev->cluster_info = NULL;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
 	mddev->bitmap_info.default_space = 0;
 	mddev->bitmap_info.chunksize = 0;
 	mddev->bitmap_info.daemon_sleep = 0;
 	mddev->bitmap_info.max_write_behind = 0;
+	mddev->bitmap_info.nodes = 0;
 }
 
 static void __md_stop_writes(struct mddev *mddev)

From 5f0aa21da6cc620b08e5f69f51db29cb1f722174 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 12 Aug 2016 13:42:39 +0800
Subject: [PATCH 06/20] md-cluster: protect md_find_rdev_nr_rcu with rcu lock

We need to use rcu_read_lock/unlock to avoid potential
race.

Reported-by: Shaohua Li <shli@fb.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 67a735840639..b4dc211923c7 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -489,9 +489,10 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg
 
 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 {
-	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
-						   le32_to_cpu(msg->raid_slot));
+	struct md_rdev *rdev;
 
+	rcu_read_lock();
+	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
 	if (rdev) {
 		set_bit(ClusterRemove, &rdev->flags);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -500,18 +501,21 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 	else
 		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
 			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
+	rcu_read_unlock();
 }
 
 static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
 {
-	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
-						   le32_to_cpu(msg->raid_slot));
+	struct md_rdev *rdev;
 
+	rcu_read_lock();
+	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
 	if (rdev && test_bit(Faulty, &rdev->flags))
 		clear_bit(Faulty, &rdev->flags);
 	else
 		pr_warn("%s: %d Could not find disk(%d) which is faulty",
 			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
+	rcu_read_unlock();
 }
 
 static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)

From fccb60a42cdd863aa80f32214ae58ae13936c927 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 12 Aug 2016 13:42:41 +0800
Subject: [PATCH 07/20] md-cluster: convert the completion to wait queue

Previously, we used completion to sync between require dlm lock
and sync_ast, however we will have to expose completion.wait
and completion.done in dlm_lock_sync_interruptible (introduced
later), it is not a common usage for completion, so convert
related things to wait queue.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index b4dc211923c7..c94715159b9b 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -25,7 +25,8 @@ struct dlm_lock_resource {
 	struct dlm_lksb lksb;
 	char *name; /* lock name. */
 	uint32_t flags; /* flags to pass to dlm_lock() */
-	struct completion completion; /* completion for synchronized locking */
+	wait_queue_head_t sync_locking; /* wait queue for synchronized locking */
+	bool sync_locking_done;
 	void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
 	struct mddev *mddev; /* pointing back to mddev. */
 	int mode;
@@ -118,7 +119,8 @@ static void sync_ast(void *arg)
 	struct dlm_lock_resource *res;
 
 	res = arg;
-	complete(&res->completion);
+	res->sync_locking_done = true;
+	wake_up(&res->sync_locking);
 }
 
 static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
@@ -130,7 +132,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
 			0, sync_ast, res, res->bast);
 	if (ret)
 		return ret;
-	wait_for_completion(&res->completion);
+	wait_event(res->sync_locking, res->sync_locking_done);
+	res->sync_locking_done = false;
 	if (res->lksb.sb_status == 0)
 		res->mode = mode;
 	return res->lksb.sb_status;
@@ -151,7 +154,8 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
 	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
 	if (!res)
 		return NULL;
-	init_completion(&res->completion);
+	init_waitqueue_head(&res->sync_locking);
+	res->sync_locking_done = false;
 	res->ls = cinfo->lockspace;
 	res->mddev = mddev;
 	res->mode = DLM_LOCK_IV;
@@ -208,7 +212,7 @@ static void lockres_free(struct dlm_lock_resource *res)
 	if (unlikely(ret != 0))
 		pr_err("failed to unlock %s return %d\n", res->name, ret);
 	else
-		wait_for_completion(&res->completion);
+		wait_event(res->sync_locking, res->sync_locking_done);
 
 	kfree(res->name);
 	kfree(res->lksb.sb_lvbptr);

From 7bcda7149dd6911bee15a51b22477f592f8b9620 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 12 Aug 2016 13:42:42 +0800
Subject: [PATCH 08/20] md-cluster: introduce dlm_lock_sync_interruptible to
 fix tasks hang

When some node leaves cluster, then it's bitmap need to be
synced by another node, so "md*_recover" thread is triggered
for the purpose. However, with below steps. we can find tasks
hang happened either in B or C.

1. Node A create a resyncing cluster raid1, assemble it in
   other two nodes (B and C).
2. stop array in B and C.
3. stop array in A.

linux44:~ # ps aux|grep md|grep D
root	5938	0.0  0.1  19852  1964 pts/0    D+   14:52   0:00 mdadm -S md0
root	5939	0.0  0.0      0     0 ?        D    14:52   0:00 [md0_recover]

linux44:~ # cat /proc/5939/stack
[<ffffffffa04cf321>] dlm_lock_sync+0x71/0x90 [md_cluster]
[<ffffffffa04d0705>] recover_bitmaps+0x125/0x220 [md_cluster]
[<ffffffffa052105d>] md_thread+0x16d/0x180 [md_mod]
[<ffffffff8107ad94>] kthread+0xb4/0xc0
[<ffffffff8152a518>] ret_from_fork+0x58/0x90

linux44:~ # cat /proc/5938/stack
[<ffffffff8107afde>] kthread_stop+0x6e/0x120
[<ffffffffa0519da0>] md_unregister_thread+0x40/0x80 [md_mod]
[<ffffffffa04cfd20>] leave+0x70/0x120 [md_cluster]
[<ffffffffa0525e24>] md_cluster_stop+0x14/0x30 [md_mod]
[<ffffffffa05269ab>] bitmap_free+0x14b/0x150 [md_mod]
[<ffffffffa0523f3b>] do_md_stop+0x35b/0x5a0 [md_mod]
[<ffffffffa0524e83>] md_ioctl+0x873/0x1590 [md_mod]
[<ffffffff81288464>] blkdev_ioctl+0x214/0x7d0
[<ffffffff811dd3dd>] block_ioctl+0x3d/0x40
[<ffffffff811b92d4>] do_vfs_ioctl+0x2d4/0x4b0
[<ffffffff811b9538>] SyS_ioctl+0x88/0xa0
[<ffffffff8152a5c9>] system_call_fastpath+0x16/0x1b

The problem is caused by recover_bitmaps can't reliably abort
when the thread is unregistered. So dlm_lock_sync_interruptible
is introduced to detect the thread's situation to fix the problem.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index c94715159b9b..43b90485448d 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -10,6 +10,7 @@
 
 
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <linux/dlm.h>
 #include <linux/sched.h>
 #include <linux/raid/md_p.h>
@@ -144,6 +145,43 @@ static int dlm_unlock_sync(struct dlm_lock_resource *res)
 	return dlm_lock_sync(res, DLM_LOCK_NL);
 }
 
+/*
+ * An variation of dlm_lock_sync, which make lock request could
+ * be interrupted
+ */
+static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode,
+				       struct mddev *mddev)
+{
+	int ret = 0;
+
+	ret = dlm_lock(res->ls, mode, &res->lksb,
+			res->flags, res->name, strlen(res->name),
+			0, sync_ast, res, res->bast);
+	if (ret)
+		return ret;
+
+	wait_event(res->sync_locking, res->sync_locking_done
+				      || kthread_should_stop());
+	if (!res->sync_locking_done) {
+		/*
+		 * the convert queue contains the lock request when request is
+		 * interrupted, and sync_ast could still be run, so need to
+		 * cancel the request and reset completion
+		 */
+		ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL,
+			&res->lksb, res);
+		res->sync_locking_done = false;
+		if (unlikely(ret != 0))
+			pr_info("failed to cancel previous lock request "
+				 "%s return %d\n", res->name, ret);
+		return -EPERM;
+	} else
+		res->sync_locking_done = false;
+	if (res->lksb.sb_status == 0)
+		res->mode = mode;
+	return res->lksb.sb_status;
+}
+
 static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
 		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
 {
@@ -279,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread)
 			goto clear_bit;
 		}
 
-		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+		ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev);
 		if (ret) {
 			pr_err("md-cluster: Could not DLM lock %s: %d\n",
 					str, ret);

From d6385db94196b253ae5eb3678fa95cdf1f839fcc Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 12 Aug 2016 13:42:43 +0800
Subject: [PATCH 09/20] md-cluster: make resync lock also could be interruptted

When one node is perform resync or recovery, other nodes
can't get resync lock and could block for a while before
it holds the lock, so we can't stop array immediately for
this scenario.

To make array could be stop quickly, we check MD_CLOSING
in dlm_lock_sync_interruptible to make us can interrupt
the lock request.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md-cluster.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 43b90485448d..2b13117fb918 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -161,7 +161,8 @@ static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode,
 		return ret;
 
 	wait_event(res->sync_locking, res->sync_locking_done
-				      || kthread_should_stop());
+				      || kthread_should_stop()
+				      || test_bit(MD_CLOSING, &mddev->flags));
 	if (!res->sync_locking_done) {
 		/*
 		 * the convert queue contains the lock request when request is
@@ -1045,7 +1046,7 @@ static void metadata_update_cancel(struct mddev *mddev)
 static int resync_start(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
+	return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev);
 }
 
 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)

From e0a491c1296874a1aca51cc68452f12a4d950029 Mon Sep 17 00:00:00 2001
From: Gayatri Kammela <gayatri.kammela@intel.com>
Date: Fri, 12 Aug 2016 18:03:19 -0700
Subject: [PATCH 10/20] lib/raid6: Add AVX512 optimized gen_syndrome functions

Optimize RAID6 gen_syndrom functions to take advantage of
the 512-bit ZMM integer instructions introduced in AVX512.

AVX512 optimized gen_syndrom functions, which is simply based
on avx2.c written by Yuanhan Liu and sse2.c written by hpa.

The patch was tested and benchmarked before submission on
a hardware that has AVX512 flags to support such instructions

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 arch/x86/Makefile       |   5 +-
 include/linux/raid/pq.h |   3 +
 lib/raid6/Makefile      |   2 +-
 lib/raid6/algos.c       |   9 ++
 lib/raid6/avx512.c      | 294 ++++++++++++++++++++++++++++++++++++++++
 lib/raid6/x86.h         |  10 ++
 6 files changed, 320 insertions(+), 3 deletions(-)
 create mode 100644 lib/raid6/avx512.c

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 830ed391e7ef..2d449337a360 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -163,11 +163,12 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
 asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
 sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
 sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index a0118d5929a9..0c529a55b52e 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -102,6 +102,9 @@ extern const struct raid6_calls raid6_altivec8;
 extern const struct raid6_calls raid6_avx2x1;
 extern const struct raid6_calls raid6_avx2x2;
 extern const struct raid6_calls raid6_avx2x4;
+extern const struct raid6_calls raid6_avx512x1;
+extern const struct raid6_calls raid6_avx512x2;
+extern const struct raid6_calls raid6_avx512x4;
 extern const struct raid6_calls raid6_tilegx8;
 
 struct raid6_recov_calls {
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 3b10a48fa040..8948268d47b4 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o
 
-raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
+raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 975c6e0434bd..f5f090c52dd9 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -49,6 +49,10 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_avx2x1,
 	&raid6_avx2x2,
 #endif
+#ifdef CONFIG_AS_AVX512
+	&raid6_avx512x1,
+	&raid6_avx512x2,
+#endif
 #endif
 #if defined(__x86_64__) && !defined(__arch_um__)
 	&raid6_sse2x1,
@@ -59,6 +63,11 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_avx2x2,
 	&raid6_avx2x4,
 #endif
+#ifdef CONFIG_AS_AVX512
+	&raid6_avx512x1,
+	&raid6_avx512x2,
+	&raid6_avx512x4,
+#endif
 #endif
 #ifdef CONFIG_ALTIVEC
 	&raid6_altivec1,
diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c
new file mode 100644
index 000000000000..b1188a6e51a6
--- /dev/null
+++ b/lib/raid6/avx512.c
@@ -0,0 +1,294 @@
+/* -*- linux-c -*- --------------------------------------------------------
+ *
+ *   Copyright (C) 2016 Intel Corporation
+ *
+ *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ *   Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
+ *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * AVX512 implementation of RAID-6 syndrome functions
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static const struct raid6_avx512_constants {
+	u64 x1d[8];
+} raid6_avx512_constants __aligned(512) = {
+	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
+};
+
+static int raid6_have_avx512(void)
+{
+	return boot_cpu_has(X86_FEATURE_AVX2) &&
+		boot_cpu_has(X86_FEATURE_AVX) &&
+		boot_cpu_has(X86_FEATURE_AVX512F) &&
+		boot_cpu_has(X86_FEATURE_AVX512BW) &&
+		boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;         /* Highest data disk */
+	p = dptr[z0+1];         /* XOR parity */
+	q = dptr[z0+2];         /* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+		     :
+		     : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0; d < bytes; d += 64) {
+		asm volatile("prefetchnta %0\n\t"
+			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
+			     "prefetchnta %1\n\t"
+			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
+			     "vmovdqa64 %1,%%zmm6"
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
+		for (z = z0-2; z >= 0; z--) {
+			asm volatile("prefetchnta %0\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+				     "vmovdqa64 %0,%%zmm6"
+				     :
+				     : "m" (dptr[z][d]));
+		}
+		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+			     "vpmovm2b %%k1,%%zmm5\n\t"
+			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+			     "vmovntdq %%zmm2,%0\n\t"
+			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+			     "vmovntdq %%zmm4,%1\n\t"
+			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
+			     :
+			     : "m" (p[d]), "m" (q[d]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x1 = {
+	raid6_avx5121_gen_syndrome,
+	NULL,                   /* XOR not yet implemented */
+	raid6_have_avx512,
+	"avx512x1",
+	1                       /* Has cache hints */
+};
+
+/*
+ * Unrolled-by-2 AVX512 implementation
+ */
+static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;         /* Highest data disk */
+	p = dptr[z0+1];         /* XOR parity */
+	q = dptr[z0+2];         /* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+		     :
+		     : "m" (raid6_avx512_constants.x1d[0]));
+
+	/* We uniformly assume a single prefetch covers at least 64 bytes */
+	for (d = 0; d < bytes; d += 128) {
+		asm volatile("prefetchnta %0\n\t"
+			     "prefetchnta %1\n\t"
+			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
+			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
+			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
+			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
+		for (z = z0-1; z >= 0; z--) {
+			asm volatile("prefetchnta %0\n\t"
+				     "prefetchnta %1\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vmovdqa64 %1,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+				     :
+				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+		}
+		asm volatile("vmovntdq %%zmm2,%0\n\t"
+			     "vmovntdq %%zmm3,%1\n\t"
+			     "vmovntdq %%zmm4,%2\n\t"
+			     "vmovntdq %%zmm6,%3"
+			     :
+			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
+			       "m" (q[d+64]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x2 = {
+	raid6_avx5122_gen_syndrome,
+	NULL,                   /* XOR not yet implemented */
+	raid6_have_avx512,
+	"avx512x2",
+	1                       /* Has cache hints */
+};
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Unrolled-by-4 AVX2 implementation
+ */
+static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;         /* Highest data disk */
+	p = dptr[z0+1];         /* XOR parity */
+	q = dptr[z0+2];         /* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
+		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
+		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
+		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
+		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
+		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
+		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
+		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
+		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
+		     :
+		     : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0; d < bytes; d += 256) {
+		for (z = z0; z >= 0; z--) {
+		asm volatile("prefetchnta %0\n\t"
+			     "prefetchnta %1\n\t"
+			     "prefetchnta %2\n\t"
+			     "prefetchnta %3\n\t"
+			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
+			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
+			     "vpmovm2b %%k1,%%zmm5\n\t"
+			     "vpmovm2b %%k2,%%zmm7\n\t"
+			     "vpmovm2b %%k3,%%zmm13\n\t"
+			     "vpmovm2b %%k4,%%zmm15\n\t"
+			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+			     "vmovdqa64 %0,%%zmm5\n\t"
+			     "vmovdqa64 %1,%%zmm7\n\t"
+			     "vmovdqa64 %2,%%zmm13\n\t"
+			     "vmovdqa64 %3,%%zmm15\n\t"
+			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
+			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+			     :
+			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
+		}
+		asm volatile("vmovntdq %%zmm2,%0\n\t"
+			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+			     "vmovntdq %%zmm3,%1\n\t"
+			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
+			     "vmovntdq %%zmm10,%2\n\t"
+			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
+			     "vmovntdq %%zmm11,%3\n\t"
+			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
+			     "vmovntdq %%zmm4,%4\n\t"
+			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
+			     "vmovntdq %%zmm6,%5\n\t"
+			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
+			     "vmovntdq %%zmm12,%6\n\t"
+			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
+			     "vmovntdq %%zmm14,%7\n\t"
+			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
+			     :
+			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+			       "m" (q[d+128]), "m" (q[d+192]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x4 = {
+	raid6_avx5124_gen_syndrome,
+	NULL,                   /* XOR not yet implemented */
+	raid6_have_avx512,
+	"avx512x4",
+	1                       /* Has cache hints */
+};
+#endif
+
+#endif /* CONFIG_AS_AVX512 */
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index 8fe9d9662abb..834d268a4b05 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -46,6 +46,16 @@ static inline void kernel_fpu_end(void)
 #define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
 #define X86_FEATURE_AVX	(4*32+28) /* Advanced Vector Extensions */
 #define X86_FEATURE_AVX2        (9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_AVX512F     (9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ    (9*32+17) /* AVX-512 DQ (Double/Quad granular)
+					   * Instructions
+					   */
+#define X86_FEATURE_AVX512BW    (9*32+30) /* AVX-512 BW (Byte/Word granular)
+					   * Instructions
+					   */
+#define X86_FEATURE_AVX512VL    (9*32+31) /* AVX-512 VL (128/256 Vector Length)
+					   * Extensions
+					   */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
 
 /* Should work well enough on modern CPUs for testing */

From 13c520b2993c9faae6770264d33ff1e1ea4c2ceb Mon Sep 17 00:00:00 2001
From: Gayatri Kammela <gayatri.kammela@intel.com>
Date: Fri, 12 Aug 2016 18:03:20 -0700
Subject: [PATCH 11/20] lib/raid6: Add AVX512 optimized recovery functions

Optimize RAID6 recovery functions to take advantage of
the 512-bit ZMM integer instructions introduced in AVX512.

AVX512 optimized recovery functions, which is simply based
on recov_avx2.c written by Jim Kukunas

This patch was tested and benchmarked before submission on
a hardware that has AVX512 flags to support such instructions

Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 include/linux/raid/pq.h  |   1 +
 lib/raid6/Makefile       |   2 +-
 lib/raid6/algos.c        |   3 +
 lib/raid6/recov_avx512.c | 388 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 393 insertions(+), 1 deletion(-)
 create mode 100644 lib/raid6/recov_avx512.c

diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 0c529a55b52e..1abd89584568 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -118,6 +118,7 @@ struct raid6_recov_calls {
 extern const struct raid6_recov_calls raid6_recov_intx1;
 extern const struct raid6_recov_calls raid6_recov_ssse3;
 extern const struct raid6_recov_calls raid6_recov_avx2;
+extern const struct raid6_recov_calls raid6_recov_avx512;
 
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 8948268d47b4..cd05ee1fb809 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o
 
-raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o
+raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index f5f090c52dd9..149d947a4fec 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -98,6 +98,9 @@ void (*raid6_datap_recov)(int, size_t, int, void **);
 EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
 const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#ifdef CONFIG_AS_AVX512
+	&raid6_recov_avx512,
+#endif
 #ifdef CONFIG_AS_AVX2
 	&raid6_recov_avx2,
 #endif
diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c
new file mode 100644
index 000000000000..625aafa33b61
--- /dev/null
+++ b/lib/raid6/recov_avx512.c
@@ -0,0 +1,388 @@
+/*
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_avx512(void)
+{
+	return boot_cpu_has(X86_FEATURE_AVX2) &&
+		boot_cpu_has(X86_FEATURE_AVX) &&
+		boot_cpu_has(X86_FEATURE_AVX512F) &&
+		boot_cpu_has(X86_FEATURE_AVX512BW) &&
+		boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
+				     int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	const u8 x0f = 0x0f;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/*
+	 * Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dp;
+	ptrs[failb]   = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+		raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	/* zmm0 = x0f[16] */
+	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+			     "vmovdqa64 %1, %%zmm9\n\t"
+			     "vmovdqa64 %2, %%zmm0\n\t"
+			     "vmovdqa64 %3, %%zmm8\n\t"
+			     "vpxorq %4, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %5, %%zmm9, %%zmm9\n\t"
+			     "vpxorq %6, %%zmm0, %%zmm0\n\t"
+			     "vpxorq %7, %%zmm8, %%zmm8"
+			     :
+			     : "m" (q[0]), "m" (q[64]), "m" (p[0]),
+			       "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
+			       "m" (dp[0]), "m" (dp[64]));
+
+		/*
+		 * 1 = dq[0]  ^ q[0]
+		 * 9 = dq[64] ^ q[64]
+		 * 0 = dp[0]  ^ p[0]
+		 * 8 = dp[64] ^ p[64]
+		 */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm5"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+			     "vpsraw $4, %%zmm9, %%zmm12\n\t"
+			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+			     "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+			     "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t"
+			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
+			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+			     "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t"
+			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
+			     :
+			     : );
+
+		/*
+		 * 5 = qx[0]
+		 * 15 = qx[64]
+		 */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1\n\t"
+			     "vpsraw $4, %%zmm0, %%zmm2\n\t"
+			     "vpsraw $4, %%zmm8, %%zmm6\n\t"
+			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
+			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+			     "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
+			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
+			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm12, %%zmm13, %%zmm13"
+			     :
+			     : "m" (pbmul[0]), "m" (pbmul[16]));
+
+		/*
+		 * 1  = pbmul[px[0]]
+		 * 13 = pbmul[px[64]]
+		 */
+		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm15, %%zmm13, %%zmm13"
+			     :
+			     : );
+
+		/*
+		 * 1 = db = DQ
+		 * 13 = db[64] = DQ[64]
+		 */
+		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+			     "vmovdqa64 %%zmm13,%1\n\t"
+			     "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+			     "vpxorq %%zmm13, %%zmm8, %%zmm8"
+			     :
+			     : "m" (dq[0]), "m" (dq[64]));
+
+		asm volatile("vmovdqa64 %%zmm0, %0\n\t"
+			     "vmovdqa64 %%zmm8, %1"
+			     :
+			     : "m" (dp[0]), "m" (dp[64]));
+
+		bytes -= 128;
+		p += 128;
+		q += 128;
+		dp += 128;
+		dq += 128;
+#else
+		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+			     "vmovdqa64 %1, %%zmm0\n\t"
+			     "vpxorq %2, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %3, %%zmm0, %%zmm0"
+			     :
+			     : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp));
+
+		/* 1 = dq ^ q;  0 = dp ^ p */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm5"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		/*
+		 * 1 = dq ^ q
+		 * 3 = dq ^ p >> 4
+		 */
+		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
+			     :
+			     : );
+
+		/* 5 = qx */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1"
+			     :
+			     : "m" (pbmul[0]), "m" (pbmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
+			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm4, %%zmm1, %%zmm1"
+			     :
+			     : );
+
+		/* 1 = pbmul[px] */
+		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+			     /* 1 = db = DQ */
+			     "vmovdqa64 %%zmm1, %0\n\t"
+			     :
+			     : "m" (dq[0]));
+
+		asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+			     "vmovdqa64 %%zmm0, %0"
+			     :
+			     : "m" (dp[0]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dp += 64;
+		dq += 64;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
+				     void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	const u8 x0f = 0x0f;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/*
+	 * Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+			     "vmovdqa64 %1, %%zmm8\n\t"
+			     "vpxorq %2, %%zmm3, %%zmm3\n\t"
+			     "vpxorq %3, %%zmm8, %%zmm8"
+			     :
+			     : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
+			       "m" (q[64]));
+
+		/*
+		 * 3 = q[0] ^ dq[0]
+		 * 8 = q[64] ^ dq[64]
+		 */
+		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+			     "vmovapd %%zmm0, %%zmm13\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1\n\t"
+			     "vmovapd %%zmm1, %%zmm14"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+			     "vpsraw $4, %%zmm8, %%zmm12\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
+			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+			     "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
+			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+			     "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
+			     "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm13, %%zmm14, %%zmm14"
+			     :
+			     : );
+
+		/*
+		 * 1  = qmul[q[0]  ^ dq[0]]
+		 * 14 = qmul[q[64] ^ dq[64]]
+		 */
+		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+			     "vmovdqa64 %1, %%zmm12\n\t"
+			     "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
+			     "vpxorq %%zmm14, %%zmm12, %%zmm12"
+			     :
+			     : "m" (p[0]), "m" (p[64]));
+
+		/*
+		 * 2  = p[0]  ^ qmul[q[0]  ^ dq[0]]
+		 * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
+		 */
+
+		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+			     "vmovdqa64 %%zmm14, %1\n\t"
+			     "vmovdqa64 %%zmm2, %2\n\t"
+			     "vmovdqa64 %%zmm12,%3"
+			     :
+			     : "m" (dq[0]), "m" (dq[64]), "m" (p[0]),
+			       "m" (p[64]));
+
+		bytes -= 128;
+		p += 128;
+		q += 128;
+		dq += 128;
+#else
+		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+			     "vpxorq %1, %%zmm3, %%zmm3"
+			     :
+			     : "m" (dq[0]), "m" (q[0]));
+
+		/* 3 = q ^ dq */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm0, %%zmm1, %%zmm1"
+			     :
+			     : );
+
+		/* 1 = qmul[q ^ dq] */
+
+		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+			     "vpxorq %%zmm1, %%zmm2, %%zmm2"
+			     :
+			     : "m" (p[0]));
+
+		/* 2 = p ^ qmul[q ^ dq] */
+
+		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+			     "vmovdqa64 %%zmm2, %1"
+			     :
+			     : "m" (dq[0]), "m" (p[0]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dq += 64;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx512 = {
+	.data2 = raid6_2data_recov_avx512,
+	.datap = raid6_datap_recov_avx512,
+	.valid = raid6_has_avx512,
+#ifdef CONFIG_X86_64
+	.name = "avx512x2",
+#else
+	.name = "avx512x1",
+#endif
+	.priority = 3,
+};
+
+#else
+#warning "your version of binutils lacks AVX512 support"
+#endif

From 161db5d165123a72792e2687ecfd8de146dbae1a Mon Sep 17 00:00:00 2001
From: Gayatri Kammela <gayatri.kammela@intel.com>
Date: Fri, 12 Aug 2016 18:03:21 -0700
Subject: [PATCH 12/20] lib/raid6/test/Makefile: Add avx512 gen_syndrome and
 recovery functions

Adding avx512 gen_syndrome and recovery functions so as to allow code to
be compiled and tested successfully in userspace.

This patch is tested in userspace and improvement in performace is
observed.

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 lib/raid6/test/Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 29090f3db677..2c7b60edea04 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -32,10 +32,13 @@ ifeq ($(ARCH),arm64)
 endif
 
 ifeq ($(IS_X86),yes)
-        OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o
+        OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
         CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" |	\
                     gcc -c -x assembler - >&/dev/null &&	\
                     rm ./-.o && echo -DCONFIG_AS_AVX2=1)
+	CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" |          \
+		    gcc -c -x assembler - >&/dev/null &&        \
+		    rm ./-.o && echo -DCONFIG_AS_AVX512=1)
 else ifeq ($(HAS_NEON),yes)
         OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o
         CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1

From 694dda62d28674c21c8186d43b8ca0139bf01585 Mon Sep 17 00:00:00 2001
From: Gayatri Kammela <gayatri.kammela@intel.com>
Date: Fri, 12 Aug 2016 18:03:22 -0700
Subject: [PATCH 13/20] lib/raid6: Add AVX512 optimized xor_syndrome functions

Optimize RAID6 xor_syndrome functions to take advantage of the 512-bit
ZMM integer instructions introduced in AVX512.

AVX512 optimized xor_syndrome functions, which is simply based on sse2.c
written by hpa.

The patch was tested and benchmarked before submission on
a hardware that has AVX512 flags to support such instructions

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jim Kukunas <james.t.kukunas@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Megha Dey <megha.dey@linux.intel.com>
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Reviewed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 lib/raid6/avx512.c | 281 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 278 insertions(+), 3 deletions(-)

diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c
index b1188a6e51a6..f524a7972006 100644
--- a/lib/raid6/avx512.c
+++ b/lib/raid6/avx512.c
@@ -103,9 +103,68 @@ static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	kernel_fpu_end();
 }
 
+static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
+				       size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0"
+		     : : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 64) {
+		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+			     "vmovdqa64 %1,%%zmm2\n\t"
+			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
+			     :
+			     : "m" (dptr[z0][d]),  "m" (p[d]));
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
+				     :
+				     : "m" (dptr[z][d]));
+		}
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
+				     :
+				     : );
+		}
+		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+		/* Don't use movntdq for r/w memory area < cache line */
+			     "vmovdqa64 %%zmm4,%0\n\t"
+			     "vmovdqa64 %%zmm2,%1"
+			     :
+			     : "m" (q[d]), "m" (p[d]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx512x1 = {
 	raid6_avx5121_gen_syndrome,
-	NULL,                   /* XOR not yet implemented */
+	raid6_avx5121_xor_syndrome,
 	raid6_have_avx512,
 	"avx512x1",
 	1                       /* Has cache hints */
@@ -176,9 +235,93 @@ static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	kernel_fpu_end();
 }
 
+static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
+				       size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0"
+		     : : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 128) {
+		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+			     "vmovdqa64 %1,%%zmm6\n\t"
+			     "vmovdqa64 %2,%%zmm2\n\t"
+			     "vmovdqa64 %3,%%zmm3\n\t"
+			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+			       "m" (p[d]), "m" (p[d+64]));
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vmovdqa64 %1,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+				     :
+				     : "m" (dptr[z][d]),  "m" (dptr[z][d+64]));
+		}
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+				     :
+				     : );
+		}
+		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
+			     /* Don't use movntdq for r/w
+			      * memory area < cache line
+			      */
+			     "vmovdqa64 %%zmm4,%0\n\t"
+			     "vmovdqa64 %%zmm6,%1\n\t"
+			     "vmovdqa64 %%zmm2,%2\n\t"
+			     "vmovdqa64 %%zmm3,%3"
+			     :
+			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
+			       "m" (p[d+64]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx512x2 = {
 	raid6_avx5122_gen_syndrome,
-	NULL,                   /* XOR not yet implemented */
+	raid6_avx5122_xor_syndrome,
 	raid6_have_avx512,
 	"avx512x2",
 	1                       /* Has cache hints */
@@ -282,9 +425,141 @@ static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	kernel_fpu_end();
 }
 
+static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
+				       size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0"
+		     :: "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 256) {
+		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+			     "vmovdqa64 %1,%%zmm6\n\t"
+			     "vmovdqa64 %2,%%zmm12\n\t"
+			     "vmovdqa64 %3,%%zmm14\n\t"
+			     "vmovdqa64 %4,%%zmm2\n\t"
+			     "vmovdqa64 %5,%%zmm3\n\t"
+			     "vmovdqa64 %6,%%zmm10\n\t"
+			     "vmovdqa64 %7,%%zmm11\n\t"
+			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
+			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
+			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
+			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+			       "m" (p[d+192]));
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+				     "prefetchnta %0\n\t"
+				     "prefetchnta %2\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpmovm2b %%k3,%%zmm13\n\t"
+				     "vpmovm2b %%k4,%%zmm15\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+				     "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vmovdqa64 %1,%%zmm7\n\t"
+				     "vmovdqa64 %2,%%zmm13\n\t"
+				     "vmovdqa64 %3,%%zmm15\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+				     :
+				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+				       "m" (dptr[z][d+128]),
+				       "m" (dptr[z][d+192]));
+		}
+		asm volatile("prefetchnta %0\n\t"
+			     "prefetchnta %1\n\t"
+			     :
+			     : "m" (q[d]), "m" (q[d+128]));
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpmovm2b %%k3,%%zmm13\n\t"
+				     "vpmovm2b %%k4,%%zmm15\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+				     :
+				     : );
+		}
+		asm volatile("vmovntdq %%zmm2,%0\n\t"
+			     "vmovntdq %%zmm3,%1\n\t"
+			     "vmovntdq %%zmm10,%2\n\t"
+			     "vmovntdq %%zmm11,%3\n\t"
+			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
+			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
+			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
+			     "vmovntdq %%zmm4,%4\n\t"
+			     "vmovntdq %%zmm6,%5\n\t"
+			     "vmovntdq %%zmm12,%6\n\t"
+			     "vmovntdq %%zmm14,%7"
+			     :
+			     : "m" (p[d]),  "m" (p[d+64]), "m" (p[d+128]),
+			       "m" (p[d+192]), "m" (q[d]),  "m" (q[d+64]),
+			       "m" (q[d+128]), "m" (q[d+192]));
+	}
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
 const struct raid6_calls raid6_avx512x4 = {
 	raid6_avx5124_gen_syndrome,
-	NULL,                   /* XOR not yet implemented */
+	raid6_avx5124_xor_syndrome,
 	raid6_have_avx512,
 	"avx512x4",
 	1                       /* Has cache hints */

From 1dffddddd8315863f1a6d79c512b737864ef6a1a Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 8 Sep 2016 10:49:06 -0700
Subject: [PATCH 14/20] raid5: allow arbitrary max_hw_sectors

raid5 will split bio to proper size internally, there is no point to use
underlayer disk's max_hw_sectors. In my qemu system, without the change,
the raid5 only receives 128k size bio, which reduces the chance of bio
merge sending to underlayer disks.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ee7fc3701700..5883ef0d95bf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7066,6 +7066,8 @@ static int raid5_run(struct mddev *mddev)
 		else
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
 						mddev->queue);
+
+		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
 	}
 
 	if (journal_dev) {

From f71f1cf97c781db1be8ae0190e0983e1fceac14a Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 13 Sep 2016 10:28:00 -0700
Subject: [PATCH 15/20] md/bitmap: fix wrong cleanup

if bitmap_create fails, the bitmap is already cleaned up and the returned value
is an error number. We can't do the cleanup again.

Reported-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/bitmap.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 13041ee37ad6..2d826927a3bf 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1903,10 +1903,8 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 	struct bitmap_counts *counts;
 	struct bitmap *bitmap = bitmap_create(mddev, slot);
 
-	if (IS_ERR(bitmap)) {
-		bitmap_free(bitmap);
+	if (IS_ERR(bitmap))
 		return PTR_ERR(bitmap);
-	}
 
 	rv = bitmap_init_from_disk(bitmap, 0);
 	if (rv)

From 90bcf1338193da4c87fb7492c716f225b907acf4 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 14 Sep 2016 14:26:54 -0700
Subject: [PATCH 16/20] md: fix a potential deadlock

lockdep reports a potential deadlock. Fix this by droping the mutex
before md_import_device

[ 1137.126601] ======================================================
[ 1137.127013] [ INFO: possible circular locking dependency detected ]
[ 1137.127013] 4.8.0-rc4+ #538 Not tainted
[ 1137.127013] -------------------------------------------------------
[ 1137.127013] mdadm/16675 is trying to acquire lock:
[ 1137.127013]  (&bdev->bd_mutex){+.+.+.}, at: [<ffffffff81243cf3>] __blkdev_get+0x63/0x450
[ 1137.127013]
but task is already holding lock:
[ 1137.127013]  (detected_devices_mutex){+.+.+.}, at: [<ffffffff81a5138c>] md_ioctl+0x2ac/0x1f50
[ 1137.127013]
which lock already depends on the new lock.

[ 1137.127013]
the existing dependency chain (in reverse order) is:
[ 1137.127013]
-> #1 (detected_devices_mutex){+.+.+.}:
[ 1137.127013]        [<ffffffff810b6f19>] lock_acquire+0xb9/0x220
[ 1137.127013]        [<ffffffff81c51647>] mutex_lock_nested+0x67/0x3d0
[ 1137.127013]        [<ffffffff81a4eeaf>] md_autodetect_dev+0x3f/0x90
[ 1137.127013]        [<ffffffff81595be8>] rescan_partitions+0x1a8/0x2c0
[ 1137.127013]        [<ffffffff81590081>] __blkdev_reread_part+0x71/0xb0
[ 1137.127013]        [<ffffffff815900e5>] blkdev_reread_part+0x25/0x40
[ 1137.127013]        [<ffffffff81590c4b>] blkdev_ioctl+0x51b/0xa30
[ 1137.127013]        [<ffffffff81242bf1>] block_ioctl+0x41/0x50
[ 1137.127013]        [<ffffffff81214c96>] do_vfs_ioctl+0x96/0x6e0
[ 1137.127013]        [<ffffffff81215321>] SyS_ioctl+0x41/0x70
[ 1137.127013]        [<ffffffff81c56825>] entry_SYSCALL_64_fastpath+0x18/0xa8
[ 1137.127013]
-> #0 (&bdev->bd_mutex){+.+.+.}:
[ 1137.127013]        [<ffffffff810b6af2>] __lock_acquire+0x1662/0x1690
[ 1137.127013]        [<ffffffff810b6f19>] lock_acquire+0xb9/0x220
[ 1137.127013]        [<ffffffff81c51647>] mutex_lock_nested+0x67/0x3d0
[ 1137.127013]        [<ffffffff81243cf3>] __blkdev_get+0x63/0x450
[ 1137.127013]        [<ffffffff81244307>] blkdev_get+0x227/0x350
[ 1137.127013]        [<ffffffff812444f6>] blkdev_get_by_dev+0x36/0x50
[ 1137.127013]        [<ffffffff81a46d65>] lock_rdev+0x35/0x80
[ 1137.127013]        [<ffffffff81a49bb4>] md_import_device+0xb4/0x1b0
[ 1137.127013]        [<ffffffff81a513d6>] md_ioctl+0x2f6/0x1f50
[ 1137.127013]        [<ffffffff815909b3>] blkdev_ioctl+0x283/0xa30
[ 1137.127013]        [<ffffffff81242bf1>] block_ioctl+0x41/0x50
[ 1137.127013]        [<ffffffff81214c96>] do_vfs_ioctl+0x96/0x6e0
[ 1137.127013]        [<ffffffff81215321>] SyS_ioctl+0x41/0x70
[ 1137.127013]        [<ffffffff81c56825>] entry_SYSCALL_64_fastpath+0x18/0xa8
[ 1137.127013]
other info that might help us debug this:

[ 1137.127013]  Possible unsafe locking scenario:

[ 1137.127013]        CPU0                    CPU1
[ 1137.127013]        ----                    ----
[ 1137.127013]   lock(detected_devices_mutex);
[ 1137.127013]                                lock(&bdev->bd_mutex);
[ 1137.127013]                                lock(detected_devices_mutex);
[ 1137.127013]   lock(&bdev->bd_mutex);
[ 1137.127013]
 *** DEADLOCK ***

Cc: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index cd6797b3cdf7..457b53863117 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8882,7 +8882,9 @@ static void autostart_arrays(int part)
 		list_del(&node_detected_dev->list);
 		dev = node_detected_dev->dev;
 		kfree(node_detected_dev);
+		mutex_unlock(&detected_devices_mutex);
 		rdev = md_import_device(dev,0, 90);
+		mutex_lock(&detected_devices_mutex);
 		if (IS_ERR(rdev))
 			continue;
 

From 6a0f53ff351dfd10e74752e57b9c27d3397a3c4d Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Tue, 20 Sep 2016 10:33:57 +0800
Subject: [PATCH 17/20] raid5: fix to detect failure of register_shrinker

register_shrinker can fail after commit 1d3d4437eae1 ("vmscan: per-node
deferred work"), we should detect the failure of it, otherwise we may
fail to register shrinker after raid5 configuration was setup successfully.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5883ef0d95bf..08274b4b4009 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6670,7 +6670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf->shrinker.count_objects = raid5_cache_count;
 	conf->shrinker.batch = 128;
 	conf->shrinker.flags = 0;
-	register_shrinker(&conf->shrinker);
+	if (register_shrinker(&conf->shrinker)) {
+		printk(KERN_ERR
+		       "md/raid:%s: couldn't register shrinker.\n",
+		       mdname(mddev));
+		goto abort;
+	}
 
 	sprintf(pers_name, "raid%d", mddev->new_level);
 	conf->thread = md_register_thread(raid5d, mddev, pers_name);

From 30c8946566f32493f7f1143437319c42c8a542e9 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 21 Sep 2016 09:07:13 -0700
Subject: [PATCH 18/20] raid5: handle register_shrinker failure

register_shrinker() now can fail. When it happens, shrinker.nr_deferred is
null. We use it to determine if unregister_shrinker is required.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 08274b4b4009..f94472dd0323 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6372,7 +6372,7 @@ static void free_conf(struct r5conf *conf)
 {
 	if (conf->log)
 		r5l_exit_log(conf->log);
-	if (conf->shrinker.seeks)
+	if (conf->shrinker.nr_deferred)
 		unregister_shrinker(&conf->shrinker);
 
 	free_thread_groups(conf);

From 099b548c429217a8306adbd1552d326615c9b903 Mon Sep 17 00:00:00 2001
From: Gayatri Kammela <gayatri.kammela@intel.com>
Date: Mon, 26 Sep 2016 14:37:38 -0700
Subject: [PATCH 19/20] raid6/test/test.c: bug fix: Specify aligned(alignment)
 attributes to the char arrays

Specifying the aligned attributes to the char data[NDISKS][PAGE_SIZE],
char recovi[PAGE_SIZE] and char recovi[PAGE_SIZE] arrays, so that all
malloc memory is page boundary aligned.

Without these alignment attributes, the test causes a segfault in
userspace when the NDISKS are changed to 4 from 16.

The RAID stripes will be page aligned anyway, so we want to test what
the kernel actually will execute.

Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Signed-off-by: Gayatri Kammela <gayatri.kammela@intel.com>
Reviewed-by: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 lib/raid6/test/test.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c
index 3bebbabdb510..b07f4d8e6b03 100644
--- a/lib/raid6/test/test.c
+++ b/lib/raid6/test/test.c
@@ -21,12 +21,13 @@
 
 #define NDISKS		16	/* Including P and Q */
 
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
 struct raid6_calls raid6_call;
 
 char *dataptrs[NDISKS];
-char data[NDISKS][PAGE_SIZE];
-char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
+char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
 
 static void makedata(int start, int stop)
 {

From bb086a89a406b5d877ee616f1490fcc81f8e1b2b Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 30 Sep 2016 09:45:40 -0700
Subject: [PATCH 20/20] md: set rotational bit

if all disks in an array are non-rotational, set the array
non-rotational.

This only works for array with all disks populated at startup. Support
for disk hotadd/hotremove could be added later if necessary.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 457b53863117..eac84d8ff724 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5297,6 +5297,21 @@ int md_run(struct mddev *mddev)
 		return err;
 	}
 	if (mddev->queue) {
+		bool nonrot = true;
+
+		rdev_for_each(rdev, mddev) {
+			if (rdev->raid_disk >= 0 &&
+			    !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
+				nonrot = false;
+				break;
+			}
+		}
+		if (mddev->degraded)
+			nonrot = false;
+		if (nonrot)
+			queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
 		mddev->queue->backing_dev_info.congested_data = mddev;
 		mddev->queue->backing_dev_info.congested_fn = md_congested;
 	}