From 61e4947c99c4494336254ec540c50186d186150b Mon Sep 17 00:00:00 2001
From: Lukasz Dorau
Date: Thu, 24 Oct 2013 12:55:17 +1100
Subject: [PATCH 1/4] md: Fix skipping recovery for read-only arrays.

Since:
    commit 7ceb17e87bde79d285a8b988cfed9eaeebe60b86
    md: Allow devices to be re-added to a read-only array.

spares are activated on a read-only array. For the raid1 and raid10
personalities this causes not-in-sync devices to be marked in-sync
without checking whether recovery has finished.

If a read-only array is degraded and one of its devices is not in-sync
(because the array has only been partially recovered), recovery will be
skipped.

This patch adds a check that recovery has finished before marking a
device in-sync, for the raid1 and raid10 personalities. The raid5
personality already has such a condition (at raid5.c:6029).

The bug was introduced in 3.10 and causes data corruption.

Cc: stable@vger.kernel.org
Signed-off-by: Pawel Baldysiak
Signed-off-by: Lukasz Dorau
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c  | 1 +
 drivers/md/raid10.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d60412c7f995..aacf6bf352d8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1479,6 +1479,7 @@ static int raid1_spare_active(struct mddev *mddev)
 			}
 		}
 		if (rdev
+		    && rdev->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &rdev->flags)
 		    && !test_and_set_bit(In_sync, &rdev->flags)) {
 			count++;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index df7b0a06b0ea..73dc8a377522 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1782,6 +1782,7 @@ static int raid10_spare_active(struct mddev *mddev)
 			}
 			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
 		} else if (tmp->rdev
+			   && tmp->rdev->recovery_offset == MaxSector
 			   && !test_bit(Faulty, &tmp->rdev->flags)
 			   && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
 			count++;
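
Note on the check added above: md tracks recovery progress in
rdev->recovery_offset and sets it to MaxSector once recovery of that device
has completed, so only such a device may be flipped to In_sync by the
*_spare_active() paths. A hypothetical helper restating that condition
(illustrative only, not part of the patch; it assumes the struct md_rdev and
MaxSector definitions from drivers/md/md.h) would read:

    /*
     * Illustrative only: a device is fully recovered when md has pushed
     * recovery_offset all the way to MaxSector.  Only such a device may
     * safely be marked In_sync by raid1_spare_active()/raid10_spare_active().
     */
    static inline bool rdev_recovery_finished(struct md_rdev *rdev)
    {
            return rdev->recovery_offset == MaxSector;
    }
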
From 905b0297a9533d7a6ee00a01a990456636877dd6 Mon Sep 17 00:00:00 2001
From: Bian Yu
Date: Sat, 12 Oct 2013 01:10:03 -0400
Subject: [PATCH 2/4] md: avoid deadlock when md_set_badblocks.

When operating on a hard disk and hitting errors, md_set_badblocks is
called after scsi_restart_operations, which has already disabled
interrupts. md_set_badblocks, however, calls write_sequnlock_irq and
re-enables interrupts, so a softirq can preempt the current thread and
that may cause a deadlock. This situation should use
write_seqlock_irqsave/write_sequnlock_irqrestore instead.

I hit this situation; the call trace is below:
[  638.919974] BUG: spinlock recursion on CPU#0, scsi_eh_13/1010
[  638.921923]  lock: 0xffff8800d4d51fc8, .magic: dead4ead, .owner: scsi_eh_13/1010, .owner_cpu: 0
[  638.923890] CPU: 0 PID: 1010 Comm: scsi_eh_13 Not tainted 3.12.0-rc5+ #37
[  638.925844] Hardware name: To be filled by O.E.M. To be filled by O.E.M./MAHOBAY, BIOS 4.6.5 03/05/2013
[  638.927816]  ffff880037ad4640 ffff880118c03d50 ffffffff8172ff85 0000000000000007
[  638.929829]  ffff8800d4d51fc8 ffff880118c03d70 ffffffff81730030 ffff8800d4d51fc8
[  638.931848]  ffffffff81a72eb0 ffff880118c03d90 ffffffff81730056 ffff8800d4d51fc8
[  638.933884] Call Trace:
[  638.935867]  [] dump_stack+0x55/0x76
[  638.937878]  [] spin_dump+0x8a/0x8f
[  638.939861]  [] spin_bug+0x21/0x26
[  638.941836]  [] do_raw_spin_lock+0xa4/0xc0
[  638.943801]  [] _raw_spin_lock+0x66/0x80
[  638.945747]  [] ? scsi_device_unbusy+0x9d/0xd0
[  638.947672]  [] ? _raw_spin_unlock+0x2b/0x50
[  638.949595]  [] scsi_device_unbusy+0x9d/0xd0
[  638.951504]  [] scsi_finish_command+0x37/0xe0
[  638.953388]  [] scsi_softirq_done+0xa8/0x140
[  638.955248]  [] blk_done_softirq+0x7b/0x90
[  638.957116]  [] __do_softirq+0xfd/0x330
[  638.958987]  [] ? __lock_release+0x6f/0x100
[  638.960861]  [] call_softirq+0x1c/0x30
[  638.962724]  [] do_softirq+0x8d/0xc0
[  638.964565]  [] irq_exit+0x10e/0x150
[  638.966390]  [] smp_apic_timer_interrupt+0x4a/0x60
[  638.968223]  [] apic_timer_interrupt+0x6f/0x80
[  638.970079]  [] ? __lock_release+0x6f/0x100
[  638.971899]  [] ? _raw_spin_unlock_irq+0x3a/0x50
[  638.973691]  [] ? _raw_spin_unlock_irq+0x30/0x50
[  638.975475]  [] md_set_badblocks+0x1f3/0x4a0
[  638.977243]  [] rdev_set_badblocks+0x27/0x80
[  638.978988]  [] raid5_end_read_request+0x36b/0x4e0 [raid456]
[  638.980723]  [] bio_endio+0x1d/0x40
[  638.982463]  [] req_bio_endio.isra.65+0x83/0xa0
[  638.984214]  [] blk_update_request+0x7f/0x350
[  638.985967]  [] blk_update_bidi_request+0x31/0x90
[  638.987710]  [] __blk_end_bidi_request+0x20/0x50
[  638.989439]  [] __blk_end_request_all+0x1f/0x30
[  638.991149]  [] blk_peek_request+0x106/0x250
[  638.992861]  [] ? scsi_kill_request.isra.32+0xe9/0x130
[  638.994561]  [] scsi_request_fn+0x4a/0x3d0
[  638.996251]  [] __blk_run_queue+0x37/0x50
[  638.997900]  [] blk_run_queue+0x2f/0x50
[  638.999553]  [] scsi_run_queue+0xe0/0x1c0
[  639.001185]  [] scsi_run_host_queues+0x21/0x40
[  639.002798]  [] scsi_restart_operations+0x177/0x200
[  639.004391]  [] scsi_error_handler+0xc9/0xe0
[  639.005996]  [] ? scsi_unjam_host+0xd0/0xd0
[  639.007600]  [] kthread+0xdb/0xe0
[  639.009205]  [] ? flush_kthread_worker+0x170/0x170
[  639.010821]  [] ret_from_fork+0x7c/0xb0
[  639.012437]  [] ? flush_kthread_worker+0x170/0x170

This bug was introduced in commit 2e8ac30312973dd20e68073653 (the first
time rdev_set_badblocks was called from interrupt context), so this patch
is appropriate for 3.5 and subsequent kernels.

Cc: (3.5+)
Signed-off-by: Bian Yu
Reviewed-by: Jianpeng Ma
Signed-off-by: NeilBrown
---
 drivers/md/md.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index adf4d7e1d5e1..561a65f82e26 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8111,6 +8111,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 	u64 *p;
 	int lo, hi;
 	int rv = 1;
+	unsigned long flags;
 
 	if (bb->shift < 0)
 		/* badblocks are disabled */
@@ -8125,7 +8126,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 		sectors = next - s;
 	}
 
-	write_seqlock_irq(&bb->lock);
+	write_seqlock_irqsave(&bb->lock, flags);
 
 	p = bb->page;
 	lo = 0;
@@ -8241,7 +8242,7 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
 	bb->changed = 1;
 	if (!acknowledged)
 		bb->unacked_exist = 1;
-	write_sequnlock_irq(&bb->lock);
+	write_sequnlock_irqrestore(&bb->lock, flags);
 
 	return rv;
 }
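
Note on the locking change above: write_seqlock_irqsave()/
write_sequnlock_irqrestore() preserve whatever interrupt state the caller
had, while the _irq variants unconditionally re-enable interrupts on unlock,
which is what allowed the block softirq to run inside the SCSI error-handler
path shown in the trace. A minimal sketch of the safe pattern (illustrative
only; the lock and function below are made up, the primitives come from
<linux/seqlock.h>):

    #include <linux/seqlock.h>

    /* Hypothetical lock and update path, for illustration only. */
    static DEFINE_SEQLOCK(example_lock);

    static void example_update(void)
    {
            unsigned long flags;

            /* Save the caller's interrupt state rather than assuming it. */
            write_seqlock_irqsave(&example_lock, flags);
            /* ... modify the data protected by example_lock ... */
            write_sequnlock_irqrestore(&example_lock, flags);
            /* Interrupts are now exactly as the caller left them. */
    }
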
From 37c61ff31e9b5e3fcf3cc6579f5c68f6ad40c4b1 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Sat, 19 Oct 2013 14:50:28 +0800
Subject: [PATCH 3/4] raid5: set bio bi_vcnt 0 for discard request

The SCSI layer will add a new payload for a discard request. If two bios
are merged into one, the second bio still has bi_vcnt 1, which was set in
raid5. This will confuse SCSI and cause an oops.

Suitable for backport to 3.7+.

Cc: stable@vger.kernel.org (v3.7+)
Reported-by: Jes Sorensen
Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
Acked-by: Martin K. Petersen
---
 drivers/md/raid5.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7ff4f252ca1a..302d7cd2076c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -778,6 +778,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			bi->bi_io_vec[0].bv_offset = 0;
 			bi->bi_size = STRIPE_SIZE;
+			/*
+			 * If this is discard request, set bi_vcnt 0. We don't
+			 * want to confuse SCSI because SCSI will replace payload
+			 */
+			if (rw & REQ_DISCARD)
+				bi->bi_vcnt = 0;
 			if (rrdev)
 				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
@@ -816,6 +822,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			rbi->bi_io_vec[0].bv_offset = 0;
 			rbi->bi_size = STRIPE_SIZE;
+			/*
+			 * If this is discard request, set bi_vcnt 0. We don't
+			 * want to confuse SCSI because SCSI will replace payload
+			 */
+			if (rw & REQ_DISCARD)
+				rbi->bi_vcnt = 0;
 			if (conf->mddev->gendisk)
 				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
 						      rbi, disk_devt(conf->mddev->gendisk),

From d47648fcf0611812286f68131b40251c6fa54f5e Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Sat, 19 Oct 2013 14:51:42 +0800
Subject: [PATCH 4/4] raid5: avoid finding "discard" stripe

SCSI discard will damage the discard stripe's bio settings, e.g. some
fields are changed. If the stripe is reused very soon, we end up with
wrong bio settings. We remove the discard stripe from the hash list, so
that next time the stripe will be fully initialized.

Suitable for backport to 3.7+.

Cc: (3.7+)
Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/raid5.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 302d7cd2076c..f8b906843926 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2922,6 +2922,14 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 			}
 		/* now that discard is done we can proceed with any sync */
 		clear_bit(STRIPE_DISCARD, &sh->state);
+		/*
+		 * SCSI discard will change some bio fields and the stripe has
+		 * no updated data, so remove it from hash list and the stripe
+		 * will be reinitialized
+		 */
+		spin_lock_irq(&conf->device_lock);
+		remove_hash(sh);
+		spin_unlock_irq(&conf->device_lock);
 		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
 			set_bit(STRIPE_HANDLE, &sh->state);
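
Note on the two raid5 patches: before a discard bio is handed down, its
bi_vcnt is cleared so the SCSI layer can attach its own payload (patch 3),
and once the discard completes the stripe must not be found and reused from
the hash table, because its bios now carry SCSI-modified fields (patch 4).
A hypothetical summary helper, illustrative only and not present in raid5.c
(it assumes the r5conf/stripe_head types and the static remove_hash() from
drivers/md/raid5.c):

    /*
     * Illustrative only: retire a stripe whose discard has completed.
     * Unhashing it under device_lock forces the next user to take a fully
     * reinitialized stripe_head instead of one whose bios were rewritten
     * by the SCSI discard path.
     */
    static void retire_discard_stripe(struct r5conf *conf, struct stripe_head *sh)
    {
            clear_bit(STRIPE_DISCARD, &sh->state);

            spin_lock_irq(&conf->device_lock);
            remove_hash(sh);
            spin_unlock_irq(&conf->device_lock);
    }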