From b76b4715eba0d0ed574f58918b29c1b2f0fa37a8 Mon Sep 17 00:00:00 2001 From: Nigel Croxon Date: Fri, 6 Sep 2019 09:21:33 -0400 Subject: [PATCH 1/6] raid5: don't increment read_errors on EILSEQ return While MD continues to count read errors returned by the lower layer. If those errors are -EILSEQ, instead of -EIO, it should NOT increase the read_errors count. When RAID6 is set up on dm-integrity target that detects massive corruption, the leg will be ejected from the array. Even if the issue is correctable with a sector re-write and the array has necessary redundancy to correct it. The leg is ejected because it runs up the rdev->read_errors beyond conf->max_nr_stripes. The return status in dm-drypt when there is a data integrity error is -EILSEQ (BLK_STS_PROTECTION). Signed-off-by: Nigel Croxon Signed-off-by: Song Liu --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index da6a86e28318..8ea8443e09d5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2526,7 +2526,8 @@ static void raid5_end_read_request(struct bio * bi) int set_bad = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); - atomic_inc(&rdev->read_errors); + if (!(bi->bi_status == BLK_STS_PROTECTION)) + atomic_inc(&rdev->read_errors); if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) pr_warn_ratelimited( "md/raid:%s: read error on replacement device (sector %llu on %s).\n", From 6ce220dd2f8ea71d6afc29b9a7524c12e39f374a Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 11 Sep 2019 10:06:29 +0200 Subject: [PATCH 2/6] raid5: don't set STRIPE_HANDLE to stripe which is in batch list If stripe in batch list is set with STRIPE_HANDLE flag, then the stripe could be set with STRIPE_ACTIVE by the handle_stripe function. And if error happens to the batch_head at the same time, break_stripe_batch_list is called, then below warning could happen (the same report in [1]), it means a member of batch list was set with STRIPE_ACTIVE. [7028915.431770] stripe state: 2001 [7028915.431815] ------------[ cut here ]------------ [7028915.431828] WARNING: CPU: 18 PID: 29089 at drivers/md/raid5.c:4614 break_stripe_batch_list+0x203/0x240 [raid456] [...] [7028915.431879] CPU: 18 PID: 29089 Comm: kworker/u82:5 Tainted: G O 4.14.86-1-storage #4.14.86-1.2~deb9 [7028915.431881] Hardware name: Supermicro SSG-2028R-ACR24L/X10DRH-iT, BIOS 3.1 06/18/2018 [7028915.431888] Workqueue: raid5wq raid5_do_work [raid456] [7028915.431890] task: ffff9ab0ef36d7c0 task.stack: ffffb72926f84000 [7028915.431896] RIP: 0010:break_stripe_batch_list+0x203/0x240 [raid456] [7028915.431898] RSP: 0018:ffffb72926f87ba8 EFLAGS: 00010286 [7028915.431900] RAX: 0000000000000012 RBX: ffff9aaa84a98000 RCX: 0000000000000000 [7028915.431901] RDX: 0000000000000000 RSI: ffff9ab2bfa15458 RDI: ffff9ab2bfa15458 [7028915.431902] RBP: ffff9aaa8fb4e900 R08: 0000000000000001 R09: 0000000000002eb4 [7028915.431903] R10: 00000000ffffffff R11: 0000000000000000 R12: ffff9ab1736f1b00 [7028915.431904] R13: 0000000000000000 R14: ffff9aaa8fb4e900 R15: 0000000000000001 [7028915.431906] FS: 0000000000000000(0000) GS:ffff9ab2bfa00000(0000) knlGS:0000000000000000 [7028915.431907] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [7028915.431908] CR2: 00007ff953b9f5d8 CR3: 0000000bf4009002 CR4: 00000000003606e0 [7028915.431909] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [7028915.431910] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [7028915.431910] Call Trace: [7028915.431923] handle_stripe+0x8e7/0x2020 [raid456] [7028915.431930] ? __wake_up_common_lock+0x89/0xc0 [7028915.431935] handle_active_stripes.isra.58+0x35f/0x560 [raid456] [7028915.431939] raid5_do_work+0xc6/0x1f0 [raid456] Also commit 59fc630b8b5f9f ("RAID5: batch adjacent full stripe write") said "If a stripe is added to batch list, then only the first stripe of the list should be put to handle_list and run handle_stripe." So don't set STRIPE_HANDLE to stripe which is already in batch list, otherwise the stripe could be put to handle_list and run handle_stripe, then the above warning could be triggered. [1]. https://www.spinics.net/lists/raid/msg62552.html Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8ea8443e09d5..9fc6737e9713 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5727,7 +5727,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = false; } - set_bit(STRIPE_HANDLE, &sh->state); + if (!sh->batch_head) + set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && (bi->bi_opf & REQ_SYNC) && From c84a1372df929033cb1a0441fb57bd3932f39ac9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Sep 2019 16:30:02 +1000 Subject: [PATCH 3/6] md/raid0: avoid RAID0 data corruption due to layout confusion. If the drives in a RAID0 are not all the same size, the array is divided into zones. The first zone covers all drives, to the size of the smallest. The second zone covers all drives larger than the smallest, up to the size of the second smallest - etc. A change in Linux 3.14 unintentionally changed the layout for the second and subsequent zones. All the correct data is still stored, but each chunk may be assigned to a different device than in pre-3.14 kernels. This can lead to data corruption. It is not possible to determine what layout to use - it depends which kernel the data was written by. So we add a module parameter to allow the old (0) or new (1) layout to be specified, and refused to assemble an affected array if that parameter is not set. Fixes: 20d0189b1012 ("block: Introduce new bio_split()") cc: stable@vger.kernel.org (3.14+) Acked-by: Guoqing Jiang Signed-off-by: NeilBrown Signed-off-by: Song Liu --- drivers/md/raid0.c | 32 +++++++++++++++++++++++++++++++- drivers/md/raid0.h | 14 ++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index bc422eae2c95..ec611abda835 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -19,6 +19,9 @@ #include "raid0.h" #include "raid5.h" +static int default_layout = 0; +module_param(default_layout, int, 0644); + #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ @@ -139,6 +142,19 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); + + if (conf->nr_strip_zones == 1) { + conf->layout = RAID0_ORIG_LAYOUT; + } else if (default_layout == RAID0_ORIG_LAYOUT || + default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = default_layout; + } else { + pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", + mdname(mddev)); + pr_err("md/raid0: please set raid.default_layout to 1 or 2\n"); + err = -ENOTSUPP; + goto abort; + } /* * now since we have the hard sector sizes, we can make sure * chunk size is a multiple of that sector size @@ -547,10 +563,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) static bool raid0_make_request(struct mddev *mddev, struct bio *bio) { + struct r0conf *conf = mddev->private; struct strip_zone *zone; struct md_rdev *tmp_dev; sector_t bio_sector; sector_t sector; + sector_t orig_sector; unsigned chunk_sects; unsigned sectors; @@ -584,8 +602,20 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio = split; } + orig_sector = sector; zone = find_zone(mddev->private, §or); - tmp_dev = map_sector(mddev, zone, sector, §or); + switch (conf->layout) { + case RAID0_ORIG_LAYOUT: + tmp_dev = map_sector(mddev, zone, orig_sector, §or); + break; + case RAID0_ALT_MULTIZONE_LAYOUT: + tmp_dev = map_sector(mddev, zone, sector, §or); + break; + default: + WARN("md/raid0:%s: Invalid layout\n", mdname(mddev)); + bio_io_error(bio); + return true; + } if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) { bio_io_error(bio); diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 540e65d92642..3816e5477db1 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -8,11 +8,25 @@ struct strip_zone { int nb_dev; /* # of devices attached to the zone */ }; +/* Linux 3.14 (20d0189b101) made an unintended change to + * the RAID0 layout for multi-zone arrays (where devices aren't all + * the same size. + * RAID0_ORIG_LAYOUT restores the original layout + * RAID0_ALT_MULTIZONE_LAYOUT uses the altered layout + * The layouts are identical when there is only one zone (all + * devices the same size). + */ + +enum r0layout { + RAID0_ORIG_LAYOUT = 1, + RAID0_ALT_MULTIZONE_LAYOUT = 2, +}; struct r0conf { struct strip_zone *strip_zone; struct md_rdev **devlist; /* lists of rdevs, pointed to * by strip_zone->dev */ int nr_strip_zones; + enum r0layout layout; }; #endif From 33f2c35a54dfd75ad0e7e86918dcbe4de799a56c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Sep 2019 16:52:29 +1000 Subject: [PATCH 4/6] md: add feature flag MD_FEATURE_RAID0_LAYOUT Due to a bug introduced in Linux 3.14 we cannot determine the correctly layout for a multi-zone RAID0 array - there are two possibilities. It is possible to tell the kernel which to chose using a module parameter, but this can be clumsy to use. It would be best if the choice were recorded in the metadata. So add a feature flag for this purpose. If it is set, then the 'layout' field of the superblock is used to determine which layout to use. If this flag is not set, then mddev->layout gets set to -1, which causes the module parameter to be required. Acked-by: Guoqing Jiang Signed-off-by: NeilBrown Signed-off-by: Song Liu --- drivers/md/md.c | 13 +++++++++++++ drivers/md/raid0.c | 3 +++ include/uapi/linux/raid/md_p.h | 2 ++ 3 files changed, 18 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 73d5a1b04022..1be7abeb24fd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1237,6 +1237,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0) + mddev->layout = -1; if (sb->state & (1<recovery_cp = MaxSector; @@ -1652,6 +1654,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && + sb->level != 0) + return -EINVAL; + if (!refdev) { ret = 1; } else { @@ -1762,6 +1768,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0 && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) + mddev->layout = -1; + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); @@ -6898,6 +6908,9 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; + if (mddev->level == 0) + /* Cannot trust RAID0 layout info here */ + mddev->layout = -1; mddev->chunk_sectors = info->chunk_size >> 9; if (mddev->persistent) { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ec611abda835..f61693e59684 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -145,6 +145,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (conf->nr_strip_zones == 1) { conf->layout = RAID0_ORIG_LAYOUT; + } else if (mddev->layout == RAID0_ORIG_LAYOUT || + mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = mddev->layout; } else if (default_layout == RAID0_ORIG_LAYOUT || default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { conf->layout = default_layout; diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index b0d15c73f6d7..1f2d8c81f0e0 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -329,6 +329,7 @@ struct mdp_superblock_1 { #define MD_FEATURE_JOURNAL 512 /* support write cache */ #define MD_FEATURE_PPL 1024 /* support PPL */ #define MD_FEATURE_MULTIPLE_PPLS 2048 /* support for multiple PPLs */ +#define MD_FEATURE_RAID0_LAYOUT 4096 /* layout is meaningful for RAID0 */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -341,6 +342,7 @@ struct mdp_superblock_1 { |MD_FEATURE_JOURNAL \ |MD_FEATURE_PPL \ |MD_FEATURE_MULTIPLE_PPLS \ + |MD_FEATURE_RAID0_LAYOUT \ ) struct r5l_payload_header { From feb9bf9849e2aa0dd2833285af7c25aee07df7bb Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 12 Sep 2019 12:10:15 +0200 Subject: [PATCH 5/6] raid5: remove STRIPE_OPS_REQ_PENDING This stripe state is not used anymore after commit 51acbcec6c42b24 ("md: remove CONFIG_MULTICORE_RAID456"), so remove the obsoleted state. gjiang@nb01257:~/md$ grep STRIPE_OPS_REQ_PENDING drivers/md/ -r drivers/md/raid5.c: (1 << STRIPE_OPS_REQ_PENDING) | drivers/md/raid5.h: STRIPE_OPS_REQ_PENDING, Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 1 - drivers/md/raid5.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9fc6737e9713..223e97ab27e6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4621,7 +4621,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, (1 << STRIPE_FULL_WRITE) | (1 << STRIPE_BIOFILL_RUN) | (1 << STRIPE_COMPUTE_RUN) | - (1 << STRIPE_OPS_REQ_PENDING) | (1 << STRIPE_DISCARD) | (1 << STRIPE_BATCH_READY) | (1 << STRIPE_BATCH_ERR) | diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index cf991f13403e..877e7d3f4bd1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -357,7 +357,6 @@ enum { STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ STRIPE_BIOFILL_RUN, STRIPE_COMPUTE_RUN, - STRIPE_OPS_REQ_PENDING, STRIPE_ON_UNPLUG_LIST, STRIPE_DISCARD, STRIPE_ON_RELEASE_LIST, From 067df25c83902e2950ef24fed713f0fa38282f34 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 12 Sep 2019 12:10:16 +0200 Subject: [PATCH 6/6] raid5: use bio_end_sector in r5_next_bio Actually, we calculate bio's end sector here, so use the common way for the purpose. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 877e7d3f4bd1..f90e0704bed9 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -492,9 +492,7 @@ struct disk_info { */ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) { - int sectors = bio_sectors(bio); - - if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) + if (bio_end_sector(bio) < sector + STRIPE_SECTORS) return bio->bi_next; else return NULL;