From f91389c8d2867d10a206273b8eb3b7955b65591e Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 8 Nov 2018 12:11:06 +0000 Subject: [PATCH 1/9] md: remove set but not used variable 'bi_rdev' Fixes gcc '-Wunused-but-set-variable' warning: drivers/md/md.c: In function 'md_integrity_add_rdev': drivers/md/md.c:2149:24: warning: variable 'bi_rdev' set but not used [-Wunused-but-set-variable] It not used any more after commit 1501efadc524 ("md/raid: only permit hot-add of compatible integrity profiles") Signed-off-by: Yue Haibing Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index fc488cb30a94..69778419aa20 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2148,14 +2148,12 @@ EXPORT_SYMBOL(md_integrity_register); */ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { - struct blk_integrity *bi_rdev; struct blk_integrity *bi_mddev; char name[BDEVNAME_SIZE]; if (!mddev->gendisk) return 0; - bi_rdev = bdev_get_integrity(rdev->bdev); bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ From e731f3e28b7e7d1c745b03084e01036ee00018eb Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Mon, 12 Nov 2018 15:22:16 -0800 Subject: [PATCH 2/9] lib/raid6: add missing include for raid6test Add #include for gettimeofday() to fix the compiler warning about an implicitly defined functions. Signed-off-by: Daniel Verkamp Signed-off-by: Shaohua Li --- include/linux/raid/pq.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index ea8505204fdf..0c245dcb8b48 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -35,6 +35,7 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; #include #include #include +#include #include /* Not standard, but glibc defines it */ From 58af3110a7c5d161f72f94a98c6f2b9b75bf5cf9 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Mon, 12 Nov 2018 15:22:17 -0800 Subject: [PATCH 3/9] lib/raid6: avoid __attribute_const__ redefinition This is defined in glibc's sys/cdefs.h on my system with the same definition as the raid6test fallback definition. Add a #ifndef check to avoid a compiler warning about redefining it. Signed-off-by: Daniel Verkamp Signed-off-by: Shaohua Li --- include/linux/raid/pq.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 0c245dcb8b48..d7c99161bba2 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -53,7 +53,9 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; #define __init #define __exit -#define __attribute_const__ __attribute__((const)) +#ifndef __attribute_const__ +# define __attribute_const__ __attribute__((const)) +#endif #define noinline __attribute__((noinline)) #define preempt_enable() From 86919f9dd2dbb88bbbe4e381dc8e6b9e9a1b9eb5 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Mon, 12 Nov 2018 15:22:18 -0800 Subject: [PATCH 4/9] lib/raid6: check for assembler SSSE3 support Allow the x86 SSSE3 recovery function to be tested in raid6test. Signed-off-by: Daniel Verkamp Signed-off-by: Shaohua Li --- lib/raid6/test/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 79777645cac9..3ab8720aa2f8 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -34,6 +34,9 @@ endif ifeq ($(IS_X86),yes) OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o + CFLAGS += $(shell echo "pshufb %xmm0, %xmm0" | \ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo -DCONFIG_AS_SSSE3=1) CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ gcc -c -x assembler - >&/dev/null && \ rm ./-.o && echo -DCONFIG_AS_AVX2=1) From 0437de4fa09fe59b57d12b785e4afb73b0f34c05 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Mon, 12 Nov 2018 15:26:51 -0800 Subject: [PATCH 5/9] lib/raid6: sort algos in rough performance order Sort the list of RAID6 algorithms in roughly decreasing order of expected performance: newer instruction sets first (within each architecture) and wider unrollings first. This doesn't make any difference right now, since all functions are benchmarked; a follow-up change will make use of this by optionally choosing the first valid function rather than testing all of them. The Itanium raid6_intx{16,32} entries are also moved down to be near the other raid6_intx entries for clarity. Signed-off-by: Daniel Verkamp Signed-off-by: Shaohua Li --- lib/raid6/algos.c | 78 +++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 5065b1e7e327..a753ff56670f 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -34,64 +34,64 @@ struct raid6_calls raid6_call; EXPORT_SYMBOL_GPL(raid6_call); const struct raid6_calls * const raid6_algos[] = { -#if defined(__ia64__) - &raid6_intx16, - &raid6_intx32, -#endif #if defined(__i386__) && !defined(__arch_um__) - &raid6_mmxx1, - &raid6_mmxx2, - &raid6_sse1x1, - &raid6_sse1x2, - &raid6_sse2x1, - &raid6_sse2x2, -#ifdef CONFIG_AS_AVX2 - &raid6_avx2x1, - &raid6_avx2x2, -#endif #ifdef CONFIG_AS_AVX512 - &raid6_avx512x1, &raid6_avx512x2, + &raid6_avx512x1, #endif +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x2, + &raid6_avx2x1, +#endif + &raid6_sse2x2, + &raid6_sse2x1, + &raid6_sse1x2, + &raid6_sse1x1, + &raid6_mmxx2, + &raid6_mmxx1, #endif #if defined(__x86_64__) && !defined(__arch_um__) - &raid6_sse2x1, - &raid6_sse2x2, - &raid6_sse2x4, -#ifdef CONFIG_AS_AVX2 - &raid6_avx2x1, - &raid6_avx2x2, - &raid6_avx2x4, -#endif #ifdef CONFIG_AS_AVX512 - &raid6_avx512x1, - &raid6_avx512x2, &raid6_avx512x4, + &raid6_avx512x2, + &raid6_avx512x1, #endif +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x4, + &raid6_avx2x2, + &raid6_avx2x1, +#endif + &raid6_sse2x4, + &raid6_sse2x2, + &raid6_sse2x1, #endif #ifdef CONFIG_ALTIVEC - &raid6_altivec1, - &raid6_altivec2, - &raid6_altivec4, - &raid6_altivec8, - &raid6_vpermxor1, - &raid6_vpermxor2, - &raid6_vpermxor4, &raid6_vpermxor8, + &raid6_vpermxor4, + &raid6_vpermxor2, + &raid6_vpermxor1, + &raid6_altivec8, + &raid6_altivec4, + &raid6_altivec2, + &raid6_altivec1, #endif #if defined(CONFIG_S390) &raid6_s390vx8, #endif - &raid6_intx1, - &raid6_intx2, - &raid6_intx4, - &raid6_intx8, #ifdef CONFIG_KERNEL_MODE_NEON - &raid6_neonx1, - &raid6_neonx2, - &raid6_neonx4, &raid6_neonx8, + &raid6_neonx4, + &raid6_neonx2, + &raid6_neonx1, #endif +#if defined(__ia64__) + &raid6_intx32, + &raid6_intx16, +#endif + &raid6_intx8, + &raid6_intx4, + &raid6_intx2, + &raid6_intx1, NULL }; From be85f93ae2df32dea0b20908316f1d894c3e0f64 Mon Sep 17 00:00:00 2001 From: Daniel Verkamp Date: Mon, 12 Nov 2018 15:26:52 -0800 Subject: [PATCH 6/9] lib/raid6: add option to skip algo benchmarking This is helpful for systems where fast startup time is important. It is especially nice to avoid benchmarking RAID functions that are never used (for example, BTRFS selects RAID6_PQ even if the parity RAID mode is not in use). This saves 250+ milliseconds of boot time on modern x86 and ARM systems with a dozen or more available implementations. The new option is defaulted to 'y' to match the previous behavior of always benchmarking on init. Signed-off-by: Daniel Verkamp Signed-off-by: Shaohua Li --- include/linux/raid/pq.h | 3 +++ lib/Kconfig | 8 ++++++++ lib/raid6/algos.c | 5 +++++ 3 files changed, 16 insertions(+) diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index d7c99161bba2..605cf46c17bd 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -70,6 +70,9 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; #define MODULE_DESCRIPTION(desc) #define subsys_initcall(x) #define module_exit(x) + +#define IS_ENABLED(x) (x) +#define CONFIG_RAID6_PQ_BENCHMARK 1 #endif /* __KERNEL__ */ /* Routine choices */ diff --git a/lib/Kconfig b/lib/Kconfig index a9965f4af4dd..fcb05305a5a2 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -10,6 +10,14 @@ menu "Library routines" config RAID6_PQ tristate +config RAID6_PQ_BENCHMARK + bool "Automatically choose fastest RAID6 PQ functions" + depends on RAID6_PQ + default y + help + Benchmark all available RAID6 PQ functions on init and choose the + fastest one. + config BITREVERSE tristate diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index a753ff56670f..7e4f7a8ffa8e 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -163,6 +163,11 @@ static inline const struct raid6_calls *raid6_choose_gen( if ((*algo)->valid && !(*algo)->valid()) continue; + if (!IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) { + best = *algo; + break; + } + perf = 0; preempt_disable(); From 37b22c28946046e587c12b2b5bdaba7062e23f75 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 14 Nov 2018 07:33:09 +0800 Subject: [PATCH 7/9] md: remvoe redundant condition check mempool_destroy() can handle NULL pointer correctly, so there is no need to check NULL pointer before calling mempool_destroy(). Signed-off-by: Chengguang Xu Signed-off-by: Shaohua Li --- drivers/md/md.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 69778419aa20..4f9b5827355e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5692,14 +5692,10 @@ int md_run(struct mddev *mddev) return 0; abort: - if (mddev->flush_bio_pool) { - mempool_destroy(mddev->flush_bio_pool); - mddev->flush_bio_pool = NULL; - } - if (mddev->flush_pool){ - mempool_destroy(mddev->flush_pool); - mddev->flush_pool = NULL; - } + mempool_destroy(mddev->flush_bio_pool); + mddev->flush_bio_pool = NULL; + mempool_destroy(mddev->flush_pool); + mddev->flush_pool = NULL; return err; } From caea3c47ad515210129f06b6bbe53f82f69efebe Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 7 Dec 2018 18:24:21 +0800 Subject: [PATCH 8/9] raid10: refactor common wait code from regular read/write request Both raid10_read_request and raid10_write_request share the same code at the beginning of them, so introduce regular_request_wait to clean up code, and call it in both request functions. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 72 ++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b98e746e7fc4..76c92e31afc0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1124,6 +1124,29 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } +/* + * 1. Register the new request and wait if the reconstruction thread has put + * up a bar for new requests. Continue immediately if no resync is active + * currently. + * 2. If IO spans the reshape position. Need to wait for reshape to pass. + */ +static void regular_request_wait(struct mddev *mddev, struct r10conf *conf, + struct bio *bio, sector_t sectors) +{ + wait_barrier(conf); + while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + bio->bi_iter.bi_sector < conf->reshape_progress && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { + raid10_log(conf->mddev, "wait reshape"); + allow_barrier(conf); + wait_event(conf->wait_barrier, + conf->reshape_progress <= bio->bi_iter.bi_sector || + conf->reshape_progress >= bio->bi_iter.bi_sector + + sectors); + wait_barrier(conf); + } +} + static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio) { @@ -1132,7 +1155,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, const int op = bio_op(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); int max_sectors; - sector_t sectors; struct md_rdev *rdev; char b[BDEVNAME_SIZE]; int slot = r10_bio->read_slot; @@ -1166,30 +1188,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } rcu_read_unlock(); } - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - wait_barrier(conf); - - sectors = r10_bio->sectors; - while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - bio->bi_iter.bi_sector < conf->reshape_progress && - bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { - /* - * IO spans the reshape position. Need to wait for reshape to - * pass - */ - raid10_log(conf->mddev, "wait reshape"); - allow_barrier(conf); - wait_event(conf->wait_barrier, - conf->reshape_progress <= bio->bi_iter.bi_sector || - conf->reshape_progress >= bio->bi_iter.bi_sector + - sectors); - wait_barrier(conf); - } + regular_request_wait(mddev, conf, bio, r10_bio->sectors); rdev = read_balance(conf, r10_bio, &max_sectors); if (!rdev) { if (err_rdev) { @@ -1332,30 +1332,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, finish_wait(&conf->wait_barrier, &w); } - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - wait_barrier(conf); - sectors = r10_bio->sectors; - while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - bio->bi_iter.bi_sector < conf->reshape_progress && - bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { - /* - * IO spans the reshape position. Need to wait for reshape to - * pass - */ - raid10_log(conf->mddev, "wait reshape"); - allow_barrier(conf); - wait_event(conf->wait_barrier, - conf->reshape_progress <= bio->bi_iter.bi_sector || - conf->reshape_progress >= bio->bi_iter.bi_sector + - sectors); - wait_barrier(conf); - } - + regular_request_wait(mddev, conf, bio, sectors); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && (mddev->reshape_backwards ? (bio->bi_iter.bi_sector < conf->reshape_safe && From e820d55cb99dd93ac2dc949cf486bb187e5cd70d Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 19 Dec 2018 14:19:25 +0800 Subject: [PATCH 9/9] md: fix raid10 hang issue caused by barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When both regular IO and resync IO happen at the same time, and if we also need to split regular. Then we can see tasks hang due to barrier. 1. resync thread [ 1463.757205] INFO: task md1_resync:5215 blocked for more than 480 seconds. [ 1463.757207] Not tainted 4.19.5-1-default #1 [ 1463.757209] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1463.757212] md1_resync D 0 5215 2 0x80000000 [ 1463.757216] Call Trace: [ 1463.757223] ? __schedule+0x29a/0x880 [ 1463.757231] ? raise_barrier+0x8d/0x140 [raid10] [ 1463.757236] schedule+0x78/0x110 [ 1463.757243] raise_barrier+0x8d/0x140 [raid10] [ 1463.757248] ? wait_woken+0x80/0x80 [ 1463.757257] raid10_sync_request+0x1f6/0x1e30 [raid10] [ 1463.757265] ? _raw_spin_unlock_irq+0x22/0x40 [ 1463.757284] ? is_mddev_idle+0x125/0x137 [md_mod] [ 1463.757302] md_do_sync.cold.78+0x404/0x969 [md_mod] [ 1463.757311] ? wait_woken+0x80/0x80 [ 1463.757336] ? md_rdev_init+0xb0/0xb0 [md_mod] [ 1463.757351] md_thread+0xe9/0x140 [md_mod] [ 1463.757358] ? _raw_spin_unlock_irqrestore+0x2e/0x60 [ 1463.757364] ? __kthread_parkme+0x4c/0x70 [ 1463.757369] kthread+0x112/0x130 [ 1463.757374] ? kthread_create_worker_on_cpu+0x40/0x40 [ 1463.757380] ret_from_fork+0x3a/0x50 2. regular IO [ 1463.760679] INFO: task kworker/0:8:5367 blocked for more than 480 seconds. [ 1463.760683] Not tainted 4.19.5-1-default #1 [ 1463.760684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1463.760687] kworker/0:8 D 0 5367 2 0x80000000 [ 1463.760718] Workqueue: md submit_flushes [md_mod] [ 1463.760721] Call Trace: [ 1463.760731] ? __schedule+0x29a/0x880 [ 1463.760741] ? wait_barrier+0xdd/0x170 [raid10] [ 1463.760746] schedule+0x78/0x110 [ 1463.760753] wait_barrier+0xdd/0x170 [raid10] [ 1463.760761] ? wait_woken+0x80/0x80 [ 1463.760768] raid10_write_request+0xf2/0x900 [raid10] [ 1463.760774] ? wait_woken+0x80/0x80 [ 1463.760778] ? mempool_alloc+0x55/0x160 [ 1463.760795] ? md_write_start+0xa9/0x270 [md_mod] [ 1463.760801] ? try_to_wake_up+0x44/0x470 [ 1463.760810] raid10_make_request+0xc1/0x120 [raid10] [ 1463.760816] ? wait_woken+0x80/0x80 [ 1463.760831] md_handle_request+0x121/0x190 [md_mod] [ 1463.760851] md_make_request+0x78/0x190 [md_mod] [ 1463.760860] generic_make_request+0x1c6/0x470 [ 1463.760870] raid10_write_request+0x77a/0x900 [raid10] [ 1463.760875] ? wait_woken+0x80/0x80 [ 1463.760879] ? mempool_alloc+0x55/0x160 [ 1463.760895] ? md_write_start+0xa9/0x270 [md_mod] [ 1463.760904] raid10_make_request+0xc1/0x120 [raid10] [ 1463.760910] ? wait_woken+0x80/0x80 [ 1463.760926] md_handle_request+0x121/0x190 [md_mod] [ 1463.760931] ? _raw_spin_unlock_irq+0x22/0x40 [ 1463.760936] ? finish_task_switch+0x74/0x260 [ 1463.760954] submit_flushes+0x21/0x40 [md_mod] So resync io is waiting for regular write io to complete to decrease nr_pending (conf->barrier++ is called before waiting). The regular write io splits another bio after call wait_barrier which call nr_pending++, then the splitted bio would continue with raid10_write_request -> wait_barrier, so the splitted bio has to wait for barrier to be zero, then deadlock happens as follows. resync io regular io raise_barrier wait_barrier generic_make_request wait_barrier To resolve the issue, we need to call allow_barrier to decrease nr_pending before generic_make_request since regular IO is not issued to underlying devices, and wait_barrier is called again to ensure no internal IO happening. Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.") Reported-and-tested-by: SiniĊĦa Bandin Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 76c92e31afc0..abb5d382f64d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct bio *split = bio_split(bio, max_sectors, gfp, &conf->bio_split); bio_chain(split, bio); + allow_barrier(conf); generic_make_request(bio); + wait_barrier(conf); bio = split; r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; @@ -1492,7 +1494,9 @@ retry_write: struct bio *split = bio_split(bio, r10_bio->sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, bio); + allow_barrier(conf); generic_make_request(bio); + wait_barrier(conf); bio = split; r10_bio->master_bio = bio; }