From 9fb407179c6fd910005040bebb040094ef959b6c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 21 Feb 2021 18:28:05 -0800 Subject: [PATCH 01/21] block: Remove unused blk_pm_*() function definitions Commit a1ce35fa4985 ("block: remove dead elevator code") removed the last callers of blk_pm_requeue_request(), blk_pm_add_request() and blk_pm_put_request(). Hence remove the definitions of these functions. Removing these functions removes all users of the struct request nr_pending member. Hence also remove 'nr_pending'. Note: 'nr_pending' is no longer used since commit 7cedffec8e75 ("block: Make blk_get_request() block for non-PM requests while suspended"). Cc: Alan Stern Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-pm.h | 38 -------------------------------------- include/linux/blkdev.h | 1 - 2 files changed, 39 deletions(-) diff --git a/block/blk-pm.h b/block/blk-pm.h index a2283cc9f716..8a5a0d4b357f 100644 --- a/block/blk-pm.h +++ b/block/blk-pm.h @@ -21,31 +21,6 @@ static inline void blk_pm_mark_last_busy(struct request *rq) if (rq->q->dev && !(rq->rq_flags & RQF_PM)) pm_runtime_mark_last_busy(rq->q->dev); } - -static inline void blk_pm_requeue_request(struct request *rq) -{ - lockdep_assert_held(&rq->q->queue_lock); - - if (rq->q->dev && !(rq->rq_flags & RQF_PM)) - rq->q->nr_pending--; -} - -static inline void blk_pm_add_request(struct request_queue *q, - struct request *rq) -{ - lockdep_assert_held(&q->queue_lock); - - if (q->dev && !(rq->rq_flags & RQF_PM)) - q->nr_pending++; -} - -static inline void blk_pm_put_request(struct request *rq) -{ - lockdep_assert_held(&rq->q->queue_lock); - - if (rq->q->dev && !(rq->rq_flags & RQF_PM)) - --rq->q->nr_pending; -} #else static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { @@ -55,19 +30,6 @@ static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) static inline void blk_pm_mark_last_busy(struct request *rq) { } - -static inline void blk_pm_requeue_request(struct request *rq) -{ -} - -static inline void blk_pm_add_request(struct request_queue *q, - struct request *rq) -{ -} - -static inline void blk_pm_put_request(struct request *rq) -{ -} #endif #endif /* _BLOCK_BLK_PM_H_ */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9149f4a5adb3..f2e0697258b8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -462,7 +462,6 @@ struct request_queue { #ifdef CONFIG_PM struct device *dev; enum rpm_status rpm_status; - unsigned int nr_pending; #endif /* From 179d1600723670dc0d6ae8ce572e0e2c44b64763 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 21 Feb 2021 21:29:55 -0800 Subject: [PATCH 02/21] block: remove superfluous param in blk_fill_rwbs() The last parameter for the function blk_fill_rwbs() was added in 5782138e47 ("tracing/events: convert block trace points to TRACE_EVENT()") in order to signal read request and use of that parameter was replaced with using switch case REQ_OP_READ with 1b9a9ab78b0 ("blktrace: use op accessors"), but the parameter was never removed. Remove the unused parameter and adjust the respective call sites. Fixes: 1b9a9ab78b0 ("blktrace: use op accessors") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 2 +- include/trace/events/bcache.h | 10 +++++----- include/trace/events/block.h | 16 ++++++++-------- kernel/trace/blktrace.c | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 05556573b896..11484f1d19a1 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -119,7 +119,7 @@ struct compat_blk_user_trace_setup { #endif -extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); +extern void blk_fill_rwbs(char *rwbs, unsigned int op); static inline sector_t blk_rq_trace_sector(struct request *rq) { diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index e41c611d6d3b..899fdacf57b9 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->sector = bio->bi_iter.bi_sector; __entry->orig_sector = bio->bi_iter.bi_sector - 16; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", @@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u", @@ -137,7 +137,7 @@ TRACE_EVENT(bcache_read, __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); __entry->cache_hit = hit; __entry->bypass = bypass; ), @@ -168,7 +168,7 @@ TRACE_EVENT(bcache_write, __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); __entry->writeback = writeback; __entry->bypass = bypass; ), @@ -238,7 +238,7 @@ TRACE_EVENT(bcache_journal_write, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; __entry->nr_keys = keys; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u keys %u", diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 0d782663a005..879cba8bdfca 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -89,7 +89,7 @@ TRACE_EVENT(block_rq_requeue, __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), @@ -133,7 +133,7 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector = nr_bytes >> 9; __entry->error = error; - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), @@ -166,7 +166,7 @@ DECLARE_EVENT_CLASS(block_rq, __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -249,7 +249,7 @@ TRACE_EVENT(block_bio_complete, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = blk_status_to_errno(bio->bi_status); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u [%d]", @@ -276,7 +276,7 @@ DECLARE_EVENT_CLASS(block_bio, __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -433,7 +433,7 @@ TRACE_EVENT(block_split, __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -474,7 +474,7 @@ TRACE_EVENT(block_bio_remap, __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", @@ -518,7 +518,7 @@ TRACE_EVENT(block_rq_remap, __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); - blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e9ee4945043..8a2591c7aa41 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1867,7 +1867,7 @@ void blk_trace_remove_sysfs(struct device *dev) #ifdef CONFIG_EVENT_TRACING -void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) +void blk_fill_rwbs(char *rwbs, unsigned int op) { int i = 0; From 1f83bb4b491472310ae7aeca505ed3725149906c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 21 Feb 2021 21:29:56 -0800 Subject: [PATCH 03/21] blktrace: add blk_fill_rwbs documentation comment blk_fill_rwbs() is an expoted function, add kernel style documentation comment. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 2 +- kernel/trace/blktrace.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 11484f1d19a1..e17d04abf6a3 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -119,7 +119,7 @@ struct compat_blk_user_trace_setup { #endif -extern void blk_fill_rwbs(char *rwbs, unsigned int op); +void blk_fill_rwbs(char *rwbs, unsigned int op); static inline sector_t blk_rq_trace_sector(struct request *rq) { diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8a2591c7aa41..d7ebef83771c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1867,6 +1867,16 @@ void blk_trace_remove_sysfs(struct device *dev) #ifdef CONFIG_EVENT_TRACING +/** + * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. + * @rwbs buffer to be filled + * @op: REQ_OP_XXX for the tracepoint + * + * Description: + * Maps the REQ_OP_XXX to character and fills the buffer provided by the + * caller with resulting string. + * + **/ void blk_fill_rwbs(char *rwbs, unsigned int op) { int i = 0; From c7ff651960a6ef11cef55479658aff504c34872f Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 21 Feb 2021 21:29:57 -0800 Subject: [PATCH 04/21] blktrace: fix blk_rq_issue documentation The commit 881245dcff29 ("Add DocBook documentation for the block tracepoints.") added the comment for blk_rq_issue() tracepoint. Remove the duplicate word from the tracepoint documentation. Fixes: 881245dcff29 ("Add DocBook documentation for the block tracepoints.") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/trace/events/block.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 879cba8bdfca..004cfe34ef37 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -196,7 +196,7 @@ DEFINE_EVENT(block_rq, block_rq_insert, /** * block_rq_issue - issue pending block IO request operation to device driver - * @rq: block IO operation operation request + * @rq: block IO operation request * * Called when block operation request @rq from queue @q is sent to a * device driver for processing. From b0719245098c27b36a9b52969af0300ae6219591 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 21 Feb 2021 21:29:58 -0800 Subject: [PATCH 05/21] blktrace: fix blk_rq_merge documentation The commit f3bdc62fd82e ("blktrace: Provide event for request merging") added the comment for blk_rq_merge() tracepoint. Remove the duplicate word from the tracepoint documentation. Fixes: f3bdc62fd82e ("blktrace: Provide event for request merging") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/trace/events/block.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 004cfe34ef37..cc5ab96a7471 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -210,7 +210,7 @@ DEFINE_EVENT(block_rq, block_rq_issue, /** * block_rq_merge - merge request with another one in the elevator - * @rq: block IO operation operation request + * @rq: block IO operation request * * Called when block operation request @rq from queue @q is merged to another * request queued in the elevator. From b357e4a694ac4b95096715df253548f7e1f2723f Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 21 Feb 2021 21:29:59 -0800 Subject: [PATCH 06/21] block: get rid of the trace rq insert wrapper Get rid of the wrapper for trace_block_rq_insert() and call the function directly. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 +++- block/blk-core.c | 1 + block/blk-mq-sched.c | 6 ------ block/blk-mq-sched.h | 1 - block/kyber-iosched.c | 4 +++- block/mq-deadline.c | 4 +++- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b398dde53af9..ec482e6641ff 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -125,6 +125,8 @@ #include #include +#include + #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" @@ -5621,7 +5623,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_unlock_irq(&bfqd->lock); - blk_mq_sched_request_inserted(rq); + trace_block_rq_insert(rq); spin_lock_irq(&bfqd->lock); bfqq = bfq_init_rq(rq); diff --git a/block/blk-core.c b/block/blk-core.c index 5e752840b41a..fc60ff208497 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -59,6 +59,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); DEFINE_IDA(blk_queue_ida); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index deff4e826e23..ddb65e9e6fd9 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -384,12 +384,6 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); -void blk_mq_sched_request_inserted(struct request *rq) -{ - trace_block_rq_insert(rq); -} -EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); - static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, bool has_sched, struct request *rq) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 0476360f05f1..5b18ab915c65 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -7,7 +7,6 @@ void blk_mq_sched_assign_ioc(struct request *rq); -void blk_mq_sched_request_inserted(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index c25c41d0d061..f13da10953bf 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -13,6 +13,8 @@ #include #include +#include + #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -602,7 +604,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, list_move_tail(&rq->queuelist, head); sbitmap_set_bit(&khd->kcq_map[sched_domain], rq->mq_ctx->index_hw[hctx->type]); - blk_mq_sched_request_inserted(rq); + trace_block_rq_insert(rq); spin_unlock(&kcq->lock); } } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b57470e154c8..f3631a287466 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -18,6 +18,8 @@ #include #include +#include + #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -496,7 +498,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, if (blk_mq_sched_try_insert_merge(q, rq)) return; - blk_mq_sched_request_inserted(rq); + trace_block_rq_insert(rq); if (at_head || blk_rq_is_passthrough(rq)) { if (at_head) From 6b09b4d33bd964f49d07d3cabfb4204d58cf9811 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 22 Feb 2021 14:54:52 +0800 Subject: [PATCH 07/21] block: fix potential IO hang when turning off io_poll QUEUE_FLAG_POLL flag will be cleared when turning off 'io_poll', while at that moment there may be IOs stuck in hw queue uncompleted. The following polling routine won't help reap these IOs, since blk_poll() will return immediately because of cleared QUEUE_FLAG_POLL flag. Thus these IOs will hang until they finnaly time out. The hang out can be observed by 'fio --engine=io_uring iodepth=1', while turning off 'io_poll' at the same time. To fix this, freeze and flush the request queue first when turning off 'io_poll'. Signed-off-by: Jeffle Xu Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index ae39c7f3d83d..0f4f0c8a7825 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -434,10 +434,13 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, if (ret < 0) return ret; - if (poll_on) + if (poll_on) { blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else + } else { + blk_mq_freeze_queue(q); blk_queue_flag_clear(QUEUE_FLAG_POLL, q); + blk_mq_unfreeze_queue(q); + } return ret; } From 4ceddce55eb35d15b0f87f5dcf6f0058fd15d3a4 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Mon, 22 Feb 2021 12:41:23 -0300 Subject: [PATCH 08/21] loop: fix I/O error on fsync() in detached loop devices There's an I/O error on fsync() in a detached loop device if it has been previously attached. The issue is write cache is enabled in the attach path in loop_configure() but it isn't disabled in the detach path; thus it remains enabled in the block device regardless of whether it is attached or not. Now fsync() can get an I/O request that will just be failed later in loop_queue_rq() as device's state is not 'Lo_bound'. So, disable write cache in the detach path. Do so based on the queue flag, not the loop device flag for read-only (used to enable) as the queue flag can be changed via sysfs even on read-only loop devices (e.g., losetup -r.) Test-case: # DEV=/dev/loop7 # IMG=/tmp/image # truncate --size 1M $IMG # losetup $DEV $IMG # losetup -d $DEV Before: # strace -e fsync parted -s $DEV print 2>&1 | grep fsync fsync(3) = -1 EIO (Input/output error) Warning: Error fsyncing/closing /dev/loop7: Input/output error [ 982.529929] blk_update_request: I/O error, dev loop7, sector 0 op 0x1:(WRITE) flags 0x800 phys_seg 0 prio class 0 After: # strace -e fsync parted -s $DEV print 2>&1 | grep fsync fsync(3) = 0 Co-developed-by: Eric Desrochers Signed-off-by: Eric Desrochers Signed-off-by: Mauricio Faria de Oliveira Tested-by: Gabriel Krisman Bertazi Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 578fc034db3f..81d20ec93375 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1212,6 +1212,9 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) goto out_unlock; } + if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags)) + blk_queue_write_cache(lo->lo_queue, false, false); + /* freeze request queue during the transition */ blk_mq_freeze_queue(lo->lo_queue); From ffa772cfe9356ce94d3061335c2681f60e7c1c5b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 5 Feb 2021 01:13:10 -0800 Subject: [PATCH 09/21] kyber: introduce kyber_depth_updated() Hang occurs when user changes the scheduler queue depth, by writing to the 'nr_requests' sysfs file of that device. The details of the environment that we found the problem are as follows: an eMMC block device total driver tags: 16 default queue_depth: 32 kqd->async_depth initialized in kyber_init_sched() with queue_depth=32 Then we change queue_depth to 256, by writing to the 'nr_requests' sysfs file. But kqd->async_depth don't be updated after queue_depth changes. Now the value of async depth is too small for queue_depth=256, this may cause hang. This patch introduces kyber_depth_updated(), so that kyber can update async depth when queue depth changes. Signed-off-by: Yang Yang Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index f13da10953bf..33d34d69cade 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -355,19 +355,9 @@ static void kyber_timer_fn(struct timer_list *t) } } -static unsigned int kyber_sched_tags_shift(struct request_queue *q) -{ - /* - * All of the hardware queues have the same depth, so we can just grab - * the shift of the first one. - */ - return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift; -} - static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) { struct kyber_queue_data *kqd; - unsigned int shift; int ret = -ENOMEM; int i; @@ -402,9 +392,6 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) kqd->latency_targets[i] = kyber_latency_targets[i]; } - shift = kyber_sched_tags_shift(q); - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; - return kqd; err_buckets: @@ -460,9 +447,19 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) INIT_LIST_HEAD(&kcq->rq_list[i]); } -static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags->sb.shift; + + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); +} + +static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ struct kyber_hctx_data *khd; int i; @@ -504,8 +501,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; - sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags, - kqd->async_depth); + kyber_depth_updated(hctx); return 0; @@ -1024,6 +1020,7 @@ static struct elevator_type kyber_sched = { .completed_request = kyber_completed_request, .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, + .depth_updated = kyber_depth_updated, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = kyber_queue_debugfs_attrs, From c9a2f90f4d6b9d42b9912f7aaf68e8d748acfffd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Feb 2021 15:09:53 -0500 Subject: [PATCH 10/21] nbd: handle device refs for DESTROY_ON_DISCONNECT properly There exists a race where we can be attempting to create a new nbd configuration while a previous configuration is going down, both configured with DESTROY_ON_DISCONNECT. Normally devices all have a reference of 1, as they won't be cleaned up until the module is torn down. However with DESTROY_ON_DISCONNECT we'll make sure that there is only 1 reference (generally) on the device for the config itself, and then once the config is dropped, the device is torn down. The race that exists looks like this TASK1 TASK2 nbd_genl_connect() idr_find() refcount_inc_not_zero(nbd) * count is 2 here ^^ nbd_config_put() nbd_put(nbd) (count is 1) setup new config check DESTROY_ON_DISCONNECT put_dev = true if (put_dev) nbd_put(nbd) * free'd here ^^ In nbd_genl_connect() we assume that the nbd ref count will be 2, however clearly that won't be true if the nbd device had been setup as DESTROY_ON_DISCONNECT with its prior configuration. Fix this by getting rid of the runtime flag to check if we need to mess with the nbd device refcount, and use the device NBD_DESTROY_ON_DISCONNECT flag to check if we need to adjust the ref counts. This was reported by syzkaller with the following kasan dump BUG: KASAN: use-after-free in instrument_atomic_read include/linux/instrumented.h:71 [inline] BUG: KASAN: use-after-free in atomic_read include/asm-generic/atomic-instrumented.h:27 [inline] BUG: KASAN: use-after-free in refcount_dec_not_one+0x71/0x1e0 lib/refcount.c:76 Read of size 4 at addr ffff888143bf71a0 by task systemd-udevd/8451 CPU: 0 PID: 8451 Comm: systemd-udevd Not tainted 5.11.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 print_address_description.constprop.0.cold+0x5b/0x2f8 mm/kasan/report.c:230 __kasan_report mm/kasan/report.c:396 [inline] kasan_report.cold+0x79/0xd5 mm/kasan/report.c:413 check_memory_region_inline mm/kasan/generic.c:179 [inline] check_memory_region+0x13d/0x180 mm/kasan/generic.c:185 instrument_atomic_read include/linux/instrumented.h:71 [inline] atomic_read include/asm-generic/atomic-instrumented.h:27 [inline] refcount_dec_not_one+0x71/0x1e0 lib/refcount.c:76 refcount_dec_and_mutex_lock+0x19/0x140 lib/refcount.c:115 nbd_put drivers/block/nbd.c:248 [inline] nbd_release+0x116/0x190 drivers/block/nbd.c:1508 __blkdev_put+0x548/0x800 fs/block_dev.c:1579 blkdev_put+0x92/0x570 fs/block_dev.c:1632 blkdev_close+0x8c/0xb0 fs/block_dev.c:1640 __fput+0x283/0x920 fs/file_table.c:280 task_work_run+0xdd/0x190 kernel/task_work.c:140 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:174 [inline] exit_to_user_mode_prepare+0x249/0x250 kernel/entry/common.c:201 __syscall_exit_to_user_mode_work kernel/entry/common.c:283 [inline] syscall_exit_to_user_mode+0x19/0x50 kernel/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fc1e92b5270 Code: 73 01 c3 48 8b 0d 38 7d 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 c1 20 00 00 75 10 b8 03 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ee fb ff ff 48 89 04 24 RSP: 002b:00007ffe8beb2d18 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 0000000000000007 RCX: 00007fc1e92b5270 RDX: 000000000aba9500 RSI: 0000000000000000 RDI: 0000000000000007 RBP: 00007fc1ea16f710 R08: 000000000000004a R09: 0000000000000008 R10: 0000562f8cb0b2a8 R11: 0000000000000246 R12: 0000000000000000 R13: 0000562f8cb0afd0 R14: 0000000000000003 R15: 000000000000000e Allocated by task 1: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:46 [inline] set_alloc_info mm/kasan/common.c:401 [inline] ____kasan_kmalloc.constprop.0+0x82/0xa0 mm/kasan/common.c:429 kmalloc include/linux/slab.h:552 [inline] kzalloc include/linux/slab.h:682 [inline] nbd_dev_add+0x44/0x8e0 drivers/block/nbd.c:1673 nbd_init+0x250/0x271 drivers/block/nbd.c:2394 do_one_initcall+0x103/0x650 init/main.c:1223 do_initcall_level init/main.c:1296 [inline] do_initcalls init/main.c:1312 [inline] do_basic_setup init/main.c:1332 [inline] kernel_init_freeable+0x605/0x689 init/main.c:1533 kernel_init+0xd/0x1b8 init/main.c:1421 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296 Freed by task 8451: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_set_track+0x1c/0x30 mm/kasan/common.c:46 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:356 ____kasan_slab_free+0xe1/0x110 mm/kasan/common.c:362 kasan_slab_free include/linux/kasan.h:192 [inline] slab_free_hook mm/slub.c:1547 [inline] slab_free_freelist_hook+0x5d/0x150 mm/slub.c:1580 slab_free mm/slub.c:3143 [inline] kfree+0xdb/0x3b0 mm/slub.c:4139 nbd_dev_remove drivers/block/nbd.c:243 [inline] nbd_put.part.0+0x180/0x1d0 drivers/block/nbd.c:251 nbd_put drivers/block/nbd.c:295 [inline] nbd_config_put+0x6dd/0x8c0 drivers/block/nbd.c:1242 nbd_release+0x103/0x190 drivers/block/nbd.c:1507 __blkdev_put+0x548/0x800 fs/block_dev.c:1579 blkdev_put+0x92/0x570 fs/block_dev.c:1632 blkdev_close+0x8c/0xb0 fs/block_dev.c:1640 __fput+0x283/0x920 fs/file_table.c:280 task_work_run+0xdd/0x190 kernel/task_work.c:140 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:174 [inline] exit_to_user_mode_prepare+0x249/0x250 kernel/entry/common.c:201 __syscall_exit_to_user_mode_work kernel/entry/common.c:283 [inline] syscall_exit_to_user_mode+0x19/0x50 kernel/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888143bf7000 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 416 bytes inside of 1024-byte region [ffff888143bf7000, ffff888143bf7400) The buggy address belongs to the page: page:000000005238f4ce refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x143bf0 head:000000005238f4ce order:3 compound_mapcount:0 compound_pincount:0 flags: 0x57ff00000010200(slab|head) raw: 057ff00000010200 ffffea00004b1400 0000000300000003 ffff888010c41140 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888143bf7080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888143bf7100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff888143bf7180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888143bf7200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Reported-and-tested-by: syzbot+429d3f82d757c211bff3@syzkaller.appspotmail.com Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 8b9622eb0a21..4ff71b579cfc 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -78,8 +78,7 @@ struct link_dead_args { #define NBD_RT_HAS_PID_FILE 3 #define NBD_RT_HAS_CONFIG_REF 4 #define NBD_RT_BOUND 5 -#define NBD_RT_DESTROY_ON_DISCONNECT 6 -#define NBD_RT_DISCONNECT_ON_CLOSE 7 +#define NBD_RT_DISCONNECT_ON_CLOSE 6 #define NBD_DESTROY_ON_DISCONNECT 0 #define NBD_DISCONNECT_REQUESTED 1 @@ -1904,12 +1903,21 @@ again: if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - set_bit(NBD_RT_DESTROY_ON_DISCONNECT, - &config->runtime_flags); - set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); - put_dev = true; + /* + * We have 1 ref to keep the device around, and then 1 + * ref for our current operation here, which will be + * inherited by the config. If we already have + * DESTROY_ON_DISCONNECT set then we know we don't have + * that extra ref already held so we don't need the + * put_dev. + */ + if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, + &nbd->flags)) + put_dev = true; } else { - clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); + if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, + &nbd->flags)) + refcount_inc(&nbd->refs); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { set_bit(NBD_RT_DISCONNECT_ON_CLOSE, @@ -2080,15 +2088,13 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT, - &config->runtime_flags)) + if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, + &nbd->flags)) put_dev = true; - set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } else { - if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT, - &config->runtime_flags)) + if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, + &nbd->flags)) refcount_inc(&nbd->refs); - clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { From c0ea57608b691d6cde8aff23e11f9858a86b5918 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 Feb 2021 16:52:47 +0100 Subject: [PATCH 11/21] blktrace: remove debugfs file dentries from struct blk_trace These debugfs dentries do not need to be saved for anything as the whole directory and everything in it is properly cleaned up when the parent directory is removed. So remove them from struct blk_trace and don't save them when created as it's not necessary. Cc: Jens Axboe Cc: Steven Rostedt Cc: Ingo Molnar Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 2 -- kernel/trace/blktrace.c | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index e17d04abf6a3..a083e15df608 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -23,8 +23,6 @@ struct blk_trace { u32 pid; u32 dev; struct dentry *dir; - struct dentry *dropped_file; - struct dentry *msg_file; struct list_head running_list; atomic_t dropped; }; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d7ebef83771c..0c8690d9f6cd 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -311,8 +311,6 @@ record_it: static void blk_trace_free(struct blk_trace *bt) { - debugfs_remove(bt->msg_file); - debugfs_remove(bt->dropped_file); relay_close(bt->rchan); debugfs_remove(bt->dir); free_percpu(bt->sequence); @@ -544,10 +542,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, INIT_LIST_HEAD(&bt->running_list); ret = -EIO; - bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, - &blk_dropped_fops); - - bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); + debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); From 75ab6afacda01a6bd2d3ecd4cb8485f7c8fa2fdb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Feb 2021 15:41:25 -0700 Subject: [PATCH 12/21] block: don't skip empty device in in disk_uevent Restore the previous behavior by using the correct flag for the whole device ("part0"). Fixes: 99dfc43ecbf6 ("block: use ->bi_bdev for bio based I/O accounting") Reported-by: John Stultz Tested-by: John Stultz Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 36ff45bbaaaf..6379b000576c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -476,7 +476,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) struct disk_part_iter piter; struct block_device *part; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY_PART0); while ((part = disk_part_iter_next(&piter))) kobject_uevent(bdev_kobj(part), action); disk_part_iter_exit(&piter); From 4601b4b130de2329fe06df80ed5d77265f2058e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 Feb 2021 16:18:22 +0100 Subject: [PATCH 13/21] block: reopen the device in blkdev_reread_part Historically the BLKRRPART ioctls called into the now defunct ->revalidate method, which caused the sd driver to check if any media is present. When the ->revalidate method was removed this revalidation was lost, leading to lots of I/O errors when using the eject command. Fix this by reopening the device to rescan the partitions, and thus calling the revalidation logic in the sd driver. Fixes: 471bd0af544b ("sd: use bdev_check_media_change") Reported--by: Tom Seewald Signed-off-by: Christoph Hellwig Tested-by: Tom Seewald Reviewed-by: Ming Lei Reviewed-by: Minwoo Im Signed-off-by: Jens Axboe --- block/ioctl.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index d61d652078f4..ff241e663c01 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -81,20 +81,27 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif -static int blkdev_reread_part(struct block_device *bdev) +static int blkdev_reread_part(struct block_device *bdev, fmode_t mode) { - int ret; + struct block_device *tmp; if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - mutex_lock(&bdev->bd_mutex); - ret = bdev_disk_changed(bdev, false); - mutex_unlock(&bdev->bd_mutex); + /* + * Reopen the device to revalidate the driver state and force a + * partition rescan. + */ + mode &= ~FMODE_EXCL; + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - return ret; + tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + blkdev_put(tmp, mode); + return 0; } static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, @@ -498,7 +505,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: - return blkdev_reread_part(bdev); + return blkdev_reread_part(bdev, mode); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: From 97f433c3601a24d3513d06f575a389a2ca4e11e4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 23 Feb 2021 19:25:30 -0700 Subject: [PATCH 14/21] blk-settings: align max_sectors on "logical_block_size" boundary We get I/O errors when we run md-raid1 on the top of dm-integrity on the top of ramdisk. device-mapper: integrity: Bio not aligned on 8 sectors: 0xff00, 0xff device-mapper: integrity: Bio not aligned on 8 sectors: 0xff00, 0xff device-mapper: integrity: Bio not aligned on 8 sectors: 0xffff, 0x1 device-mapper: integrity: Bio not aligned on 8 sectors: 0xffff, 0x1 device-mapper: integrity: Bio not aligned on 8 sectors: 0x8048, 0xff device-mapper: integrity: Bio not aligned on 8 sectors: 0x8147, 0xff device-mapper: integrity: Bio not aligned on 8 sectors: 0x8246, 0xff device-mapper: integrity: Bio not aligned on 8 sectors: 0x8345, 0xbb The ramdisk device has logical_block_size 512 and max_sectors 255. The dm-integrity device uses logical_block_size 4096 and it doesn't affect the "max_sectors" value - thus, it inherits 255 from the ramdisk. So, we have a device with max_sectors not aligned on logical_block_size. The md-raid device sees that the underlying leg has max_sectors 255 and it will split the bios on 255-sector boundary, making the bios unaligned on logical_block_size. In order to fix the bug, we round down max_sectors to logical_block_size. Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- block/blk-settings.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index 7dd8be314ac6..b4aa2f37fab6 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -504,6 +504,14 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); +static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) +{ + sectors = round_down(sectors, lbs >> SECTOR_SHIFT); + if (sectors < PAGE_SIZE >> SECTOR_SHIFT) + sectors = PAGE_SIZE >> SECTOR_SHIFT; + return sectors; +} + /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) @@ -630,6 +638,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ret = -1; } + t->max_sectors = blk_round_down_sectors(t->max_sectors, t->logical_block_size); + t->max_hw_sectors = blk_round_down_sectors(t->max_hw_sectors, t->logical_block_size); + t->max_dev_sectors = blk_round_down_sectors(t->max_dev_sectors, t->logical_block_size); + /* Discard alignment and granularity */ if (b->discard_granularity) { alignment = queue_limit_discard_alignment(b, start); From 452c0bf8754fbeffdf579465b82a3c2bbe373c95 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 23 Feb 2021 16:50:15 +0800 Subject: [PATCH 15/21] block: fix logging on capacity change Local variable of 'capacity' stores the previous disk capacity, and 'size' variable records the latest disk capacity, so swap them for fixing logging on capacity change. Cc: Christoph Hellwig Fixes: a782483cc1f8 ("block: remove the nr_sects field in struct hd_struct") Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 6379b000576c..fcc530164b5a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -74,7 +74,7 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) return false; pr_info("%s: detected capacity change from %lld to %lld\n", - disk->disk_name, size, capacity); + disk->disk_name, capacity, size); /* * Historically we did not send a uevent for changes to/from an empty From 5407334c53e9922c1c3fb28801e489d0b74f2c8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Feb 2021 08:24:04 +0100 Subject: [PATCH 16/21] block-crypto-fallback: use a bio_set for splitting bios bio_split with a NULL bs argumen used to fall back to kmalloc the bio, which does not guarantee forward progress and could to deadlocks. Now that the overloading of the NULL bs argument to bio_alloc_bioset has been removed it crashes instead. Fix all that by using a special crafted bioset. Fixes: 3175199ab0ac ("block: split bio_kmalloc from bio_alloc_bioset") Reported-by: John Stultz Signed-off-by: Christoph Hellwig Tested-by: John Stultz Signed-off-by: Jens Axboe --- block/blk-crypto-fallback.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index e8327c50d7c9..c176b7af56a7 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -80,6 +80,7 @@ static struct blk_crypto_keyslot { static struct blk_keyslot_manager blk_crypto_ksm; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; +static struct bio_set crypto_bio_split; /* * This is the key we set when evicting a keyslot. This *should* be the all 0's @@ -224,7 +225,8 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) if (num_sectors < bio_sectors(bio)) { struct bio *split_bio; - split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL); + split_bio = bio_split(bio, num_sectors, GFP_NOIO, + &crypto_bio_split); if (!split_bio) { bio->bi_status = BLK_STS_RESOURCE; return false; @@ -538,9 +540,13 @@ static int blk_crypto_fallback_init(void) prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); - err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) goto out; + + err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + if (err) + goto fail_free_bioset; err = -ENOMEM; blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; @@ -591,6 +597,8 @@ fail_free_wq: destroy_workqueue(blk_crypto_wq); fail_free_ksm: blk_ksm_destroy(&blk_crypto_ksm); +fail_free_bioset: + bioset_exit(&crypto_bio_split); out: return err; } From b90994c6ab623baf9268df9710692f14920ce9d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Feb 2021 08:24:05 +0100 Subject: [PATCH 17/21] block: fix bounce_clone_bio for passthrough bios Now that bio_alloc_bioset does not fall back to kmalloc for a NULL bio_set, handle that case explicitly and simplify the calling conventions. Based on an earlier patch from Chaitanya Kulkarni. Fixes: 3175199ab0ac ("block: split bio_kmalloc from bio_alloc_bioset") Reported-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bounce.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index fc55314aa426..79d81221446d 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -214,8 +214,7 @@ static void bounce_end_io_read_isa(struct bio *bio) __bounce_end_io_read(bio, &isa_page_pool); } -static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) +static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask) { struct bvec_iter iter; struct bio_vec bv; @@ -242,8 +241,11 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, * asking for trouble and would force extra work on * __bio_clone_fast() anyways. */ - - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); + if (bio_is_passthrough(bio_src)) + bio = bio_kmalloc(gfp_mask, bio_segments(bio_src)); + else + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), + &bounce_bio_set); if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; @@ -296,7 +298,6 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, unsigned i = 0; bool bounce = false; int sectors = 0; - bool passthrough = bio_is_passthrough(*bio_orig); bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_PAGES) @@ -307,14 +308,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, if (!bounce) return; - if (!passthrough && sectors < bio_sectors(*bio_orig)) { + if (!bio_is_passthrough(*bio_orig) && + sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); submit_bio_noacct(*bio_orig); *bio_orig = bio; } - bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : - &bounce_bio_set); + bio = bounce_clone_bio(*bio_orig, GFP_NOIO); /* * Bvec table can't be updated by bio_for_each_segment_all(), From ebfe4183c77ed18e1d4237ad3b13f32114d9ae1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Feb 2021 08:24:06 +0100 Subject: [PATCH 18/21] block: remove the gfp_mask argument to bounce_clone_bio The only caller always passes GFP_NOIO. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bounce.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index 79d81221446d..417faaac36b6 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -214,7 +214,7 @@ static void bounce_end_io_read_isa(struct bio *bio) __bounce_end_io_read(bio, &isa_page_pool); } -static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask) +static struct bio *bounce_clone_bio(struct bio *bio_src) { struct bvec_iter iter; struct bio_vec bv; @@ -242,9 +242,9 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask) * __bio_clone_fast() anyways. */ if (bio_is_passthrough(bio_src)) - bio = bio_kmalloc(gfp_mask, bio_segments(bio_src)); + bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src)); else - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), + bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), &bounce_bio_set); if (!bio) return NULL; @@ -271,11 +271,11 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask) break; } - if (bio_crypt_clone(bio, bio_src, gfp_mask) < 0) + if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0) goto err_put; if (bio_integrity(bio_src) && - bio_integrity_clone(bio, bio_src, gfp_mask) < 0) + bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0) goto err_put; bio_clone_blkg_association(bio, bio_src); @@ -315,7 +315,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, submit_bio_noacct(*bio_orig); *bio_orig = bio; } - bio = bounce_clone_bio(*bio_orig, GFP_NOIO); + bio = bounce_clone_bio(*bio_orig); /* * Bvec table can't be updated by bio_for_each_segment_all(), From 47dc096ac183f465ffb03e86a203a38661695d72 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Feb 2021 08:24:07 +0100 Subject: [PATCH 19/21] block: memory allocations in bounce_clone_bio must not fail The caller can't cope with a failure from bounce_clone_bio, so use __GFP_NOFAIL for the passthrough case. bio_alloc_bioset already won't fail due to the use of mempools. And yes, we need to get rid of this bock layer bouncing code entirely sooner or later.. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bounce.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index 417faaac36b6..87983a35079c 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -242,12 +242,11 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) * __bio_clone_fast() anyways. */ if (bio_is_passthrough(bio_src)) - bio = bio_kmalloc(GFP_NOIO, bio_segments(bio_src)); + bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, + bio_segments(bio_src)); else bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), &bounce_bio_set); - if (!bio) - return NULL; bio->bi_bdev = bio_src->bi_bdev; if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); From 94d4bffdda21baa2c749bc229c41811a7559dd15 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 23 Feb 2021 23:16:23 -0800 Subject: [PATCH 20/21] blktrace: fix documentation for blk_fill_rw() Add missing ":" after rwbs function parameter documentation that fixes following warning :- ./kernel/trace/blktrace.c:1877: warning: Function parameter or member 'rwbs' not described in 'blk_fill_rwbs' Reported-by: Stephen Rothwell Fixes: 1f83bb4b4914 ("blktrace: add blk_fill_rwbs documentation comment") Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0c8690d9f6cd..ca6f0ceba09b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1865,7 +1865,7 @@ void blk_trace_remove_sysfs(struct device *dev) /** * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. - * @rwbs buffer to be filled + * @rwbs: buffer to be filled * @op: REQ_OP_XXX for the tracepoint * * Description: From 5f7136db82996089cdfb2939c7664b29e9da141d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Jan 2021 04:38:57 +0000 Subject: [PATCH 21/21] block: Add bio_max_segs It's often inconvenient to use BIO_MAX_PAGES due to min() requiring the sign to be the same. Introduce bio_max_segs() and change BIO_MAX_PAGES to be unsigned to make it easier for the users. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Jens Axboe --- block/blk-map.c | 4 +--- drivers/block/xen-blkback/blkback.c | 4 +--- drivers/md/dm-io.c | 4 ++-- drivers/md/dm-log-writes.c | 10 +++++----- drivers/nvme/target/io-cmd-bdev.c | 8 ++++---- drivers/nvme/target/passthru.c | 4 ++-- drivers/target/target_core_iblock.c | 9 +++------ drivers/target/target_core_pscsi.c | 2 +- fs/block_dev.c | 10 +++++----- fs/direct-io.c | 2 +- fs/erofs/data.c | 4 +--- fs/ext4/readpage.c | 3 +-- fs/f2fs/data.c | 3 +-- fs/f2fs/node.c | 2 +- fs/iomap/buffered-io.c | 4 ++-- fs/mpage.c | 4 +--- fs/nfs/blocklayout/blocklayout.c | 6 +++--- fs/xfs/xfs_bio_io.c | 2 +- fs/xfs/xfs_buf.c | 4 ++-- include/linux/bio.h | 7 ++++++- 20 files changed, 44 insertions(+), 52 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 21630dccac62..369e204d14d0 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -150,9 +150,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bmd->is_our_pages = !map_data; bmd->is_null_mapped = (map_data && map_data->null_mapped); - nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - if (nr_pages > BIO_MAX_PAGES) - nr_pages = BIO_MAX_PAGES; + nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE)); ret = -ENOMEM; bio = bio_kmalloc(gfp_mask, nr_pages); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index da16121140ca..1cdf09ff67b6 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1326,9 +1326,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, pages[i]->page, seg[i].nsec << 9, seg[i].offset) == 0)) { - - int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES); - bio = bio_alloc(GFP_KERNEL, nr_iovecs); + bio = bio_alloc(GFP_KERNEL, bio_max_segs(nseg - i)); if (unlikely(bio == NULL)) goto fail_put_bio; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 4312007d2d34..2d3cda0acacb 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -341,8 +341,8 @@ static void do_region(int op, int op_flags, unsigned region, num_bvecs = 1; break; default: - num_bvecs = min_t(int, BIO_MAX_PAGES, - dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); + num_bvecs = bio_max_segs(dm_sector_div_up(remaining, + (PAGE_SIZE >> SECTOR_SHIFT))); } bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, &io->client->bios); diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index e3d35c6c9f71..57882654ffee 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -264,15 +264,14 @@ static int write_inline_data(struct log_writes_c *lc, void *entry, size_t entrylen, void *data, size_t datalen, sector_t sector) { - int num_pages, bio_pages, pg_datalen, pg_sectorlen, i; + int bio_pages, pg_datalen, pg_sectorlen, i; struct page *page; struct bio *bio; size_t ret; void *ptr; while (datalen) { - num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT; - bio_pages = min(num_pages, BIO_MAX_PAGES); + bio_pages = bio_max_segs(DIV_ROUND_UP(datalen, PAGE_SIZE)); atomic_inc(&lc->io_blocks); @@ -364,7 +363,7 @@ static int log_one_block(struct log_writes_c *lc, goto out; atomic_inc(&lc->io_blocks); - bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_KERNEL, bio_max_segs(block->vec_cnt)); if (!bio) { DMERR("Couldn't alloc log bio"); goto error; @@ -386,7 +385,8 @@ static int log_one_block(struct log_writes_c *lc, if (ret != block->vecs[i].bv_len) { atomic_inc(&lc->io_blocks); submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_KERNEL, + bio_max_segs(block->vec_cnt - i)); if (!bio) { DMERR("Couldn't alloc log bio"); goto error; diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 3d9a5d3ed9cd..9a8b3726a37c 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -185,7 +185,7 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio, } bip = bio_integrity_alloc(bio, GFP_NOIO, - min_t(unsigned int, req->metadata_sg_cnt, BIO_MAX_PAGES)); + bio_max_segs(req->metadata_sg_cnt)); if (IS_ERR(bip)) { pr_err("Unable to allocate bio_integrity_payload\n"); return PTR_ERR(bip); @@ -225,7 +225,7 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio, static void nvmet_bdev_execute_rw(struct nvmet_req *req) { - int sg_cnt = req->sg_cnt; + unsigned int sg_cnt = req->sg_cnt; struct bio *bio; struct scatterlist *sg; struct blk_plug plug; @@ -262,7 +262,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) bio = &req->b.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); } else { - bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); } bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; @@ -289,7 +289,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) } } - bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt)); bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio->bi_opf = op; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index f50c7b2bf21c..26c587ccd152 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -26,7 +26,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl; u16 status = NVME_SC_SUCCESS; struct nvme_id_ctrl *id; - int max_hw_sectors; + unsigned int max_hw_sectors; int page_shift; id = kzalloc(sizeof(*id), GFP_KERNEL); @@ -198,7 +198,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) bio = &req->p.inline_bio; bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); } else { - bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_KERNEL, bio_max_segs(req->sg_cnt)); bio->bi_end_io = bio_put; } bio->bi_opf = req_op(rq); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 8ed93fd205c7..ee3d52061281 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -315,10 +315,8 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int op, * Only allocate as many vector entries as the bio code allows us to, * we'll loop later on until we have handled the whole request. */ - if (sg_num > BIO_MAX_PAGES) - sg_num = BIO_MAX_PAGES; - - bio = bio_alloc_bioset(GFP_NOIO, sg_num, &ib_dev->ibd_bio_set); + bio = bio_alloc_bioset(GFP_NOIO, bio_max_segs(sg_num), + &ib_dev->ibd_bio_set); if (!bio) { pr_err("Unable to allocate memory for bio\n"); return NULL; @@ -638,8 +636,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio, return -ENODEV; } - bip = bio_integrity_alloc(bio, GFP_NOIO, - min_t(unsigned int, cmd->t_prot_nents, BIO_MAX_PAGES)); + bip = bio_integrity_alloc(bio, GFP_NOIO, bio_max_segs(cmd->t_prot_nents)); if (IS_ERR(bip)) { pr_err("Unable to allocate bio_integrity_payload\n"); return PTR_ERR(bip); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 33770e5808ce..3cbc074992bc 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -881,7 +881,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, if (!bio) { new_bio: - nr_vecs = min_t(int, BIO_MAX_PAGES, nr_pages); + nr_vecs = bio_max_segs(nr_pages); nr_pages -= nr_vecs; /* * Calls bio_kmalloc() and sets bio->bi_end_io() diff --git a/fs/block_dev.c b/fs/block_dev.c index ec26179c8062..0f95ff343d6b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -221,7 +221,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio) static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, - int nr_pages) + unsigned int nr_pages) { struct file *file = iocb->ki_filp; struct block_device *bdev = I_BDEV(bdev_file_inode(file)); @@ -355,8 +355,8 @@ static void blkdev_bio_end_io(struct bio *bio) } } -static ssize_t -__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) +static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + unsigned int nr_pages) { struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); @@ -486,7 +486,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { - int nr_pages; + unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; @@ -495,7 +495,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); + return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } static __init int blkdev_init(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index aa1083ecd623..c9639b4166c2 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -695,7 +695,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, if (ret) goto out; sector = start_sector << (sdio->blkbits - 9); - nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES); + nr_pages = bio_max_segs(sdio->pages_in_io); BUG_ON(nr_pages <= 0); dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index ea4f693bee22..f88851c5c250 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -215,10 +215,8 @@ submit_bio_retry: /* max # of continuous pages */ if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); - if (nblocks > BIO_MAX_PAGES) - nblocks = BIO_MAX_PAGES; - bio = bio_alloc(GFP_NOIO, nblocks); + bio = bio_alloc(GFP_NOIO, bio_max_segs(nblocks)); bio->bi_end_io = erofs_readendio; bio_set_dev(bio, sb->s_bdev); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index f014c5e473a9..3db923403505 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -371,8 +371,7 @@ int ext4_mpage_readpages(struct inode *inode, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages)); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, page->index); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b9721c8f116c..7c95818639a6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -969,8 +969,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned int post_read_steps = 0; bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES), - &f2fs_bioset); + bio_max_segs(nr_pages), &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a8a0fb890e8d..4b0e2e3c2c88 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2747,7 +2747,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, BIO_MAX_PAGES); + nrpages = bio_max_segs(last_offset - i); /* readahead node pages */ f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 16a1e82e3aeb..0d9d1a6a947e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -278,14 +278,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; - int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); if (ctx->bio) submit_bio(ctx->bio); if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); + ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs)); /* * If the bio_alloc fails, try it again for a single page to * avoid having to deal with partial page reads. This emulates diff --git a/fs/mpage.c b/fs/mpage.c index 830e6cc2a9e7..961234d68779 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -304,9 +304,7 @@ alloc_new: goto out; } args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - min_t(int, args->nr_pages, - BIO_MAX_PAGES), - gfp); + bio_max_segs(args->nr_pages), gfp); if (args->bio == NULL) goto confused; } diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 1a96ce28efb0..fe860c538747 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -115,13 +115,13 @@ bl_submit_bio(struct bio *bio) return NULL; } -static struct bio * -bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, +static struct bio *bl_alloc_init_bio(unsigned int npg, + struct block_device *bdev, sector_t disk_sector, bio_end_io_t end_io, struct parallel_io *par) { struct bio *bio; - npg = min(npg, BIO_MAX_PAGES); + npg = bio_max_segs(npg); bio = bio_alloc(GFP_NOIO, npg); if (bio) { bio->bi_iter.bi_sector = disk_sector; diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index e2148f2d5d6b..17f36db2f792 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -6,7 +6,7 @@ static inline unsigned int bio_max_vecs(unsigned int count) { - return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES); + return bio_max_segs(howmany(count, PAGE_SIZE)); } int diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f6e5235df7c9..37a1d12762d8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1480,7 +1480,7 @@ xfs_buf_ioapply_map( int op) { int page_index; - int total_nr_pages = bp->b_page_count; + unsigned int total_nr_pages = bp->b_page_count; int nr_pages; struct bio *bio; sector_t sector = bp->b_maps[map].bm_bn; @@ -1505,7 +1505,7 @@ xfs_buf_ioapply_map( next_chunk: atomic_inc(&bp->b_io_remaining); - nr_pages = min(total_nr_pages, BIO_MAX_PAGES); + nr_pages = bio_max_segs(total_nr_pages); bio = bio_alloc(GFP_NOIO, nr_pages); bio_set_dev(bio, bp->b_target->bt_bdev); diff --git a/include/linux/bio.h b/include/linux/bio.h index 5b468f2242ff..983ed2fe7c85 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -20,7 +20,12 @@ #define BIO_BUG_ON #endif -#define BIO_MAX_PAGES 256 +#define BIO_MAX_PAGES 256U + +static inline unsigned int bio_max_segs(unsigned int nr_segs) +{ + return min(nr_segs, BIO_MAX_PAGES); +} #define bio_prio(bio) (bio)->bi_ioprio #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio)