From 26bfeb26624071a47acb3a07097efe12044865f2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 16 Aug 2020 16:39:34 -0700 Subject: [PATCH 01/31] block: blk-mq.c: fix @at_head kernel-doc warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a kernel-doc warning in block/blk-mq.c: ../block/blk-mq.c:1844: warning: Function parameter or member 'at_head' not described in 'blk_mq_request_bypass_insert' Fixes: 01e99aeca397 ("blk-mq: insert passthrough request into hctx->dispatch directly") Signed-off-by: Randy Dunlap Cc: André Almeida Cc: Jens Axboe Cc: Ming Lei Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0015a1892153..35f8d0692442 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1834,6 +1834,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. + * @at_head: true if the request should be inserted at the head of the list. * @run_queue: If we should run the hardware queue after inserting the request. * * Should only be used carefully, when the caller knows we want to From 03ef5941a04c68b5eb8254493d09b6e899a377a3 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Mon, 17 Aug 2020 02:16:49 +0000 Subject: [PATCH 02/31] bsg-lib: convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Xu Wang Signed-off-by: Jens Axboe --- block/bsg-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index fb7b347f8010..d185396d88bb 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -378,7 +378,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, bset->timeout_fn = timeout; set = &bset->tag_set; - set->ops = &bsg_mq_ops, + set->ops = &bsg_mq_ops; set->nr_hw_queues = 1; set->queue_depth = 128; set->numa_node = NUMA_NO_NODE; From d7d8535f377e9ba87edbf7fbbd634ac942f3f54f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 17 Aug 2020 18:01:15 +0800 Subject: [PATCH 03/31] blk-mq: order adding requests to hctx->dispatch and checking SCHED_RESTART SCHED_RESTART code path is relied to re-run queue for dispatch requests in hctx->dispatch. Meantime the SCHED_RSTART flag is checked when adding requests to hctx->dispatch. memory barriers have to be used for ordering the following two pair of OPs: 1) adding requests to hctx->dispatch and checking SCHED_RESTART in blk_mq_dispatch_rq_list() 2) clearing SCHED_RESTART and checking if there is request in hctx->dispatch in blk_mq_sched_restart(). Without the added memory barrier, either: 1) blk_mq_sched_restart() may miss requests added to hctx->dispatch meantime blk_mq_dispatch_rq_list() observes SCHED_RESTART, and not run queue in dispatch side or 2) blk_mq_dispatch_rq_list still sees SCHED_RESTART, and not run queue in dispatch side, meantime checking if there is request in hctx->dispatch from blk_mq_sched_restart() is missed. IO hang in ltp/fs_fill test is reported by kernel test robot: https://lkml.org/lkml/2020/7/26/77 Turns out it is caused by the above out-of-order OPs. And the IO hang can't be observed any more after applying this patch. Fixes: bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers") Reported-by: kernel test robot Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Cc: Bart Van Assche Cc: Christoph Hellwig Cc: David Jeffery Cc: Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 9 +++++++++ block/blk-mq.c | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a19cdf159b75..d2790e5b06d1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -78,6 +78,15 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) return; clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + /* + * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch) + * in blk_mq_run_hw_queue(). Its pair is the barrier in + * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART, + * meantime new request added to hctx->dispatch is missed to check in + * blk_mq_run_hw_queue(). + */ + smp_mb(); + blk_mq_run_hw_queue(hctx, true); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 35f8d0692442..a80f4986e594 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1437,6 +1437,15 @@ out: list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); + /* + * Order adding requests to hctx->dispatch and checking + * SCHED_RESTART flag. The pair of this smp_mb() is the one + * in blk_mq_sched_restart(). Avoid restart code path to + * miss the new added requests to hctx->dispatch, meantime + * SCHED_RESTART is observed here. + */ + smp_mb(); + /* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another From bcb21c8cc9947286211327d663ace69f07d37a76 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 17 Aug 2020 18:01:30 +0800 Subject: [PATCH 04/31] block: loop: set discard granularity and alignment for block device backed loop In case of block device backend, if the backend supports write zeros, the loop device will set queue flag of QUEUE_FLAG_DISCARD. However, limits.discard_granularity isn't setup, and this way is wrong, see the following description in Documentation/ABI/testing/sysfs-block: A discard_granularity of 0 means that the device does not support discard functionality. Especially 9b15d109a6b2 ("block: improve discard bio alignment in __blkdev_issue_discard()") starts to take q->limits.discard_granularity for computing max discard sectors. And zero discard granularity may cause kernel oops, or fail discard request even though the loop queue claims discard support via QUEUE_FLAG_DISCARD. Fix the issue by setup discard granularity and alignment. Fixes: c52abf563049 ("loop: Better discard support for block devices") Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Acked-by: Coly Li Cc: Hannes Reinecke Cc: Xiao Ni Cc: Martin K. Petersen Cc: Evan Green Cc: Gwendal Grignou Cc: Chaitanya Kulkarni Cc: Andrzej Pietrasiewicz Cc: Christoph Hellwig Cc: Signed-off-by: Jens Axboe --- drivers/block/loop.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2f137d6ce169..3d7a1901bf28 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -878,6 +878,7 @@ static void loop_config_discard(struct loop_device *lo) struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct request_queue *q = lo->lo_queue; + u32 granularity, max_discard_sectors; /* * If the backing device is a block device, mirror its zeroing @@ -890,11 +891,10 @@ static void loop_config_discard(struct loop_device *lo) struct request_queue *backingq; backingq = bdev_get_queue(inode->i_bdev); - blk_queue_max_discard_sectors(q, - backingq->limits.max_write_zeroes_sectors); - blk_queue_max_write_zeroes_sectors(q, - backingq->limits.max_write_zeroes_sectors); + max_discard_sectors = backingq->limits.max_write_zeroes_sectors; + granularity = backingq->limits.discard_granularity ?: + queue_physical_block_size(backingq); /* * We use punch hole to reclaim the free space used by the @@ -903,23 +903,26 @@ static void loop_config_discard(struct loop_device *lo) * useful information. */ } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) { - q->limits.discard_granularity = 0; - q->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(q, 0); - blk_queue_max_write_zeroes_sectors(q, 0); + max_discard_sectors = 0; + granularity = 0; } else { - q->limits.discard_granularity = inode->i_sb->s_blocksize; - q->limits.discard_alignment = 0; - - blk_queue_max_discard_sectors(q, UINT_MAX >> 9); - blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); + max_discard_sectors = UINT_MAX >> 9; + granularity = inode->i_sb->s_blocksize; } - if (q->limits.max_write_zeroes_sectors) + if (max_discard_sectors) { + q->limits.discard_granularity = granularity; + blk_queue_max_discard_sectors(q, max_discard_sectors); + blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); - else + } else { + q->limits.discard_granularity = 0; + blk_queue_max_discard_sectors(q, 0); + blk_queue_max_write_zeroes_sectors(q, 0); blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); + } + q->limits.discard_alignment = 0; } static void loop_unprepare_queue(struct loop_device *lo) From 943b40c832beb71115e38a1c4d99b640b5342738 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 17 Aug 2020 17:52:39 +0800 Subject: [PATCH 05/31] block: respect queue limit of max discard segment When queue_max_discard_segments(q) is 1, blk_discard_mergable() will return false for discard request, then normal request merge is applied. However, only queue_max_segments() is checked, so max discard segment limit isn't respected. Check max discard segment limit in the request merge code for fixing the issue. Discard request failure of virtio_blk is fixed. Fixes: 69840466086d ("block: fix the DISCARD request merge") Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Cc: Stefano Garzarella Signed-off-by: Jens Axboe --- block/blk-merge.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 6529e3aab001..7af1f3668a91 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -533,10 +533,17 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(__blk_rq_map_sg); +static inline unsigned int blk_rq_get_max_segments(struct request *rq) +{ + if (req_op(rq) == REQ_OP_DISCARD) + return queue_max_discard_segments(rq->q); + return queue_max_segments(rq->q); +} + static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q)) + if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) goto no_merge; if (blk_integrity_merge_bio(req->q, req, bio) == false) @@ -624,7 +631,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; - if (total_phys_segments > queue_max_segments(q)) + if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; if (blk_integrity_merge_rq(q, req, next) == false) From af822aa68fbdf0a480a17462ed70232998127453 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 17 Aug 2020 17:52:40 +0800 Subject: [PATCH 06/31] block: virtio_blk: fix handling single range discard request 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support") starts to support multi-range discard for virtio-blk. However, the virtio-blk disk may report max discard segment as 1, at least that is exactly what qemu is doing. So far, block layer switches to normal request merge if max discard segment limit is 1, and multiple bios can be merged to single segment. This way may cause memory corruption in virtblk_setup_discard_write_zeroes(). Fix the issue by handling single max discard segment in straightforward way. Fixes: 1f23816b8eb8 ("virtio_blk: add discard and write zeroes support") Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Cc: Changpeng Liu Cc: Daniel Verkamp Cc: Michael S. Tsirkin Cc: Stefan Hajnoczi Cc: Stefano Garzarella Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 63b213e00b37..b2e48dac1ebd 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -126,16 +126,31 @@ static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap) if (!range) return -ENOMEM; - __rq_for_each_bio(bio, req) { - u64 sector = bio->bi_iter.bi_sector; - u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT; + /* + * Single max discard segment means multi-range discard isn't + * supported, and block layer only runs contiguity merge like + * normal RW request. So we can't reply on bio for retrieving + * each range info. + */ + if (queue_max_discard_segments(req->q) == 1) { + range[0].flags = cpu_to_le32(flags); + range[0].num_sectors = cpu_to_le32(blk_rq_sectors(req)); + range[0].sector = cpu_to_le64(blk_rq_pos(req)); + n = 1; + } else { + __rq_for_each_bio(bio, req) { + u64 sector = bio->bi_iter.bi_sector; + u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT; - range[n].flags = cpu_to_le32(flags); - range[n].num_sectors = cpu_to_le32(num_sectors); - range[n].sector = cpu_to_le64(sector); - n++; + range[n].flags = cpu_to_le32(flags); + range[n].num_sectors = cpu_to_le32(num_sectors); + range[n].sector = cpu_to_le64(sector); + n++; + } } + WARN_ON_ONCE(n != segments); + req->special_vec.bv_page = virt_to_page(range); req->special_vec.bv_offset = offset_in_page(range); req->special_vec.bv_len = sizeof(*range) * segments; From d81665198b83e55a28339d1f3e4890ed8a434556 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Aug 2020 20:52:06 +0100 Subject: [PATCH 07/31] block: Fix page_is_mergeable() for compound pages If we pass in an offset which is larger than PAGE_SIZE, then page_is_mergeable() thinks it's not mergeable with the previous bio_vec, leading to a large number of bio_vecs being used. Use a slightly more obvious test that the two pages are compatible with each other. Fixes: 52d52d1c98a9 ("block: only allow contiguous page structs in a bio_vec") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/bio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index c63ba04bd629..a9931f23d933 100644 --- a/block/bio.c +++ b/block/bio.c @@ -740,8 +740,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off, bool *same_page) { - phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + - bv->bv_offset + bv->bv_len - 1; + size_t bv_end = bv->bv_offset + bv->bv_len; + phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; phys_addr_t page_addr = page_to_phys(page); if (vec_end_addr + 1 != page_addr + off) @@ -750,9 +750,9 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (!*same_page && pfn_to_page(PFN_DOWN(vec_end_addr)) + 1 != page) - return false; - return true; + if (*same_page) + return true; + return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); } /* From 2de791ab4918969d8108f15238a701968375f235 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 11 Aug 2020 06:43:40 +0000 Subject: [PATCH 08/31] bfq: fix blkio cgroup leakage v4 Changes from v1: - update commit description with proper ref-accounting justification commit db37a34c563b ("block, bfq: get a ref to a group when adding it to a service tree") introduce leak forbfq_group and blkcg_gq objects because of get/put imbalance. In fact whole idea of original commit is wrong because bfq_group entity can not dissapear under us because it is referenced by child bfq_queue's entities from here: -> bfq_init_entity() ->bfqg_and_blkg_get(bfqg); ->entity->parent = bfqg->my_entity -> bfq_put_queue(bfqq) FINAL_PUT ->bfqg_and_blkg_put(bfqq_group(bfqq)) ->kmem_cache_free(bfq_pool, bfqq); So parent entity can not disappear while child entity is in tree, and child entities already has proper protection. This patch revert commit db37a34c563b ("block, bfq: get a ref to a group when adding it to a service tree") bfq_group leak trace caused by bad commit: -> blkg_alloc -> bfq_pq_alloc -> bfqg_get (+1) ->bfq_activate_bfqq ->bfq_activate_requeue_entity -> __bfq_activate_entity ->bfq_get_entity ->bfqg_and_blkg_get (+1) <==== : Note1 ->bfq_del_bfqq_busy ->bfq_deactivate_entity+0x53/0xc0 [bfq] ->__bfq_deactivate_entity+0x1b8/0x210 [bfq] -> bfq_forget_entity(is_in_service = true) entity->on_st_or_in_serv = false <=== :Note2 if (is_in_service) return; ==> do not touch reference -> blkcg_css_offline -> blkcg_destroy_blkgs -> blkg_destroy -> bfq_pd_offline -> __bfq_deactivate_entity if (!entity->on_st_or_in_serv) /* true, because (Note2) return false; -> bfq_pd_free -> bfqg_put() (-1, byt bfqg->ref == 2) because of (Note2) So bfq_group and blkcg_gq will leak forever, see test-case below. ##TESTCASE_BEGIN: #!/bin/bash max_iters=${1:-100} #prep cgroup mounts mount -t tmpfs cgroup_root /sys/fs/cgroup mkdir /sys/fs/cgroup/blkio mount -t cgroup -o blkio none /sys/fs/cgroup/blkio # Prepare blkdev grep blkio /proc/cgroups truncate -s 1M img losetup /dev/loop0 img echo bfq > /sys/block/loop0/queue/scheduler grep blkio /proc/cgroups for ((i=0;i /sys/fs/cgroup/blkio/a/cgroup.procs dd if=/dev/loop0 bs=4k count=1 of=/dev/null iflag=direct 2> /dev/null echo 0 > /sys/fs/cgroup/blkio/cgroup.procs rmdir /sys/fs/cgroup/blkio/a grep blkio /proc/cgroups done ##TESTCASE_END: Fixes: db37a34c563b ("block, bfq: get a ref to a group when adding it to a service tree") Tested-by: Oleksandr Natalenko Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 2 +- block/bfq-iosched.h | 1 - block/bfq-wf2q.c | 12 ++---------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 68882b9b8f11..b791e2041e49 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -332,7 +332,7 @@ static void bfqg_put(struct bfq_group *bfqg) kfree(bfqg); } -void bfqg_and_blkg_get(struct bfq_group *bfqg) +static void bfqg_and_blkg_get(struct bfq_group *bfqg) { /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ bfqg_get(bfqg); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index cd224aaf9f52..703895224562 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -986,7 +986,6 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); -void bfqg_and_blkg_get(struct bfq_group *bfqg); void bfqg_and_blkg_put(struct bfq_group *bfqg); #ifdef CONFIG_BFQ_GROUP_IOSCHED diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index eb0e2a6daabe..26776bdbdf36 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -533,9 +533,7 @@ static void bfq_get_entity(struct bfq_entity *entity) bfqq->ref++; bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", bfqq, bfqq->ref); - } else - bfqg_and_blkg_get(container_of(entity, struct bfq_group, - entity)); + } } /** @@ -649,14 +647,8 @@ static void bfq_forget_entity(struct bfq_service_tree *st, entity->on_st_or_in_serv = false; st->wsum -= entity->weight; - if (is_in_service) - return; - - if (bfqq) + if (bfqq && !is_in_service) bfq_put_queue(bfqq); - else - bfqg_and_blkg_put(container_of(entity, struct bfq_group, - entity)); } /** From 17bc10300c69bd51b82983cdadafa0a7791f074e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 17 Aug 2020 23:49:25 -0700 Subject: [PATCH 09/31] block/rnbd: Ensure err is always initialized in process_rdma Clang warns: drivers/block/rnbd/rnbd-srv.c:150:6: warning: variable 'err' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (IS_ERR(bio)) { ^~~~~~~~~~~ drivers/block/rnbd/rnbd-srv.c:177:9: note: uninitialized use occurs here return err; ^~~ drivers/block/rnbd/rnbd-srv.c:150:2: note: remove the 'if' if its condition is always false if (IS_ERR(bio)) { ^~~~~~~~~~~~~~~~~~ drivers/block/rnbd/rnbd-srv.c:126:9: note: initialize the variable 'err' to silence this warning int err; ^ = 0 1 warning generated. err is indeed uninitialized when this statement is taken. Ensure that it is assigned the error value of bio before jumping to the error handling label. Fixes: 735d77d4fd28 ("rnbd: remove rnbd_dev_submit_io") Reported-by: Brooke Basile Signed-off-by: Nathan Chancellor Acked-by: Jack Wang Link: https://github.com/ClangBuiltLinux/linux/issues/1134 Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 0fb94843a495..e1bc8b4cd592 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -148,7 +148,8 @@ static int process_rdma(struct rtrs_srv *sess, /* Generate bio with pages pointing to the rdma buffer */ bio = rnbd_bio_map_kern(data, sess_dev->rnbd_dev->ibd_bio_set, datalen, GFP_KERNEL); if (IS_ERR(bio)) { - rnbd_srv_err(sess_dev, "Failed to generate bio, err: %ld\n", PTR_ERR(bio)); + err = PTR_ERR(bio); + rnbd_srv_err(sess_dev, "Failed to generate bio, err: %d\n", err); goto sess_dev_put; } From db03f88fae8a2c8007caafa70287798817df2875 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 18 Aug 2020 17:07:28 +0800 Subject: [PATCH 10/31] blk-mq: insert request not through ->queue_rq into sw/scheduler queue c616cbee97ae ("blk-mq: punt failed direct issue to dispatch list") supposed to add request which has been through ->queue_rq() to the hw queue dispatch list, however it adds request running out of budget or driver tag to hw queue too. This way basically bypasses request merge, and causes too many request dispatched to LLD, and system% is unnecessary increased. Fixes this issue by adding request not through ->queue_rq into sw/scheduler queue, and this way is safe because no ->queue_rq is called on this request yet. High %system can be observed on Azure storvsc device, and even soft lock is observed. This patch reduces %system during heavy sequential IO, meantime decreases soft lockup risk. Fixes: c616cbee97ae ("blk-mq: punt failed direct issue to dispatch list") Signed-off-by: Ming Lei Cc: Christoph Hellwig Cc: Bart Van Assche Cc: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a80f4986e594..b3d2785eefe9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2026,7 +2026,8 @@ insert: if (bypass_insert) return BLK_STS_RESOURCE; - blk_mq_request_bypass_insert(rq, false, run_queue); + blk_mq_sched_insert_request(rq, false, run_queue, false); + return BLK_STS_OK; } From e4b469c66f3cbb81c2e94d31123d7bcdf3c1dabd Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 6 Aug 2020 14:58:37 -0700 Subject: [PATCH 11/31] block: fix get_max_io_size() A previous commit aligning splits to physical block sizes inadvertently modified one return case such that that it now returns 0 length splits when the number of sectors doesn't exceed the physical offset. This later hits a BUG in bio_split(). Restore the previous working behavior. Fixes: 9cc5169cd478b ("block: Improve physical block alignment of split bios") Reported-by: Eric Deal Signed-off-by: Keith Busch Cc: Bart Van Assche Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 7af1f3668a91..f685d633bcc9 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -154,7 +154,7 @@ static inline unsigned get_max_io_size(struct request_queue *q, if (max_sectors > start_offset) return max_sectors - start_offset; - return sectors & (lbs - 1); + return sectors & ~(lbs - 1); } static inline unsigned get_max_segment_size(const struct request_queue *q, From 0c8b9c3540bddd143091045c99b00d43393c8d62 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 19 Aug 2020 14:32:43 +0200 Subject: [PATCH 12/31] MAINTAINERS: Add missing header files to BLOCK LAYER section The various header files are part of the Block Layer. Add them to the corresponding section in the MAINTAINERS file, so scripts/get_maintainer.pl will pick them up. Signed-off-by: Geert Uytterhoeven Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index deaafb617361..bfa6800346dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3205,6 +3205,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git F: block/ F: drivers/block/ +F: include/linux/blk* F: kernel/trace/blktrace.c F: lib/sbitmap.c From 27029b4b18aa5d3b060f0bf2c26dae254132cfce Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Mon, 10 Aug 2020 22:21:16 -0400 Subject: [PATCH 13/31] blkcg: fix memleak for iolatency Normally, blkcg_iolatency_exit() will free related memory in iolatency when cleanup queue. But if blk_throtl_init() return error and queue init fail, blkcg_iolatency_exit() will not do that for us. Then it cause memory leak. Fixes: d70675121546 ("block: introduce blk-iolatency io controller") Signed-off-by: Yufen Yu Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 619a79b51068..c195365c9817 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1152,13 +1152,15 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); - ret = blk_iolatency_init(q); - if (ret) - goto err_destroy_all; - ret = blk_throtl_init(q); if (ret) goto err_destroy_all; + + ret = blk_iolatency_init(q); + if (ret) { + blk_throtl_exit(q); + goto err_destroy_all; + } return 0; err_destroy_all: From 382fee1a8b623e2546a3e15e80517389e0e0673e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 30 Jul 2020 11:51:00 -0700 Subject: [PATCH 14/31] nvmet: fix a memory leak We forgot to free new_model_number Fixes: 013b7ebe5a0d ("nvmet: make ctrl model configurable") Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/target/configfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 74b2b61c773b..37e1d7784e17 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1136,6 +1136,7 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, up_write(&nvmet_config_sem); kfree_rcu(new_model, rcuhead); + kfree(new_model_number); return count; } From 0ceeab96ba598c578f0e87cb25b79349cd35b8f1 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 29 Jul 2020 13:10:09 -0600 Subject: [PATCH 15/31] nvmet-passthru: Reject commands with non-sgl flags set Any command with a non-SGL flag set (like fuse flags) should be rejected. Fixes: c1fef73f793b ("nvmet: add passthru code to process commands") Signed-off-by: Logan Gunthorpe Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/target/passthru.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 89d91dc999a6..f69c3ac82e58 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -326,6 +326,10 @@ static u16 nvmet_setup_passthru_command(struct nvmet_req *req) u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) { + /* Reject any commands with non-sgl flags set (ie. fused commands) */ + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) + return NVME_SC_INVALID_FIELD; + switch (req->cmd->common.opcode) { case nvme_cmd_resv_register: case nvme_cmd_resv_report: @@ -396,6 +400,10 @@ static u16 nvmet_passthru_get_set_features(struct nvmet_req *req) u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) { + /* Reject any commands with non-sgl flags set (ie. fused commands) */ + if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL) + return NVME_SC_INVALID_FIELD; + /* * Passthru all vendor specific commands */ From f34448cd0dc697723fb5f4118f8431d9233b370d Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Sun, 2 Aug 2020 19:15:45 +0800 Subject: [PATCH 16/31] nvme-fc: Fix wrong return value in __nvme_fc_init_request() On an error exit path, a negative error code should be returned instead of a positive return value. Fixes: e399441de9115 ("nvme-fabrics: Add host support for FC transport") Cc: James Smart Signed-off-by: Tianjia Zhang Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index eae43bb444e0..07aacfe46368 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2078,7 +2078,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) { dev_err(ctrl->dev, "FCP Op failed - cmdiu dma mapping failed.\n"); - ret = EFAULT; + ret = -EFAULT; goto out_on_error; } @@ -2088,7 +2088,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl, if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) { dev_err(ctrl->dev, "FCP Op failed - rspiu dma mapping failed.\n"); - ret = EFAULT; + ret = -EFAULT; } atomic_set(&op->state, FCPOP_STATE_IDLE); From 93eb0381e13d249a18ed4aae203291ff977e7ffb Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Thu, 6 Aug 2020 15:19:31 +0200 Subject: [PATCH 17/31] nvme: multipath: round-robin: fix single non-optimized path case If there's only one usable, non-optimized path, nvme_round_robin_path() returns NULL, which is wrong. Fix it by falling back to "old", like in the single optimized path case. Also, if the active path isn't changed, there's no need to re-assign the pointer. Fixes: 3f6e3246db0e ("nvme-multipath: fix logic for non-optimized paths") Signed-off-by: Martin Wilck Signed-off-by: Martin George Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 3ded54d2c9c6..a64dfff0d0ce 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -255,12 +255,17 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, fallback = ns; } - /* No optimized path found, re-check the current path */ + /* + * The loop above skips the current path for round-robin semantics. + * Fall back to the current path if either: + * - no other optimized path found and current is optimized, + * - no other usable path found and current is usable. + */ if (!nvme_path_is_disabled(old) && - old->ana_state == NVME_ANA_OPTIMIZED) { - found = old; - goto out; - } + (old->ana_state == NVME_ANA_OPTIMIZED || + (!fallback && old->ana_state == NVME_ANA_NONOPTIMIZED))) + return old; + if (!fallback) return NULL; found = fallback; From e398863b75af24103cee69cc8ee8bf4bc9c8913e Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Thu, 6 Aug 2020 15:19:32 +0200 Subject: [PATCH 18/31] nvme: multipath: round-robin: eliminate "fallback" variable If we find an optimized path, we quit the loop immediately. Thus we can use just one variable for the next path, slighly simplifying the code. Signed-off-by: Martin Wilck Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a64dfff0d0ce..760f625cefc8 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -233,7 +233,7 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, int node, struct nvme_ns *old) { - struct nvme_ns *ns, *found, *fallback = NULL; + struct nvme_ns *ns, *found = NULL; if (list_is_singular(&head->list)) { if (nvme_path_is_disabled(old)) @@ -252,7 +252,7 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, goto out; } if (ns->ana_state == NVME_ANA_NONOPTIMIZED) - fallback = ns; + found = ns; } /* @@ -263,12 +263,11 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, */ if (!nvme_path_is_disabled(old) && (old->ana_state == NVME_ANA_OPTIMIZED || - (!fallback && old->ana_state == NVME_ANA_NONOPTIMIZED))) + (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) return old; - if (!fallback) + if (!found) return NULL; - found = fallback; out: rcu_assign_pointer(head->current_path[node], found); return found; From 4db69a3d7cfe31e0205bfcb6de331aa9a124cb10 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 6 Aug 2020 13:02:23 -0700 Subject: [PATCH 19/31] nvmet: add ns tear down label for pt-cmd handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current implementation before submitting the passthru cmd we may come across error e.g. getting ns from passthru controller, allocating a request from passthru controller, etc. For all the failure cases it only uses single goto label fail_out. In the target code, we follow the pattern to have a separate label for each error out the case when setting up multiple things before the actual action. This patch follows the same pattern and renames generic fail_out label to out_put_ns and updates the error out cases in the nvmet_passthru_execute_cmd() where it is needed. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/target/passthru.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index f69c3ac82e58..750d7b009b34 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -230,7 +230,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) if (unlikely(!ns)) { pr_err("failed to get passthru ns nsid:%u\n", nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; - goto fail_out; + goto out; } q = ns->queue; @@ -240,14 +240,14 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) if (IS_ERR(rq)) { rq = NULL; status = NVME_SC_INTERNAL; - goto fail_out; + goto out_put_ns; } if (req->sg_cnt) { ret = nvmet_passthru_map_sg(req, rq); if (unlikely(ret)) { status = NVME_SC_INTERNAL; - goto fail_out; + goto out_put_ns; } } @@ -274,9 +274,10 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) return; -fail_out: +out_put_ns: if (ns) nvme_put_ns(ns); +out: nvmet_req_complete(req, status); blk_put_request(rq); } From a2138fd49467d0b909b9f19936974f9a3c4e3e1a Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 6 Aug 2020 15:48:58 -0700 Subject: [PATCH 20/31] nvmet: fix oops in pt cmd execution In the existing NVMeOF Passthru core command handling on failure of nvme_alloc_request() it errors out with rq value set to NULL. In the error handling path it calls blk_put_request() without checking if rq is set to NULL or not which produces following Oops:- [ 1457.346861] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 1457.347838] #PF: supervisor read access in kernel mode [ 1457.348464] #PF: error_code(0x0000) - not-present page [ 1457.349085] PGD 0 P4D 0 [ 1457.349402] Oops: 0000 [#1] SMP NOPTI [ 1457.349851] CPU: 18 PID: 10782 Comm: kworker/18:2 Tainted: G OE 5.8.0-rc4nvme-5.9+ #35 [ 1457.350951] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e3214 [ 1457.352347] Workqueue: events nvme_loop_execute_work [nvme_loop] [ 1457.353062] RIP: 0010:blk_mq_free_request+0xe/0x110 [ 1457.353651] Code: 3f ff ff ff 83 f8 01 75 0d 4c 89 e7 e8 1b db ff ff e9 2d ff ff ff 0f 0b eb ef 66 8 [ 1457.355975] RSP: 0018:ffffc900035b7de0 EFLAGS: 00010282 [ 1457.356636] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000002 [ 1457.357526] RDX: ffffffffa060bd05 RSI: 0000000000000000 RDI: 0000000000000000 [ 1457.358416] RBP: 0000000000000037 R08: 0000000000000000 R09: 0000000000000000 [ 1457.359317] R10: 0000000000000000 R11: 000000000000006d R12: 0000000000000000 [ 1457.360424] R13: ffff8887ffa68600 R14: 0000000000000000 R15: ffff8888150564c8 [ 1457.361322] FS: 0000000000000000(0000) GS:ffff888814600000(0000) knlGS:0000000000000000 [ 1457.362337] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1457.363058] CR2: 0000000000000000 CR3: 000000081c0ac000 CR4: 00000000003406e0 [ 1457.363973] Call Trace: [ 1457.364296] nvmet_passthru_execute_cmd+0x150/0x2c0 [nvmet] [ 1457.364990] process_one_work+0x24e/0x5a0 [ 1457.365493] ? __schedule+0x353/0x840 [ 1457.365957] worker_thread+0x3c/0x380 [ 1457.366426] ? process_one_work+0x5a0/0x5a0 [ 1457.366948] kthread+0x135/0x150 [ 1457.367362] ? kthread_create_on_node+0x60/0x60 [ 1457.367934] ret_from_fork+0x22/0x30 [ 1457.368388] Modules linked in: nvme_loop(OE) nvmet(OE) nvme_fabrics(OE) null_blk nvme(OE) nvme_corer [ 1457.368414] ata_piix crc32c_intel virtio_pci libata virtio_ring serio_raw t10_pi virtio floppy dm_] [ 1457.380849] CR2: 0000000000000000 [ 1457.381288] ---[ end trace c6cab61bfd1f68fd ]--- [ 1457.381861] RIP: 0010:blk_mq_free_request+0xe/0x110 [ 1457.382469] Code: 3f ff ff ff 83 f8 01 75 0d 4c 89 e7 e8 1b db ff ff e9 2d ff ff ff 0f 0b eb ef 66 8 [ 1457.384749] RSP: 0018:ffffc900035b7de0 EFLAGS: 00010282 [ 1457.385393] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000002 [ 1457.386264] RDX: ffffffffa060bd05 RSI: 0000000000000000 RDI: 0000000000000000 [ 1457.387142] RBP: 0000000000000037 R08: 0000000000000000 R09: 0000000000000000 [ 1457.388029] R10: 0000000000000000 R11: 000000000000006d R12: 0000000000000000 [ 1457.388914] R13: ffff8887ffa68600 R14: 0000000000000000 R15: ffff8888150564c8 [ 1457.389798] FS: 0000000000000000(0000) GS:ffff888814600000(0000) knlGS:0000000000000000 [ 1457.390796] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1457.391508] CR2: 0000000000000000 CR3: 000000081c0ac000 CR4: 00000000003406e0 [ 1457.392525] Kernel panic - not syncing: Fatal exception [ 1457.394138] Kernel Offset: disabled [ 1457.394677] ---[ end Kernel panic - not syncing: Fatal exception ]--- We fix this Oops by adding a new goto label out_put_req and reordering the blk_put_request call to avoid calling blk_put_request() with rq value is set to NULL. Here we also update the rest of the code accordingly. Fixes: 06b7164dfdc0 ("nvmet: add passthru code to process commands") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/target/passthru.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 750d7b009b34..ec223bf56254 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -238,7 +238,6 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); if (IS_ERR(rq)) { - rq = NULL; status = NVME_SC_INTERNAL; goto out_put_ns; } @@ -247,7 +246,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) ret = nvmet_passthru_map_sg(req, rq); if (unlikely(ret)) { status = NVME_SC_INTERNAL; - goto out_put_ns; + goto out_put_req; } } @@ -274,12 +273,13 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) return; +out_put_req: + blk_put_request(rq); out_put_ns: if (ns) nvme_put_ns(ns); out: nvmet_req_complete(req, status); - blk_put_request(rq); } /* From 7ee51cf60a90c2017b8d5b6e936cb52490aac7bd Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 6 Aug 2020 15:56:27 -0700 Subject: [PATCH 21/31] nvmet: call blk_mq_free_request() directly Instead of calling blk_put_request() which calls blk_mq_free_request(), call blk_mq_free_request() directly for NVMeOF passthru. This is to mainly avoid an extra function call in the completion path nvmet_passthru_req_done(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Reviewed-by: Logan Gunthorpe Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/target/passthru.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index ec223bf56254..8bd7f656e240 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -165,7 +165,7 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) req->cqe->result = nvme_req(rq)->result; nvmet_req_complete(req, status); - blk_put_request(rq); + blk_mq_free_request(rq); } static void nvmet_passthru_req_done(struct request *rq, @@ -175,7 +175,7 @@ static void nvmet_passthru_req_done(struct request *rq, req->cqe->result = nvme_req(rq)->result; nvmet_req_complete(req, nvme_req(rq)->status); - blk_put_request(rq); + blk_mq_free_request(rq); } static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) @@ -274,7 +274,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) return; out_put_req: - blk_put_request(rq); + blk_mq_free_request(rq); out_put_ns: if (ns) nvme_put_ns(ns); From ecbcdf0c81265f7f780b588eed77fa933fd79f68 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 12 Aug 2020 17:24:44 -0600 Subject: [PATCH 22/31] nvme: Use spin_lock_irq() when taking the ctrl->lock When locking the ctrl->lock spinlock IRQs need to be disabled to avoid a dead lock. The new spin_lock() calls recently added produce the following lockdep warning when running the blktest nvme/003: ================================ WARNING: inconsistent lock state -------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. ksoftirqd/2/22 [HC0[0]:SC1[1]:HE0:SE0] takes: ffff888276a8c4c0 (&ctrl->lock){+.?.}-{2:2}, at: nvme_keep_alive_end_io+0x50/0xc0 {SOFTIRQ-ON-W} state was registered at: lock_acquire+0x164/0x500 _raw_spin_lock+0x28/0x40 nvme_get_effects_log+0x37/0x1c0 nvme_init_identify+0x9e4/0x14f0 nvme_reset_work+0xadd/0x2360 process_one_work+0x66b/0xb70 worker_thread+0x6e/0x6c0 kthread+0x1e7/0x210 ret_from_fork+0x22/0x30 irq event stamp: 1449221 hardirqs last enabled at (1449220): [] ktime_get+0xf9/0x140 hardirqs last disabled at (1449221): [] _raw_spin_lock_irqsave+0x25/0x60 softirqs last enabled at (1449210): [] __do_softirq+0x447/0x595 softirqs last disabled at (1449215): [] run_ksoftirqd+0x35/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&ctrl->lock); lock(&ctrl->lock); *** DEADLOCK *** no locks held by ksoftirqd/2/22. stack backtrace: CPU: 2 PID: 22 Comm: ksoftirqd/2 Not tainted 5.8.0-rc4-eid-vmlocalyes-dbg-00157-g7236657c6b3a #1450 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-1 04/01/2014 Call Trace: dump_stack+0xc8/0x11a print_usage_bug.cold.63+0x235/0x23e mark_lock+0xa9c/0xcf0 __lock_acquire+0xd9a/0x2b50 lock_acquire+0x164/0x500 _raw_spin_lock_irqsave+0x40/0x60 nvme_keep_alive_end_io+0x50/0xc0 blk_mq_end_request+0x158/0x210 nvme_complete_rq+0x146/0x500 nvme_loop_complete_rq+0x26/0x30 [nvme_loop] blk_done_softirq+0x187/0x1e0 __do_softirq+0x118/0x595 run_ksoftirqd+0x35/0x50 smpboot_thread_fn+0x1d3/0x310 kthread+0x1e7/0x210 ret_from_fork+0x22/0x30 Fixes: be93e87e7802 ("nvme: support for multiple Command Sets Supported and Effects log pages") Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Tested-by: Chaitanya Kulkarni Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88cff309d8e4..466c591c05e9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2965,14 +2965,14 @@ static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) { struct nvme_cel *cel, *ret = NULL; - spin_lock(&ctrl->lock); + spin_lock_irq(&ctrl->lock); list_for_each_entry(cel, &ctrl->cels, entry) { if (cel->csi == csi) { ret = cel; break; } } - spin_unlock(&ctrl->lock); + spin_unlock_irq(&ctrl->lock); return ret; } @@ -2999,9 +2999,9 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, cel->csi = csi; - spin_lock(&ctrl->lock); + spin_lock_irq(&ctrl->lock); list_add_tail(&cel->entry, &ctrl->cels); - spin_unlock(&ctrl->lock); + spin_unlock_irq(&ctrl->lock); out: *log = &cel->log; return 0; From 7442ddcedc344b6fa073692f165dffdd1889e780 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 14 Aug 2020 23:34:25 +0800 Subject: [PATCH 23/31] nvme-pci: Use u32 for nvme_dev.q_depth and nvme_queue.q_depth Recently nvme_dev.q_depth was changed from an int to u16 type. This falls over for the queue depth calculation in nvme_pci_enable(), where NVME_CAP_MQES(dev->ctrl.cap) + 1 may overflow as a u16, as NVME_CAP_MQES() is a 16b number also. That happens for me, and this is the result: root@ubuntu:/home/john# [148.272996] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010 Mem abort info: ESR = 0x96000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=00000a27bf3c9000 [0000000000000010] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 96000004 [#1] PREEMPT SMP Modules linked in: nvme nvme_core CPU: 56 PID: 256 Comm: kworker/u195:0 Not tainted 5.8.0-next-20200812 #27 Hardware name: Huawei D06 /D06, BIOS Hisilicon D06 UEFI RC0 - V1.16.01 03/15/2019 Workqueue: nvme-reset-wq nvme_reset_work [nvme] pstate: 80c00009 (Nzcv daif +PAN +UAO BTYPE=--) pc : __sg_alloc_table_from_pages+0xec/0x238 lr : __sg_alloc_table_from_pages+0xc8/0x238 sp : ffff800013ccbad0 x29: ffff800013ccbad0 x28: ffff0a27b3d380a8 x27: 0000000000000000 x26: 0000000000002dc2 x25: 0000000000000dc0 x24: 0000000000000000 x23: 0000000000000000 x22: ffff800013ccbbe8 x21: 0000000000000010 x20: 0000000000000000 x19: 00000000fffff000 x18: ffffffffffffffff x17: 00000000000000c0 x16: fffffe289eaf6380 x15: ffff800011b59948 x14: ffff002bc8fe98f8 x13: ff00000000000000 x12: ffff8000114ca000 x11: 0000000000000000 x10: ffffffffffffffff x9 : ffffffffffffffc0 x8 : ffff0a27b5f9b6a0 x7 : 0000000000000000 x6 : 0000000000000001 x5 : ffff0a27b5f9b680 x4 : 0000000000000000 x3 : ffff0a27b5f9b680 x2 : 0000000000000000 x1 : 0000000000000001 x0 : 0000000000000000 Call trace: __sg_alloc_table_from_pages+0xec/0x238 sg_alloc_table_from_pages+0x18/0x28 iommu_dma_alloc+0x474/0x678 dma_alloc_attrs+0xd8/0xf0 nvme_alloc_queue+0x114/0x160 [nvme] nvme_reset_work+0xb34/0x14b4 [nvme] process_one_work+0x1e8/0x360 worker_thread+0x44/0x478 kthread+0x150/0x158 ret_from_fork+0x10/0x34 Code: f94002c3 6b01017f 540007c2 11000486 (f8645aa5) ---[ end trace 89bb2b72d59bf925 ]--- Fix by making onto a u32. Also use u32 for nvme_dev.q_depth, as we assign this value from nvme_dev.q_depth, and nvme_dev.q_depth will possibly hold 65536 - this avoids the same crash as above. Fixes: 61f3b8963097 ("nvme-pci: use unsigned for io queue depth") Signed-off-by: John Garry Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ba725ae47305..1cb09a187ae7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -120,7 +120,7 @@ struct nvme_dev { unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; - u16 q_depth; + u32 q_depth; int io_sqes; u32 db_stride; void __iomem *bar; @@ -157,13 +157,13 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { int ret; - u16 n; + u32 n; - ret = kstrtou16(val, 10, &n); + ret = kstrtou32(val, 10, &n); if (ret != 0 || n < 2) return -EINVAL; - return param_set_ushort(val, kp); + return param_set_uint(val, kp); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -195,7 +195,7 @@ struct nvme_queue { dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; u32 __iomem *q_db; - u16 q_depth; + u32 q_depth; u16 cq_vector; u16 sq_tail; u16 cq_head; @@ -2320,7 +2320,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, + dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); From c61b82c7b71343c6aca6bb6cc3ff44fb123fb5d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Aug 2020 19:51:59 +0200 Subject: [PATCH 24/31] nvme-pci: fix PRP pool size All operations are based on the controller, not the host page size. Switch the dma pool to use the controller page size as well to avoid massive overallocations on large page size systems. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1cb09a187ae7..e001a4ae65b0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2460,7 +2460,8 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_setup_prp_pools(struct nvme_dev *dev) { dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, - PAGE_SIZE, PAGE_SIZE, 0); + NVME_CTRL_PAGE_SIZE, + NVME_CTRL_PAGE_SIZE, 0); if (!dev->prp_page_pool) return -ENOMEM; From c41ad98bebb8f4f0335b3c50dbb7583a6149dce4 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 7 Aug 2020 09:32:35 -0700 Subject: [PATCH 25/31] nvme: skip noiob for zoned devices Zoned block devices reuse the chunk_sectors queue limit to define zone boundaries. If a such a device happens to also report an optimal boundary, do not use that to define the chunk_sectors as that may intermittently interfere with io splitting and zone size queries. Signed-off-by: Keith Busch Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 466c591c05e9..6c0d175f2ffa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2075,7 +2075,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) } } - if (iob) + if (iob && !blk_queue_is_zoned(ns->queue)) blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob)); nvme_update_disk_info(disk, ns, id); #ifdef CONFIG_NVME_MULTIPATH From 2eb81a3364eada43985efc0641490b73af78d0fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Aug 2020 09:11:29 +0200 Subject: [PATCH 26/31] nvme: rename and document nvme_end_request nvme_end_request is a bit misnamed, as it wraps around the blk_mq_complete_* API. It's semantics also are non-trivial, so give it a more descriptive name and add a comment explaining the semantics. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Mike Snitzer Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- Documentation/fault-injection/nvme-fault-injection.rst | 2 +- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 8 +++++++- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 4 ++-- drivers/nvme/target/loop.c | 2 +- 7 files changed, 14 insertions(+), 8 deletions(-) diff --git a/Documentation/fault-injection/nvme-fault-injection.rst b/Documentation/fault-injection/nvme-fault-injection.rst index cdb2e829228e..1d4427890d75 100644 --- a/Documentation/fault-injection/nvme-fault-injection.rst +++ b/Documentation/fault-injection/nvme-fault-injection.rst @@ -3,7 +3,7 @@ NVMe Fault Injection Linux's fault injection framework provides a systematic way to support error injection via debugfs in the /sys/kernel/debug directory. When enabled, the default NVME_SC_INVALID_OPCODE with no retry will be -injected into the nvme_end_request. Users can change the default status +injected into the nvme_try_complete_req. Users can change the default status code and no retry flag via the debugfs. The list of Generic Command Status can be found in include/linux/nvme.h diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 07aacfe46368..a7f474ddfff7 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2035,7 +2035,7 @@ done: } __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); - if (!nvme_end_request(rq, status, result)) + if (!nvme_try_complete_req(rq, status, result)) nvme_fc_complete_rq(rq); check_error: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ebb8c3ed3885..510f7dbeba98 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -523,7 +523,13 @@ static inline u32 nvme_bytes_to_numd(size_t len) return (len >> 2) - 1; } -static inline bool nvme_end_request(struct request *req, __le16 status, +/* + * Fill in the status and result information from the CQE, and then figure out + * if blk-mq will need to use IPI magic to complete the request, and if yes do + * so. If not let the caller complete the request without an indirect function + * call. + */ +static inline bool nvme_try_complete_req(struct request *req, __le16 status, union nvme_result result) { struct nvme_request *rq = nvme_req(req); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e001a4ae65b0..b673fa4cf5ea 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -961,7 +961,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id); trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); - if (!nvme_end_request(req, cqe->status, cqe->result)) + if (!nvme_try_complete_req(req, cqe->status, cqe->result)) nvme_pci_complete_rq(req); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 44c76ffbb264..4c5660201009 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1189,7 +1189,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req) if (!refcount_dec_and_test(&req->ref)) return; - if (!nvme_end_request(rq, req->status, req->result)) + if (!nvme_try_complete_req(rq, req->status, req->result)) nvme_rdma_complete_rq(rq); } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 62fbaecdc960..3be4749dbf11 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -481,7 +481,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, return -EINVAL; } - if (!nvme_end_request(rq, cqe->status, cqe->result)) + if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) nvme_complete_rq(rq); queue->nr_cqe++; @@ -672,7 +672,7 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status) { union nvme_result res = {}; - if (!nvme_end_request(rq, cpu_to_le16(status << 1), res)) + if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res)) nvme_complete_rq(rq); } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4884ef1e46a2..0d6008cf66a2 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -115,7 +115,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req) return; } - if (!nvme_end_request(rq, cqe->status, cqe->result)) + if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) nvme_loop_complete_rq(rq); } } From 5ddaabe8ed713f148e3d67e99b86d99427aceb5c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Aug 2020 09:11:30 +0200 Subject: [PATCH 27/31] nvme: refactor command completion Lift all the code to decide the dispostition of a completed command from nvme_complete_rq and nvme_failover_req into a new helper, which returns an emum of the potential actions. nvme_complete_rq then just switches on those and calls the proper helper for the action. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 77 ++++++++++++++++++++++------------- drivers/nvme/host/multipath.c | 47 ++++++--------------- drivers/nvme/host/nvme.h | 31 ++++++++++++-- 3 files changed, 90 insertions(+), 65 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c0d175f2ffa..9e75f6f62471 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -241,17 +241,6 @@ static blk_status_t nvme_error_status(u16 status) } } -static inline bool nvme_req_needs_retry(struct request *req) -{ - if (blk_noretry_request(req)) - return false; - if (nvme_req(req)->status & NVME_SC_DNR) - return false; - if (nvme_req(req)->retries >= nvme_max_retries) - return false; - return true; -} - static void nvme_retry_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; @@ -268,33 +257,65 @@ static void nvme_retry_req(struct request *req) blk_mq_delay_kick_requeue_list(req->q, delay); } -void nvme_complete_rq(struct request *req) +enum nvme_disposition { + COMPLETE, + RETRY, + FAILOVER, +}; + +static inline enum nvme_disposition nvme_decide_disposition(struct request *req) +{ + if (likely(nvme_req(req)->status == 0)) + return COMPLETE; + + if (blk_noretry_request(req) || + (nvme_req(req)->status & NVME_SC_DNR) || + nvme_req(req)->retries >= nvme_max_retries) + return COMPLETE; + + if (req->cmd_flags & REQ_NVME_MPATH) { + if (nvme_is_path_error(nvme_req(req)->status)) + return FAILOVER; + } + + if (blk_queue_dying(req->q)) + return COMPLETE; + + return RETRY; +} + +static inline void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); - trace_nvme_complete_rq(req); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = nvme_lba_to_sect(req->q->queuedata, + le64_to_cpu(nvme_req(req)->result.u64)); + nvme_trace_bio_complete(req, status); + blk_mq_end_request(req, status); +} + +void nvme_complete_rq(struct request *req) +{ + trace_nvme_complete_rq(req); nvme_cleanup_cmd(req); if (nvme_req(req)->ctrl->kas) nvme_req(req)->ctrl->comp_seen = true; - if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req)) - return; - - if (!blk_queue_dying(req->q)) { - nvme_retry_req(req); - return; - } - } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - req_op(req) == REQ_OP_ZONE_APPEND) { - req->__sector = nvme_lba_to_sect(req->q->queuedata, - le64_to_cpu(nvme_req(req)->result.u64)); + switch (nvme_decide_disposition(req)) { + case COMPLETE: + nvme_end_req(req); + return; + case RETRY: + nvme_retry_req(req); + return; + case FAILOVER: + nvme_failover_req(req); + return; } - - nvme_trace_bio_complete(req, status); - blk_mq_end_request(req, status); } EXPORT_SYMBOL_GPL(nvme_complete_rq); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 760f625cefc8..d4ba736c6c89 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -65,51 +65,30 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, } } -bool nvme_failover_req(struct request *req) +void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; - u16 status = nvme_req(req)->status; + u16 status = nvme_req(req)->status & 0x7ff; unsigned long flags; - switch (status & 0x7ff) { - case NVME_SC_ANA_TRANSITION: - case NVME_SC_ANA_INACCESSIBLE: - case NVME_SC_ANA_PERSISTENT_LOSS: - /* - * If we got back an ANA error we know the controller is alive, - * but not ready to serve this namespaces. The spec suggests - * we should update our general state here, but due to the fact - * that the admin and I/O queues are not serialized that is - * fundamentally racy. So instead just clear the current path, - * mark the the path as pending and kick of a re-read of the ANA - * log page ASAP. - */ - nvme_mpath_clear_current_path(ns); - if (ns->ctrl->ana_log_buf) { - set_bit(NVME_NS_ANA_PENDING, &ns->flags); - queue_work(nvme_wq, &ns->ctrl->ana_work); - } - break; - case NVME_SC_HOST_PATH_ERROR: - case NVME_SC_HOST_ABORTED_CMD: - /* - * Temporary transport disruption in talking to the controller. - * Try to send on a new path. - */ - nvme_mpath_clear_current_path(ns); - break; - default: - /* This was a non-ANA error so follow the normal error path. */ - return false; + nvme_mpath_clear_current_path(ns); + + /* + * If we got back an ANA error, we know the controller is alive but not + * ready to serve this namespace. Kick of a re-read of the ANA + * information page, and just try any other available path for now. + */ + if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); } spin_lock_irqsave(&ns->head->requeue_lock, flags); blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); + blk_mq_end_request(req, 0); kblockd_schedule_work(&ns->head->requeue_work); - return true; } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 510f7dbeba98..4ff6fd289082 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -523,6 +523,32 @@ static inline u32 nvme_bytes_to_numd(size_t len) return (len >> 2) - 1; } +static inline bool nvme_is_ana_error(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_ANA_TRANSITION: + case NVME_SC_ANA_INACCESSIBLE: + case NVME_SC_ANA_PERSISTENT_LOSS: + return true; + default: + return false; + } +} + +static inline bool nvme_is_path_error(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_HOST_PATH_ERROR: + case NVME_SC_HOST_ABORTED_CMD: + case NVME_SC_ANA_TRANSITION: + case NVME_SC_ANA_INACCESSIBLE: + case NVME_SC_ANA_PERSISTENT_LOSS: + return true; + default: + return false; + } +} + /* * Fill in the status and result information from the CQE, and then figure out * if blk-mq will need to use IPI magic to complete the request, and if yes do @@ -635,7 +661,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); -bool nvme_failover_req(struct request *req); +void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); @@ -694,9 +720,8 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); } -static inline bool nvme_failover_req(struct request *req) +static inline void nvme_failover_req(struct request *req) { - return false; } static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { From 1e41f3bd26f79463c07aebf062a0a77f1fd39b2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Aug 2020 09:11:31 +0200 Subject: [PATCH 28/31] nvme: just check the status code type in nvme_is_path_error Check the SCT sub-field for a path related status instead of enumerating invididual status code. As of NVMe 1.4 this adds "Internal Path Error" and "Controller Pathing Error" to the list, but it also future proofs for additional status codes added to the category. Suggested-by: Chao Leng Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/nvme.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4ff6fd289082..e9cf29449dd1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -537,16 +537,8 @@ static inline bool nvme_is_ana_error(u16 status) static inline bool nvme_is_path_error(u16 status) { - switch (status & 0x7ff) { - case NVME_SC_HOST_PATH_ERROR: - case NVME_SC_HOST_ABORTED_CMD: - case NVME_SC_ANA_TRANSITION: - case NVME_SC_ANA_INACCESSIBLE: - case NVME_SC_ANA_PERSISTENT_LOSS: - return true; - default: - return false; - } + /* check for a status code type of 'path related status' */ + return (status & 0x700) == 0x300; } /* From 5eac5f3342b20825260d3800e7f5f74f12bac931 Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Tue, 18 Aug 2020 09:11:32 +0200 Subject: [PATCH 29/31] nvme: redirect commands on dying queue If a command send through nvme-multipath failed on a dying queue, resend it on another path. Signed-off-by: Chao Leng [hch: rebased on top of the completion refactoring] Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Mike Snitzer Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9e75f6f62471..c9826ecf80e2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -274,13 +274,14 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req) return COMPLETE; if (req->cmd_flags & REQ_NVME_MPATH) { - if (nvme_is_path_error(nvme_req(req)->status)) + if (nvme_is_path_error(nvme_req(req)->status) || + blk_queue_dying(req->q)) return FAILOVER; + } else { + if (blk_queue_dying(req->q)) + return COMPLETE; } - if (blk_queue_dying(req->q)) - return COMPLETE; - return RETRY; } From 0d3b6a8d213a30387b5104b2fb25376d18636f23 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Wed, 19 Aug 2020 11:31:11 +0300 Subject: [PATCH 30/31] nvmet: Disable keep-alive timer when kato is cleared to 0h Based on nvme spec, when keep alive timeout is set to zero the keep-alive timer should be disabled. Signed-off-by: Amit Engel Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b92f45f5cd5b..c052c922853a 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -397,6 +397,9 @@ static void nvmet_keep_alive_timer(struct work_struct *work) static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) { + if (unlikely(ctrl->kato == 0)) + return; + pr_debug("ctrl %d start keep-alive timer for %d secs\n", ctrl->cntlid, ctrl->kato); @@ -406,6 +409,9 @@ static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) { + if (unlikely(ctrl->kato == 0)) + return; + pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); cancel_delayed_work_sync(&ctrl->ka_work); From 2d62e6b038e729c3e4bfbfcfbd44800ef0883680 Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Fri, 21 Aug 2020 04:34:42 -0400 Subject: [PATCH 31/31] null_blk: fix passing of REQ_FUA flag in null_handle_rq REQ_FUA should be checked using rq->cmd_flags instead of req_op(). Fixes: deb78b419dfda ("nullb: emulate cache") Signed-off-by: Hou Pu Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 47a9dad880af..d74443a9c8fa 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1147,7 +1147,7 @@ static int null_handle_rq(struct nullb_cmd *cmd) len = bvec.bv_len; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, - req_op(rq) & REQ_FUA); + rq->cmd_flags & REQ_FUA); if (err) { spin_unlock_irq(&nullb->lock); return err;