From c2789bd403f4c0c541a359cf318b1dda9a14234f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 20 Nov 2015 22:16:46 +0100 Subject: [PATCH 01/12] block: rename request_queue slab cache Name the cache after the actual name of the struct. Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5131993b23a1..c0c884efa40f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3513,7 +3513,7 @@ int __init blk_dev_init(void) request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); - blk_requestq_cachep = kmem_cache_create("blkdev_queue", + blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); return 0; From d674d4145e1909e0e4c394bb2633ef73d539bd27 Mon Sep 17 00:00:00 2001 From: Wei Tang Date: Tue, 24 Nov 2015 09:58:45 +0800 Subject: [PATCH 02/12] block: do not initialise globals to 0 or NULL This patch fixes the checkpatch.pl error to blk-exec.c: ERROR: do not initialise globals to 0 or NULL Signed-off-by: Wei Tang Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index c0c884efa40f..c88a946eca49 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -51,7 +51,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -struct kmem_cache *request_cachep = NULL; +struct kmem_cache *request_cachep; /* * For queue allocation From 1fe8f348416b3fb35ea3f24fa92bb1d29ffe7b0b Mon Sep 17 00:00:00 2001 From: Wei Tang Date: Tue, 24 Nov 2015 09:58:46 +0800 Subject: [PATCH 03/12] block: do not initialise statics to 0 or NULL This patch fixes the checkpatch.pl error to genhd.c: ERROR: do not initialise statics to 0 or NULL Signed-off-by: Wei Tang Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index e5cafa51567c..78140b4eea4d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1421,7 +1421,7 @@ static DEFINE_MUTEX(disk_events_mutex); static LIST_HEAD(disk_events); /* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs = 0; +static unsigned long disk_events_dfl_poll_msecs; static unsigned long disk_events_poll_jiffies(struct gendisk *disk) { From bd5cecea43ef379e82250addd0303e2f9ede6793 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 21 Nov 2015 17:27:31 +0800 Subject: [PATCH 04/12] bio: use offset_in_page macro Use offset_in_page macro instead of (addr & ~PAGE_MASK). Signed-off-by: Geliang Tang Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 4f184d938942..dbabd48b1934 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1125,7 +1125,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, int i, ret; int nr_pages = 0; unsigned int len = iter->count; - unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; + unsigned int offset = map_data ? 
offset_in_page(map_data->offset) : 0; for (i = 0; i < iter->nr_segs; i++) { unsigned long uaddr; @@ -1304,7 +1304,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, goto out_unmap; } - offset = uaddr & ~PAGE_MASK; + offset = offset_in_page(uaddr); for (j = cur_page; j < page_limit; j++) { unsigned int bytes = PAGE_SIZE - offset; From 3b627a3f934c493ada71217f14681e5157e95783 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Nov 2015 15:58:53 -0700 Subject: [PATCH 05/12] block: clarify blk_add_timer() use case for blk-mq Just a comment update on not needing queue_lock, and that we aren't really adding the request to a timeout list for !mq. Signed-off-by: Jens Axboe --- block/blk-timeout.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index aa40aa93381b..3610af561748 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -186,6 +186,7 @@ unsigned long blk_rq_timeout(unsigned long timeout) * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. + * Queue lock must be held for the non-mq case, mq case doesn't care. */ void blk_add_timer(struct request *req) { @@ -209,6 +210,11 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; + + /* + * Only the non-mq case needs to add the request to a protected list. + * For the mq case we simply scan the tag map. + */ if (!q->mq_ops) list_add_tail(&req->timeout_list, &req->q->timeout_list); From d7cf931dd9f18ce8ee7a0a9b7813a19fb2c8f5e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Nov 2015 10:12:54 -0700 Subject: [PATCH 06/12] Revert "blk-flush: Queue through IO scheduler when flush not required" This reverts commit 1b2ff19e6a957b1ef0f365ad331b608af80e932e. Jan writes: -- Thanks for report! After some investigation I found out we allocate elevator specific data in __get_request() only for non-flush requests. And this is actually required since the flush machinery uses the space in struct request for something else. Doh. So my patch is just wrong and not easy to fix since at the time __get_request() is called we are not sure whether the flush machinery will be used in the end. Jens, please revert 1b2ff19e6a957b1ef0f365ad331b608af80e932e. Thanks! I'm somewhat surprised that you can reliably hit the race where flushing gets disabled for the device just while the request is in flight. But I guess during boot it makes some sense. -- So let's just revert it, we can fix the queue run manually after the fact. This race is rare enough that it didn't trigger in testing, it requires the specific disable-while-in-flight scenario to trigger. --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index c81d56ec308f..9c423e53324a 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -422,7 +422,7 @@ void blk_insert_flush(struct request *rq) if (q->mq_ops) { blk_mq_insert_request(rq, false, false, true); } else - q->elevator->type->ops.elevator_add_req_fn(q, rq); + list_add_tail(&rq->queuelist, &q->queue_head); return; } From 6f3b0e8bcf3cbb87a7459b3ed018d31d918df3f8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 09:13:05 +0100 Subject: [PATCH 07/12] blk-mq: add a flags parameter to blk_mq_alloc_request We already have the reserved flag, and a nowait flag awkwardly encoded as a gfp_t. 
Add a real flags argument to make the scheme more extensible and allow for a nicer calling convention. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 11 ++++++----- block/blk-mq-tag.c | 11 +++++------ block/blk-mq.c | 20 ++++++++------------ block/blk-mq.h | 11 ++++------- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/null_blk.c | 2 +- drivers/nvme/host/lightnvm.c | 2 +- drivers/nvme/host/pci.c | 11 ++++++----- fs/block_dev.c | 4 ++-- include/linux/blk-mq.h | 8 +++++++- include/linux/blkdev.h | 2 +- 11 files changed, 42 insertions(+), 42 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index c88a946eca49..5ec996036e16 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -630,7 +630,7 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, gfp_t gfp) +int blk_queue_enter(struct request_queue *q, bool nowait) { while (true) { int ret; @@ -638,7 +638,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp) if (percpu_ref_tryget_live(&q->q_usage_counter)) return 0; - if (!gfpflags_allow_blocking(gfp)) + if (nowait) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, @@ -1276,7 +1276,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask, false); + return blk_mq_alloc_request(q, rw, + (gfp_mask & __GFP_DIRECT_RECLAIM) ? + 0 : BLK_MQ_REQ_NOWAIT); else return blk_old_get_request(q, rw, gfp_mask); } @@ -2044,8 +2046,7 @@ blk_qc_t generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { - + if (likely(blk_queue_enter(q, false) == 0)) { ret = q->make_request_fn(q, bio); blk_queue_exit(q); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index a07ca3488d96..abdbb47405cb 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!gfpflags_allow_blocking(data->gfp)) + if (data->flags & BLK_MQ_REQ_NOWAIT) return -1; bs = bt_wait_ptr(bt, hctx); @@ -303,7 +303,7 @@ static int bt_get(struct blk_mq_alloc_data *data, data->ctx = blk_mq_get_ctx(data->q); data->hctx = data->q->mq_ops->map_queue(data->q, data->ctx->cpu); - if (data->reserved) { + if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { last_tag = &data->ctx->last_tag; @@ -349,10 +349,9 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { - if (!data->reserved) - return __blk_mq_get_tag(data); - - return __blk_mq_get_reserved_tag(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + return __blk_mq_get_reserved_tag(data); + return __blk_mq_get_tag(data); } static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) diff --git a/block/blk-mq.c b/block/blk-mq.c index 6d6f8feb48c0..93a4e1956915 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -229,8 +229,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) return NULL; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, - bool reserved) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + unsigned int flags) { struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; @@ -238,24 
+238,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_queue_enter(q, gfp); + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, - reserved, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { + if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; } @@ -1175,8 +1173,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, rw |= REQ_SYNC; trace_block_getrq(q, bio, rw); - blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); if (unlikely(!rq)) { __blk_mq_run_hw_queue(hctx); @@ -1185,8 +1182,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, - __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; diff --git a/block/blk-mq.h b/block/blk-mq.h index 713820b47b31..eaede8e45c9c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -96,8 +96,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - gfp_t gfp; - bool reserved; + unsigned int flags; /* input & output parameter */ struct blk_mq_ctx *ctx; @@ -105,13 +104,11 @@ struct blk_mq_alloc_data { }; static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, - struct request_queue *q, gfp_t gfp, bool reserved, - struct blk_mq_ctx *ctx, - struct blk_mq_hw_ctx *hctx) + struct request_queue *q, unsigned int flags, + struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx) { data->q = q; - data->gfp = gfp; - data->reserved = reserved; + data->flags = flags; data->ctx = ctx; data->hctx = hctx; } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 3457ac8c03e2..10bd8d0a9d9c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -173,7 +173,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { struct request *rq; - rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true); + rq = blk_mq_alloc_request(dd->queue, 0, BLK_MQ_REQ_RESERVED); return blk_mq_rq_to_pdu(rq); } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 5c8ba5484d86..fa742dddf3f8 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -464,7 +464,7 @@ static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) struct request *rq; struct bio *bio = rqd->bio; - rq = blk_mq_alloc_request(q, bio_rw(bio), GFP_KERNEL, 0); + rq = blk_mq_alloc_request(q, bio_rw(bio), 0); if (IS_ERR(rq)) return -ENOMEM; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 
9202d1a468d0..d5622f9164ad 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -470,7 +470,7 @@ static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) struct bio *bio = rqd->bio; struct nvme_nvm_command *cmd; - rq = blk_mq_alloc_request(q, bio_rw(bio), GFP_KERNEL, 0); + rq = blk_mq_alloc_request(q, bio_rw(bio), 0); if (IS_ERR(rq)) return -ENOMEM; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f3b53af789ef..b8a02221233c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1041,7 +1041,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, struct request *req; int ret; - req = blk_mq_alloc_request(q, write, GFP_KERNEL, false); + req = blk_mq_alloc_request(q, write, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -1094,7 +1094,8 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) struct nvme_cmd_info *cmd_info; struct request *req; - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true); + req = blk_mq_alloc_request(dev->admin_q, WRITE, + BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED); if (IS_ERR(req)) return PTR_ERR(req); @@ -1119,7 +1120,7 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct request *req; struct nvme_cmd_info *cmd_rq; - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + req = blk_mq_alloc_request(dev->admin_q, WRITE, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -1320,8 +1321,8 @@ static void nvme_abort_req(struct request *req) if (!dev->abort_limit) return; - abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, - false); + abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, + BLK_MQ_REQ_NOWAIT); if (IS_ERR(abort_req)) return; diff --git a/fs/block_dev.c b/fs/block_dev.c index c25639e907bd..aa1a45985889 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -395,7 +395,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); @@ -432,7 +432,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index daf17d70aeca..7fc9296b5742 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -188,8 +188,14 @@ void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_free_request(struct request *rq); void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); + +enum { + BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ + BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ +}; + struct request *blk_mq_alloc_request(struct request_queue *q, int rw, - gfp_t gfp, bool reserved); + unsigned int flags); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag); struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0d2b7927c1f..e711f294934c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -794,7 +794,7 @@ extern int scsi_cmd_ioctl(struct 
request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -extern int blk_queue_enter(struct request_queue *q, gfp_t gfp); +extern int blk_queue_enter(struct request_queue *q, bool nowait); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); From e0e827b9fc71fbed1a9cd246067c2a4dbd3ea220 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 2 Dec 2015 16:57:06 +0530 Subject: [PATCH 08/12] blk-mq: Reuse hardware context cpumask for tags hctx->cpumask is already populated, so let the tag cpumask follow it instead of rebuilding it with a separate for loop. Signed-off-by: Raghavendra K T Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 93a4e1956915..35da31841eda 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1850,6 +1850,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); + cpumask_copy(hctx->tags->cpumask, hctx->cpumask); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping @@ -1863,14 +1864,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } - - queue_for_each_ctx(q, ctx, i) { - if (!cpumask_test_cpu(i, online_mask)) - continue; - - hctx = q->mq_ops->map_queue(q, i); - cpumask_set_cpu(i, hctx->tags->cpumask); - } } static void queue_set_hctx_shared(struct request_queue *q, bool shared) From bffed457160ab48282ca6d0d58646b3bbc2fa554 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 2 Dec 2015 16:59:05 +0530 Subject: [PATCH 09/12] blk-mq: Avoid memoryless numa node encoded in hctx numa_node On architectures like powerpc we can have CPUs without any local memory attached to them (a.k.a. memoryless nodes). In such cases the CPU-to-node mapping can populate the block hctx->numa_node allocation hint with a node that has no real memory. Instead use local_memory_node(), which is guaranteed to return a node with memory; it is a no-op on architectures that do not support memoryless nodes.
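[ Illustrative sketch, not part of the patch: any per-CPU allocation should filter its node hint through local_memory_node() in the same way, so the hint always names a node that actually has memory. The helper below is hypothetical and only shows the pattern. ]

#include <linux/slab.h>
#include <linux/topology.h>

/*
 * Hypothetical helper: pick the allocation node for per-CPU queue data.
 * local_memory_node() falls back to the nearest node with memory when
 * cpu_to_node() names a memoryless node, and is a no-op on architectures
 * that do not support memoryless nodes.
 */
static void *alloc_per_cpu_queue_data(unsigned int cpu, size_t size)
{
	int node = local_memory_node(cpu_to_node(cpu));

	return kzalloc_node(size, GFP_KERNEL, node);
}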
Signed-off-by: Raghavendra K T Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 2 +- block/blk-mq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 8764c241e5bb..d0634bcf322f 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -113,7 +113,7 @@ int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) for_each_possible_cpu(i) { if (index == mq_map[i]) - return cpu_to_node(i); + return local_memory_node(cpu_to_node(i)); } return NUMA_NO_NODE; diff --git a/block/blk-mq.c b/block/blk-mq.c index 35da31841eda..6889d7183a2a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1790,7 +1790,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, * not, we remain on the home node of the device */ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = cpu_to_node(i); + hctx->numa_node = local_memory_node(cpu_to_node(i)); } } From cda22646adaa453519fac28222f20b0d73aa8562 Mon Sep 17 00:00:00 2001 From: Mike Krinkin Date: Thu, 3 Dec 2015 17:32:30 +0300 Subject: [PATCH 10/12] block: add call to split trace point There is a split tracepoint that is supposed to be fired when a bio is split, and it was called from bio_split() until commit 4b1faf931650d4a35b2a ("block: Kill bio_pair_split()"). Since then nothing reports splits, so this patch adds the trace_block_split() call back in blk_queue_split(), right after the split. Signed-off-by: Mike Krinkin Signed-off-by: Jens Axboe --- block/blk-merge.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 41a55ba0d78e..0e5643a5d1c3 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,6 +7,8 @@ #include <linux/blkdev.h> #include <linux/scatterlist.h> +#include <trace/events/block.h> + #include "blk.h" static struct bio *blk_bio_discard_split(struct request_queue *q, @@ -159,6 +161,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, split->bi_rw |= REQ_NOMERGE; bio_chain(split, *bio); + trace_block_split(q, split, (*bio)->bi_iter.bi_sector); generic_make_request(*bio); *bio = split; } From e36f6204288088fda50d1c84830340ccb70f85ff Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Jan 2016 15:08:39 -0700 Subject: [PATCH 11/12] block: split bios to max possible length This splits a bio in the middle of a vector to form the largest possible bio at the h/w's desired alignment, and guarantees that the bio being split will have some data. The criterion for splitting is changed from the max sectors to the h/w's optimal sector alignment, if it is provided. For h/w that advertises its block storage's underlying chunk size, it is a big performance win not to submit commands that cross chunk boundaries. If sector alignment is not provided, this patch uses the max sectors as before. This addresses the performance issue commit d380561113 attempted to fix, which was reverted due to a splitting logic error.
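[ Illustrative sketch, not code from the patch: with a power-of-two chunk size the split point is simply the number of sectors left before the next chunk boundary, which is what the blk_max_size_offset() check in the hunk below boils down to when chunk_sectors is set. The helper is hypothetical. ]

#include <linux/types.h>

/*
 * Hypothetical helper: how many 512-byte sectors fit from 'offset' up to
 * the next chunk boundary, assuming chunk_sectors is a power of two.
 * A bio that would cross the boundary is split at this length; e.g. with
 * a 256-sector (128K) chunk and offset 200, a 100-sector bio is split
 * after 56 sectors.
 */
static unsigned int sectors_to_chunk_boundary(sector_t offset,
					      unsigned int chunk_sectors)
{
	return chunk_sectors - (unsigned int)(offset & (chunk_sectors - 1));
}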
Signed-off-by: Keith Busch Cc: Jens Axboe Cc: Ming Lei Cc: Kent Overstreet Cc: # 4.4.x- Signed-off-by: Jens Axboe --- block/blk-merge.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 0e5643a5d1c3..237b087d3529 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -83,9 +83,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *new = NULL; bio_for_each_segment(bv, bio, iter) { - if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) - goto split; - /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. @@ -93,6 +90,22 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) goto split; + if (sectors + (bv.bv_len >> 9) > + blk_max_size_offset(q, bio->bi_iter.bi_sector)) { + /* + * Consider this a new segment if we're splitting in + * the middle of this vector. + */ + if (nsegs < queue_max_segments(q) && + sectors < blk_max_size_offset(q, + bio->bi_iter.bi_sector)) { + nsegs++; + sectors = blk_max_size_offset(q, + bio->bi_iter.bi_sector); + } + goto split; + } + if (bvprvp && blk_queue_cluster(q)) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; From ed8a9d2c816e0b0a2c5a605505e0f09ae9010dd6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 20 Nov 2015 22:18:43 +0100 Subject: [PATCH 12/12] block: use bd{grab,put}() instead of open-coding - bd_acquire() and bd_forget() open-code bdgrab() and bdput() - raw driver uses igrab() but never checks its return value and always holds another ref from bind_set() while calling it, so it's equivalent to bdgrab() Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- drivers/char/raw.c | 2 +- fs/block_dev.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 60316fbaf295..9b9809b709a5 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -71,7 +71,7 @@ static int raw_open(struct inode *inode, struct file *filp) err = -ENODEV; if (!bdev) goto out; - igrab(bdev->bd_inode); + bdgrab(bdev); err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open); if (err) goto out; diff --git a/fs/block_dev.c b/fs/block_dev.c index aa1a45985889..8b02c90ced87 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -696,7 +696,7 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev) { - ihold(bdev->bd_inode); + bdgrab(bdev); spin_unlock(&bdev_lock); return bdev; } @@ -712,7 +712,7 @@ static struct block_device *bd_acquire(struct inode *inode) * So, we can access it via ->i_mapping always * without igrab(). */ - ihold(bdev->bd_inode); + bdgrab(bdev); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; list_add(&inode->i_devices, &bdev->bd_inodes); @@ -735,7 +735,7 @@ void bd_forget(struct inode *inode) spin_unlock(&bdev_lock); if (bdev) - iput(bdev->bd_inode); + bdput(bdev); } /**
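[ For reference, a minimal usage sketch of the bdgrab()/bdput() pattern the last patch converges on; the caller below is hypothetical and assumes the bdev pointer itself is kept valid (e.g. under bdev_lock) while the reference is taken. ]

#include <linux/fs.h>

/*
 * Hypothetical caller: pin the block_device with bdgrab() before using it
 * and drop the reference with bdput() when done.  bdgrab() takes the
 * reference on bdev->bd_inode internally, which is what the open-coded
 * ihold()/iput() pairs above did by hand.
 */
static void use_block_device(struct block_device *bdev)
{
	bdgrab(bdev);
	/* ... bdev can be dereferenced safely here ... */
	bdput(bdev);
}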