From 6a23e05c2fe3c64ec012fd81e51e3ab51e4f2f9f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Oct 2018 20:49:26 -0600 Subject: [PATCH 01/13] dm: remove legacy request-based IO path dm supports both, and since we're killing off the legacy path in general, get rid of it in dm. Signed-off-by: Jens Axboe Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 11 -- drivers/md/dm-core.h | 10 -- drivers/md/dm-mpath.c | 14 +- drivers/md/dm-rq.c | 316 ++++-------------------------------------- drivers/md/dm-rq.h | 4 - drivers/md/dm-sysfs.c | 3 +- drivers/md/dm-table.c | 49 +------ drivers/md/dm.c | 21 +-- drivers/md/dm.h | 1 - 9 files changed, 36 insertions(+), 393 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 8b8c123cae66..3db222509e44 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -215,17 +215,6 @@ config BLK_DEV_DM If unsure, say N. -config DM_MQ_DEFAULT - bool "request-based DM: use blk-mq I/O path by default" - depends on BLK_DEV_DM - ---help--- - This option enables the blk-mq based I/O path for request-based - DM devices by default. With the option the dm_mod.use_blk_mq - module/boot option defaults to Y, without it to N, but it can - still be overriden either way. - - If unsure say N. - config DM_DEBUG bool "Device mapper debugging support" depends on BLK_DEV_DM diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 7d480c930eaf..224d44503a06 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -112,18 +112,8 @@ struct mapped_device { struct dm_stats stats; - struct kthread_worker kworker; - struct task_struct *kworker_task; - - /* for request-based merge heuristic in dm_request_fn() */ - unsigned seq_rq_merge_deadline_usecs; - int last_rq_rw; - sector_t last_rq_pos; - ktime_t last_rq_start_time; - /* for blk-mq request-based DM support */ struct blk_mq_tag_set *tag_set; - bool use_blk_mq:1; bool init_tio_pdu:1; struct srcu_struct io_barrier; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 419362c2d8ac..a24ed3973e7c 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -203,14 +203,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) { if (m->queue_mode == DM_TYPE_NONE) { - /* - * Default to request-based. - */ - if (dm_use_blk_mq(dm_table_get_md(ti->table))) - m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; - else - m->queue_mode = DM_TYPE_REQUEST_BASED; - + m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; } else if (m->queue_mode == DM_TYPE_BIO_BASED) { INIT_WORK(&m->process_queued_bios, process_queued_bios); /* @@ -537,10 +530,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, * get the queue busy feedback (via BLK_STS_RESOURCE), * otherwise I/O merging can suffer. 
*/ - if (q->mq_ops) - return DM_MAPIO_REQUEUE; - else - return DM_MAPIO_DELAY_REQUEUE; + return DM_MAPIO_REQUEUE; } clone->bio = clone->biotail = NULL; clone->rq_disk = bdev->bd_disk; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 6e547b8dd298..7cd36e4d1310 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -23,19 +23,6 @@ static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; #define RESERVED_REQUEST_BASED_IOS 256 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; -static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT); - -bool dm_use_blk_mq_default(void) -{ - return use_blk_mq; -} - -bool dm_use_blk_mq(struct mapped_device *md) -{ - return md->use_blk_mq; -} -EXPORT_SYMBOL_GPL(dm_use_blk_mq); - unsigned dm_get_reserved_rq_based_ios(void) { return __dm_get_module_param(&reserved_rq_based_ios, @@ -59,41 +46,13 @@ int dm_request_based(struct mapped_device *md) return queue_is_rq_based(md->queue); } -static void dm_old_start_queue(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (blk_queue_stopped(q)) - blk_start_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void dm_mq_start_queue(struct request_queue *q) +void dm_start_queue(struct request_queue *q) { blk_mq_unquiesce_queue(q); blk_mq_kick_requeue_list(q); } -void dm_start_queue(struct request_queue *q) -{ - if (!q->mq_ops) - dm_old_start_queue(q); - else - dm_mq_start_queue(q); -} - -static void dm_old_stop_queue(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (!blk_queue_stopped(q)) - blk_stop_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void dm_mq_stop_queue(struct request_queue *q) +void dm_stop_queue(struct request_queue *q) { if (blk_mq_queue_stopped(q)) return; @@ -101,14 +60,6 @@ static void dm_mq_stop_queue(struct request_queue *q) blk_mq_quiesce_queue(q); } -void dm_stop_queue(struct request_queue *q) -{ - if (!q->mq_ops) - dm_old_stop_queue(q); - else - dm_mq_stop_queue(q); -} - /* * Partial completion handling for request-based dm */ @@ -179,27 +130,12 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { - struct request_queue *q = md->queue; - unsigned long flags; - atomic_dec(&md->pending[rw]); /* nudge anyone waiting on suspend queue */ if (!md_in_flight(md)) wake_up(&md->wait); - /* - * Run this off this callpath, as drivers could invoke end_io while - * inside their request_fn (and holding the queue lock). Calling - * back into ->request_fn() could deadlock attempting to grab the - * queue lock again. - */ - if (!q->mq_ops && run_queue) { - spin_lock_irqsave(q->queue_lock, flags); - blk_run_queue_async(q); - spin_unlock_irqrestore(q->queue_lock, flags); - } - /* * dm_put() must be at the end of this function. See the comment above */ @@ -222,27 +158,10 @@ static void dm_end_request(struct request *clone, blk_status_t error) tio->ti->type->release_clone_rq(clone); rq_end_stats(md, rq); - if (!rq->q->mq_ops) - blk_end_request_all(rq, error); - else - blk_mq_end_request(rq, error); + blk_mq_end_request(rq, error); rq_completed(md, rw, true); } -/* - * Requeue the original request of a clone. 
- */ -static void dm_old_requeue_request(struct request *rq, unsigned long delay_ms) -{ - struct request_queue *q = rq->q; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_requeue_request(q, rq); - blk_delay_queue(q, delay_ms); - spin_unlock_irqrestore(q->queue_lock, flags); -} - static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs) { blk_mq_delay_kick_requeue_list(q, msecs); @@ -273,11 +192,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ tio->ti->type->release_clone_rq(tio->clone); } - if (!rq->q->mq_ops) - dm_old_requeue_request(rq, delay_ms); - else - dm_mq_delay_requeue_request(rq, delay_ms); - + dm_mq_delay_requeue_request(rq, delay_ms); rq_completed(md, rw, false); } @@ -340,10 +255,7 @@ static void dm_softirq_done(struct request *rq) rq_end_stats(md, rq); rw = rq_data_dir(rq); - if (!rq->q->mq_ops) - blk_end_request_all(rq, tio->error); - else - blk_mq_end_request(rq, tio->error); + blk_mq_end_request(rq, tio->error); rq_completed(md, rw, false); return; } @@ -363,17 +275,14 @@ static void dm_complete_request(struct request *rq, blk_status_t error) struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - if (!rq->q->mq_ops) - blk_complete_request(rq); - else - blk_mq_complete_request(rq); + blk_mq_complete_request(rq); } /* * Complete the not-mapped clone and the original request with the error status * through softirq context. * Target's rq_end_io() function isn't called. - * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. + * This may be used when the target's clone_and_map_rq() function fails. */ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) { @@ -381,21 +290,10 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) dm_complete_request(rq, error); } -/* - * Called with the clone's queue lock held (in the case of .request_fn) - */ static void end_clone_request(struct request *clone, blk_status_t error) { struct dm_rq_target_io *tio = clone->end_io_data; - /* - * Actual request completion is done in a softirq context which doesn't - * hold the clone's queue lock. 
Otherwise, deadlock could occur because: - * - another request may be submitted by the upper level driver - * of the stacking during the completion - * - the submission which requires queue lock may be done - * against this clone's queue - */ dm_complete_request(tio->orig, error); } @@ -446,8 +344,6 @@ static int setup_clone(struct request *clone, struct request *rq, return 0; } -static void map_tio_request(struct kthread_work *work); - static void init_tio(struct dm_rq_target_io *tio, struct request *rq, struct mapped_device *md) { @@ -464,8 +360,6 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, */ if (!md->init_tio_pdu) memset(&tio->info, 0, sizeof(tio->info)); - if (md->kworker_task) - kthread_init_work(&tio->work, map_tio_request); } /* @@ -504,10 +398,7 @@ check_again: blk_rq_unprep_clone(clone); tio->ti->type->release_clone_rq(clone); tio->clone = NULL; - if (!rq->q->mq_ops) - r = DM_MAPIO_DELAY_REQUEUE; - else - r = DM_MAPIO_REQUEUE; + r = DM_MAPIO_REQUEUE; goto check_again; } break; @@ -530,20 +421,23 @@ check_again: return r; } +/* DEPRECATED: previously used for request-based merge heuristic in dm_request_fn() */ +ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) +{ + return sprintf(buf, "%u\n", 0); +} + +ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, + const char *buf, size_t count) +{ + return count; +} + static void dm_start_request(struct mapped_device *md, struct request *orig) { - if (!orig->q->mq_ops) - blk_start_request(orig); - else - blk_mq_start_request(orig); + blk_mq_start_request(orig); atomic_inc(&md->pending[rq_data_dir(orig)]); - if (md->seq_rq_merge_deadline_usecs) { - md->last_rq_pos = rq_end_sector(orig); - md->last_rq_rw = rq_data_dir(orig); - md->last_rq_start_time = ktime_get(); - } - if (unlikely(dm_stats_used(&md->stats))) { struct dm_rq_target_io *tio = tio_from_request(orig); tio->duration_jiffies = jiffies; @@ -563,8 +457,10 @@ static void dm_start_request(struct mapped_device *md, struct request *orig) dm_get(md); } -static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq) +static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { + struct mapped_device *md = set->driver_data; struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); /* @@ -581,163 +477,6 @@ static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq) return 0; } -static int dm_rq_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp) -{ - return __dm_rq_init_rq(q->rq_alloc_data, rq); -} - -static void map_tio_request(struct kthread_work *work) -{ - struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); - - if (map_request(tio) == DM_MAPIO_REQUEUE) - dm_requeue_original_request(tio, false); -} - -ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) -{ - return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs); -} - -#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000 - -ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, - const char *buf, size_t count) -{ - unsigned deadline; - - if (dm_get_md_type(md) != DM_TYPE_REQUEST_BASED) - return count; - - if (kstrtouint(buf, 10, &deadline)) - return -EINVAL; - - if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS) - deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS; - - md->seq_rq_merge_deadline_usecs = deadline; - - return count; -} - -static bool 
dm_old_request_peeked_before_merge_deadline(struct mapped_device *md) -{ - ktime_t kt_deadline; - - if (!md->seq_rq_merge_deadline_usecs) - return false; - - kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC); - kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline); - - return !ktime_after(ktime_get(), kt_deadline); -} - -/* - * q->request_fn for old request-based dm. - * Called with the queue lock held. - */ -static void dm_old_request_fn(struct request_queue *q) -{ - struct mapped_device *md = q->queuedata; - struct dm_target *ti = md->immutable_target; - struct request *rq; - struct dm_rq_target_io *tio; - sector_t pos = 0; - - if (unlikely(!ti)) { - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - - if (unlikely(!map)) { - dm_put_live_table(md, srcu_idx); - return; - } - ti = dm_table_find_target(map, pos); - dm_put_live_table(md, srcu_idx); - } - - /* - * For suspend, check blk_queue_stopped() and increment - * ->pending within a single queue_lock not to increment the - * number of in-flight I/Os after the queue is stopped in - * dm_suspend(). - */ - while (!blk_queue_stopped(q)) { - rq = blk_peek_request(q); - if (!rq) - return; - - /* always use block 0 to find the target for flushes for now */ - pos = 0; - if (req_op(rq) != REQ_OP_FLUSH) - pos = blk_rq_pos(rq); - - if ((dm_old_request_peeked_before_merge_deadline(md) && - md_in_flight(md) && rq->bio && !bio_multiple_segments(rq->bio) && - md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || - (ti->type->busy && ti->type->busy(ti))) { - blk_delay_queue(q, 10); - return; - } - - dm_start_request(md, rq); - - tio = tio_from_request(rq); - init_tio(tio, rq, md); - /* Establish tio->ti before queuing work (map_tio_request) */ - tio->ti = ti; - kthread_queue_work(&md->kworker, &tio->work); - BUG_ON(!irqs_disabled()); - } -} - -/* - * Fully initialize a .request_fn request-based queue. 
- */ -int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) -{ - struct dm_target *immutable_tgt; - - /* Fully initialize the queue */ - md->queue->cmd_size = sizeof(struct dm_rq_target_io); - md->queue->rq_alloc_data = md; - md->queue->request_fn = dm_old_request_fn; - md->queue->init_rq_fn = dm_rq_init_rq; - - immutable_tgt = dm_table_get_immutable_target(t); - if (immutable_tgt && immutable_tgt->per_io_data_size) { - /* any target-specific per-io data is immediately after the tio */ - md->queue->cmd_size += immutable_tgt->per_io_data_size; - md->init_tio_pdu = true; - } - if (blk_init_allocated_queue(md->queue) < 0) - return -EINVAL; - - /* disable dm_old_request_fn's merge heuristic by default */ - md->seq_rq_merge_deadline_usecs = 0; - - blk_queue_softirq_done(md->queue, dm_softirq_done); - - /* Initialize the request-based DM worker thread */ - kthread_init_worker(&md->kworker); - md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, - "kdmwork-%s", dm_device_name(md)); - if (IS_ERR(md->kworker_task)) { - int error = PTR_ERR(md->kworker_task); - md->kworker_task = NULL; - return error; - } - - return 0; -} - -static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) -{ - return __dm_rq_init_rq(set->driver_data, rq); -} - static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -790,11 +529,6 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) struct dm_target *immutable_tgt; int err; - if (!dm_table_all_blk_mq_devices(t)) { - DMERR("request-based dm-mq may only be stacked on blk-mq device(s)"); - return -EINVAL; - } - md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id); if (!md->tag_set) return -ENOMEM; @@ -845,6 +579,8 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md) module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); +/* Unused, but preserved for userspace compatibility */ +static bool use_blk_mq = true; module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h index f43c45460aac..b39245545229 100644 --- a/drivers/md/dm-rq.h +++ b/drivers/md/dm-rq.h @@ -46,10 +46,6 @@ struct dm_rq_clone_bio_info { struct bio clone; }; -bool dm_use_blk_mq_default(void); -bool dm_use_blk_mq(struct mapped_device *md); - -int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t); int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t); void dm_mq_cleanup_mapped_device(struct mapped_device *md); diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index c209b8a19b84..a05fcd50e1b9 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -92,7 +92,8 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf) { - sprintf(buf, "%d\n", dm_use_blk_mq(md)); + /* Purely for userspace compatibility */ + sprintf(buf, "%d\n", true); return strlen(buf); } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3d0e2c198f06..96e152c339a6 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -47,7 +47,6 @@ struct dm_table { bool integrity_supported:1; bool singleton:1; - bool all_blk_mq:1; unsigned integrity_added:1; 
/* @@ -910,21 +909,10 @@ static bool dm_table_supports_dax(struct dm_table *t) static bool dm_table_does_not_support_partial_completion(struct dm_table *t); -struct verify_rq_based_data { - unsigned sq_count; - unsigned mq_count; -}; - static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); - struct verify_rq_based_data *v = data; - - if (q->mq_ops) - v->mq_count++; - else - v->sq_count++; return queue_is_rq_based(q); } @@ -933,7 +921,6 @@ static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0}; struct dm_target *tgt; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -1022,11 +1009,9 @@ verify_rq_based: int srcu_idx; struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx); - /* inherit live table's type and all_blk_mq */ - if (live_table) { + /* inherit live table's type */ + if (live_table) t->type = live_table->type; - t->all_blk_mq = live_table->all_blk_mq; - } dm_put_live_table(t->md, srcu_idx); return 0; } @@ -1042,21 +1027,10 @@ verify_rq_based: /* Non-request-stackable devices can't be used for request-based dm */ if (!tgt->type->iterate_devices || - !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) { + !tgt->type->iterate_devices(tgt, device_is_rq_based, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } - if (v.sq_count && v.mq_count) { - DMERR("table load rejected: not all devices are blk-mq request-stackable"); - return -EINVAL; - } - t->all_blk_mq = v.mq_count > 0; - - if (!t->all_blk_mq && - (t->type == DM_TYPE_MQ_REQUEST_BASED || t->type == DM_TYPE_NVME_BIO_BASED)) { - DMERR("table load rejected: all devices are not blk-mq request-stackable"); - return -EINVAL; - } return 0; } @@ -1105,11 +1079,6 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } -bool dm_table_all_blk_mq_devices(struct dm_table *t) -{ - return t->all_blk_mq; -} - static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); @@ -2083,22 +2052,14 @@ void dm_table_run_md_queue_async(struct dm_table *t) { struct mapped_device *md; struct request_queue *queue; - unsigned long flags; if (!dm_table_request_based(t)) return; md = dm_table_get_md(t); queue = dm_get_md_queue(md); - if (queue) { - if (queue->mq_ops) - blk_mq_run_hw_queues(queue, true); - else { - spin_lock_irqsave(queue->queue_lock, flags); - blk_run_queue_async(queue); - spin_unlock_irqrestore(queue->queue_lock, flags); - } - } + if (queue) + blk_mq_run_hw_queues(queue, true); } EXPORT_SYMBOL(dm_table_run_md_queue_async); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 45abb54037fc..0ce00c6f5f9a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1808,8 +1808,6 @@ static void dm_wq_work(struct work_struct *work); static void dm_init_normal_md_queue(struct mapped_device *md) { - md->use_blk_mq = false; - /* * Initialize aspects of queue that aren't relevant for blk-mq */ @@ -1820,8 +1818,6 @@ static void cleanup_mapped_device(struct mapped_device *md) { if (md->wq) destroy_workqueue(md->wq); - if (md->kworker_task) - kthread_stop(md->kworker_task); bioset_exit(&md->bs); bioset_exit(&md->io_bs); @@ -1888,7 +1884,6 @@ static struct mapped_device 
*alloc_dev(int minor) goto bad_io_barrier; md->numa_node_id = numa_node_id; - md->use_blk_mq = dm_use_blk_mq_default(); md->init_tio_pdu = false; md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); @@ -1919,7 +1914,6 @@ static struct mapped_device *alloc_dev(int minor) INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); - md->kworker_task = NULL; md->disk->major = _major; md->disk->first_minor = minor; @@ -2219,13 +2213,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) switch (type) { case DM_TYPE_REQUEST_BASED: - dm_init_normal_md_queue(md); - r = dm_old_init_request_queue(md, t); - if (r) { - DMERR("Cannot initialize queue for request-based mapped device"); - return r; - } - break; case DM_TYPE_MQ_REQUEST_BASED: r = dm_mq_init_request_queue(md, t); if (r) { @@ -2331,9 +2318,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) blk_set_queue_dying(md->queue); - if (dm_request_based(md) && md->kworker_task) - kthread_flush_worker(&md->kworker); - /* * Take suspend_lock so that presuspend and postsuspend methods * do not race with internal suspend. @@ -2586,11 +2570,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. */ - if (dm_request_based(md)) { + if (dm_request_based(md)) dm_stop_queue(md->queue); - if (md->kworker_task) - kthread_flush_worker(&md->kworker); - } flush_workqueue(md->wq); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 114a81b27c37..2d539b82ec08 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -70,7 +70,6 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_bio_based(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); -bool dm_table_all_blk_mq_devices(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); From 953923c09fe83255ae11845db1c9eb576ba73df8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 11 Oct 2018 11:06:29 -0400 Subject: [PATCH 02/13] dm: rename DM_TYPE_MQ_REQUEST_BASED to DM_TYPE_REQUEST_BASED Now that request-based DM is only using blk-mq, there is no need to differentiate between legacy "rq" and new "mq". We're back to a single request-based DM -- and there was much rejoicing! 
Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 14 +++++--------- drivers/md/dm-table.c | 7 +------ drivers/md/dm.c | 2 -- include/linux/device-mapper.h | 5 ++--- 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index a24ed3973e7c..d6a66921daf4 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -203,7 +203,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) { if (m->queue_mode == DM_TYPE_NONE) { - m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; + m->queue_mode = DM_TYPE_REQUEST_BASED; } else if (m->queue_mode == DM_TYPE_BIO_BASED) { INIT_WORK(&m->process_queued_bios, process_queued_bios); /* @@ -658,7 +658,7 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio) static void process_queued_io_list(struct multipath *m) { - if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) + if (m->queue_mode == DM_TYPE_REQUEST_BASED) dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); else if (m->queue_mode == DM_TYPE_BIO_BASED) queue_work(kmultipathd, &m->process_queued_bios); @@ -1079,10 +1079,9 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) if (!strcasecmp(queue_mode_name, "bio")) m->queue_mode = DM_TYPE_BIO_BASED; - else if (!strcasecmp(queue_mode_name, "rq")) + else if (!strcasecmp(queue_mode_name, "rq") || + !strcasecmp(queue_mode_name, "mq")) m->queue_mode = DM_TYPE_REQUEST_BASED; - else if (!strcasecmp(queue_mode_name, "mq")) - m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; else { ti->error = "Unknown 'queue_mode' requested"; r = -EINVAL; @@ -1716,9 +1715,6 @@ static void multipath_status(struct dm_target *ti, status_type_t type, case DM_TYPE_BIO_BASED: DMEMIT("queue_mode bio "); break; - case DM_TYPE_MQ_REQUEST_BASED: - DMEMIT("queue_mode mq "); - break; default: WARN_ON_ONCE(true); break; @@ -1962,7 +1958,7 @@ static int multipath_busy(struct dm_target *ti) /* no paths available, for blk-mq: rely on IO mapping to delay requeue */ if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); + return (m->queue_mode != DM_TYPE_REQUEST_BASED); /* Guess which priority_group will be used at next mapping time */ pg = READ_ONCE(m->current_pg); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 96e152c339a6..eeea32bb6a3e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -871,8 +871,7 @@ static bool __table_type_bio_based(enum dm_queue_mode table_type) static bool __table_type_request_based(enum dm_queue_mode table_type) { - return (table_type == DM_TYPE_REQUEST_BASED || - table_type == DM_TYPE_MQ_REQUEST_BASED); + return table_type == DM_TYPE_REQUEST_BASED; } void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) @@ -986,10 +985,6 @@ verify_bio_based: BUG_ON(!request_based); /* No targets in this table */ - /* - * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by - * having a compatible target use dm_table_set_type. 
- */ t->type = DM_TYPE_REQUEST_BASED; verify_rq_based: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0ce00c6f5f9a..bf36e2635ea7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2213,7 +2213,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) switch (type) { case DM_TYPE_REQUEST_BASED: - case DM_TYPE_MQ_REQUEST_BASED: r = dm_mq_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based dm-mq mapped device"); @@ -2946,7 +2945,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu goto out; break; case DM_TYPE_REQUEST_BASED: - case DM_TYPE_MQ_REQUEST_BASED: pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_io_data_size is used for blk-mq pdu at queue allocation */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 6fb0808e87c8..8d937754aa0c 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -26,9 +26,8 @@ enum dm_queue_mode { DM_TYPE_NONE = 0, DM_TYPE_BIO_BASED = 1, DM_TYPE_REQUEST_BASED = 2, - DM_TYPE_MQ_REQUEST_BASED = 3, - DM_TYPE_DAX_BIO_BASED = 4, - DM_TYPE_NVME_BIO_BASED = 5, + DM_TYPE_DAX_BIO_BASED = 3, + DM_TYPE_NVME_BIO_BASED = 4, }; typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; From cef6f55a9fb4f6d6f9df0f772aa64cf159997466 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 11 Oct 2018 17:44:04 -0400 Subject: [PATCH 03/13] dm table: require that request-based DM be layered on blk-mq devices Now that request-based DM (multipath) is blk-mq only: this restriction is required while the legacy request-based IO path still exists. Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index eeea32bb6a3e..618edfc3846f 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -908,10 +908,21 @@ static bool dm_table_supports_dax(struct dm_table *t) static bool dm_table_does_not_support_partial_completion(struct dm_table *t); +struct verify_rq_based_data { + unsigned sq_count; + unsigned mq_count; +}; + static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); + struct verify_rq_based_data *v = data; + + if (q->mq_ops) + v->mq_count++; + else + v->sq_count++; return queue_is_rq_based(q); } @@ -920,6 +931,7 @@ static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; + struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0}; struct dm_target *tgt; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -1022,10 +1034,14 @@ verify_rq_based: /* Non-request-stackable devices can't be used for request-based dm */ if (!tgt->type->iterate_devices || - !tgt->type->iterate_devices(tgt, device_is_rq_based, NULL)) { + !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } + if (v.sq_count > 0) { + DMERR("table load rejected: not all devices are blk-mq request-stackable"); + return -EINVAL; + } return 0; } From 22d4c291f58752d14efb8a185b772a3fc3e947d2 Mon Sep 17 00:00:00 2001 From: John Pittman Date: Thu, 23 Aug 2018 13:35:55 -0400 Subject: [PATCH 04/13] dm thin: use refcount_t for thin_c 
reference counting The API surrounding refcount_t should be used in place of atomic_t when variables are being used as reference counters. It can potentially prevent reference counter overflows and use-after-free conditions. In the dm thin layer, one such example is tc->refcount. Change this from the atomic_t API to the refcount_t API to prevent mentioned conditions. Signed-off-by: John Pittman Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index aaf1ad481ee8..0bd8d498b3b9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -325,7 +325,7 @@ struct thin_c { * Ensures the thin is not destroyed until the worker has finished * iterating the active_thins list. */ - atomic_t refcount; + refcount_t refcount; struct completion can_destroy; }; @@ -4044,12 +4044,12 @@ static struct target_type pool_target = { *--------------------------------------------------------------*/ static void thin_get(struct thin_c *tc) { - atomic_inc(&tc->refcount); + refcount_inc(&tc->refcount); } static void thin_put(struct thin_c *tc) { - if (atomic_dec_and_test(&tc->refcount)) + if (refcount_dec_and_test(&tc->refcount)) complete(&tc->can_destroy); } @@ -4193,7 +4193,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -EINVAL; goto bad; } - atomic_set(&tc->refcount, 1); + refcount_set(&tc->refcount, 1); init_completion(&tc->can_destroy); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); spin_unlock_irqrestore(&tc->pool->lock, flags); From 092b5648760a23656f6dcc83b74f4cf661094aca Mon Sep 17 00:00:00 2001 From: John Pittman Date: Thu, 23 Aug 2018 13:35:57 -0400 Subject: [PATCH 05/13] dm zoned: target: use refcount_t for dm zoned reference counters The API surrounding refcount_t should be used in place of atomic_t when variables are being used as reference counters. This API can prevent issues such as counter overflows and use-after-free conditions. Within the dm zoned target stack, the atomic_t API is used for bioctx->ref and cw->refcount. Change these to use refcount_t, avoiding the issues mentioned. 
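For reference, a minimal sketch of the refcount_t get/put pattern that this and the preceding dm-thin conversion rely on (illustrative only; the struct and function names below are placeholders, not code from either patch):

  #include <linux/refcount.h>
  #include <linux/completion.h>
  #include <linux/slab.h>

  /* Placeholder object using the same lifetime scheme as thin_c. */
  struct demo_obj {
      refcount_t ref;
      struct completion can_destroy;
  };

  static struct demo_obj *demo_alloc(void)
  {
      struct demo_obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

      if (!o)
          return NULL;
      refcount_set(&o->ref, 1);       /* holder's initial reference */
      init_completion(&o->can_destroy);
      return o;
  }

  static void demo_get(struct demo_obj *o)
  {
      /*
       * Unlike atomic_inc(), refcount_inc() warns on an
       * increment-from-zero and saturates instead of wrapping
       * on overflow.
       */
      refcount_inc(&o->ref);
  }

  static void demo_put(struct demo_obj *o)
  {
      /* Dropping the last reference signals the destroyer. */
      if (refcount_dec_and_test(&o->ref))
          complete(&o->can_destroy);
  }

The same pattern appears in thin_get()/thin_put() above and in the bioctx and chunk-work hunks below.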
Signed-off-by: John Pittman Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-target.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index a44183ff4be0..fa36825c1eff 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -19,7 +19,7 @@ struct dmz_bioctx { struct dmz_target *target; struct dm_zone *zone; struct bio *bio; - atomic_t ref; + refcount_t ref; blk_status_t status; }; @@ -28,7 +28,7 @@ struct dmz_bioctx { */ struct dm_chunk_work { struct work_struct work; - atomic_t refcount; + refcount_t refcount; struct dmz_target *target; unsigned int chunk; struct bio_list bio_list; @@ -115,7 +115,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone, if (nr_blocks == dmz_bio_blocks(bio)) { /* Setup and submit the BIO */ bio->bi_iter.bi_sector = sector; - atomic_inc(&bioctx->ref); + refcount_inc(&bioctx->ref); generic_make_request(bio); return 0; } @@ -134,7 +134,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone, bio_advance(bio, clone->bi_iter.bi_size); /* Submit the clone */ - atomic_inc(&bioctx->ref); + refcount_inc(&bioctx->ref); generic_make_request(clone); return 0; @@ -240,7 +240,7 @@ static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone, /* Setup and submit the BIO */ bio_set_dev(bio, dmz->dev->bdev); bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); - atomic_inc(&bioctx->ref); + refcount_inc(&bioctx->ref); generic_make_request(bio); if (dmz_is_seq(zone)) @@ -456,7 +456,7 @@ out: */ static inline void dmz_get_chunk_work(struct dm_chunk_work *cw) { - atomic_inc(&cw->refcount); + refcount_inc(&cw->refcount); } /* @@ -465,7 +465,7 @@ static inline void dmz_get_chunk_work(struct dm_chunk_work *cw) */ static void dmz_put_chunk_work(struct dm_chunk_work *cw) { - if (atomic_dec_and_test(&cw->refcount)) { + if (refcount_dec_and_test(&cw->refcount)) { WARN_ON(!bio_list_empty(&cw->bio_list)); radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk); kfree(cw); @@ -546,7 +546,7 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) goto out; INIT_WORK(&cw->work, dmz_chunk_work); - atomic_set(&cw->refcount, 0); + refcount_set(&cw->refcount, 0); cw->target = dmz; cw->chunk = chunk; bio_list_init(&cw->bio_list); @@ -599,7 +599,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) bioctx->target = dmz; bioctx->zone = NULL; bioctx->bio = bio; - atomic_set(&bioctx->ref, 1); + refcount_set(&bioctx->ref, 1); bioctx->status = BLK_STS_OK; /* Set the BIO pending in the flush list */ @@ -633,7 +633,7 @@ static int dmz_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error if (bioctx->status == BLK_STS_OK && *error) bioctx->status = *error; - if (!atomic_dec_and_test(&bioctx->ref)) + if (!refcount_dec_and_test(&bioctx->ref)) return DM_ENDIO_INCOMPLETE; /* Done */ From bab5d988841e58fec6ae22f486905ddde2d715f4 Mon Sep 17 00:00:00 2001 From: Igor Stoppa Date: Fri, 7 Sep 2018 20:03:37 +0300 Subject: [PATCH 06/13] dm: remove unnecessary unlikely() around WARN_ON_ONCE() WARN_ON() already contains an unlikely(), so it's not necessary to wrap it into another. 
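To make the reasoning concrete, a minimal sketch (illustrative only; foo() and cond are placeholders): WARN_ON_ONCE() already evaluates its condition under unlikely() and returns that condition, so callers can branch on it directly:

  #include <linux/bug.h>
  #include <linux/errno.h>
  #include <linux/types.h>

  static int foo(bool cond)
  {
      /*
       * WARN_ON_ONCE(cond) expands to roughly unlikely(cond) plus a
       * one-time warning, and yields the condition itself, so wrapping
       * it in another unlikely() adds nothing.
       */
      if (WARN_ON_ONCE(cond))
          return -EIO;

      return 0;
  }

The same reasoning applies to both call sites converted here (dm-cache-policy-smq.c and dm.c).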
Signed-off-by: Igor Stoppa Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-smq.c | 2 +- drivers/md/dm.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 1b5b9ad9e492..b61aac00ff40 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1200,7 +1200,7 @@ static void queue_demotion(struct smq_policy *mq) struct policy_work work; struct entry *e; - if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) + if (WARN_ON_ONCE(!mq->migrations_allowed)) return; e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index bf36e2635ea7..1fbc28ab157c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1666,7 +1666,7 @@ static blk_qc_t __process_bio(struct mapped_device *md, * Defend against IO still getting in during teardown * - as was seen for a time with nvme-fcloop */ - if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) { + if (WARN_ON_ONCE(!ti || !dm_target_is_valid(ti))) { error = -EIO; goto out; } From 800a7340ab7dd667edf95e74d8e4f23a17e87076 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Wed, 3 Oct 2018 11:43:59 -0500 Subject: [PATCH 07/13] dm ioctl: harden copy_params()'s copy_from_user() from malicious users In copy_params(), the struct 'dm_ioctl' is first copied from the user space buffer 'user' to 'param_kernel' and the field 'data_size' is checked against 'minimum_data_size' (size of 'struct dm_ioctl' payload up to its 'data' member). If the check fails, an error code EINVAL will be returned. Otherwise, param_kernel->data_size is used to do a second copy, which copies from the same user-space buffer to 'dmi'. After the second copy, only 'dmi->data_size' is checked against 'param_kernel->data_size'. Given that the buffer 'user' resides in the user space, a malicious user-space process can race to change the content in the buffer between the two copies. This way, the attacker can inject inconsistent data into 'dmi' (versus previously validated 'param_kernel'). Fix redundant copying of 'minimum_data_size' from user-space buffer by using the first copy stored in 'param_kernel'. Also remove the 'data_size' check after the second copy because it is now unnecessary. Cc: stable@vger.kernel.org Signed-off-by: Wenwen Wang Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index b810ea77e6b1..f666778ad237 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1720,8 +1720,7 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla } static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel, - int ioctl_flags, - struct dm_ioctl **param, int *param_flags) + int ioctl_flags, struct dm_ioctl **param, int *param_flags) { struct dm_ioctl *dmi; int secure_data; @@ -1762,18 +1761,13 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern *param_flags |= DM_PARAMS_MALLOC; - if (copy_from_user(dmi, user, param_kernel->data_size)) - goto bad; + /* Copy from param_kernel (which was already copied from user) */ + memcpy(dmi, param_kernel, minimum_data_size); + if (copy_from_user(&dmi->data, (char __user *)user + minimum_data_size, + param_kernel->data_size - minimum_data_size)) + goto bad; data_copied: - /* - * Abort if something changed the ioctl data while it was being copied. 
- */ - if (dmi->data_size != param_kernel->data_size) { - DMERR("rejecting ioctl: data size modified while processing parameters"); - goto bad; - } - /* Wipe the user buffer so we do not return it to userspace */ if (secure_data && clear_user(user, param_kernel->data_size)) goto bad; From f349b0a3e1f0d184374936f1b2a49352f8a4b1c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 9 Oct 2018 22:13:42 +0200 Subject: [PATCH 08/13] dm: add dm_table_device_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a shortcut for dm_device_name(dm_table_get_md(t)). Signed-off-by: Michał Mirosław Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 6 ++++++ include/linux/device-mapper.h | 1 + 2 files changed, 7 insertions(+) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 618edfc3846f..49ab0cbef739 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2059,6 +2059,12 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) } EXPORT_SYMBOL(dm_table_get_md); +const char *dm_table_device_name(struct dm_table *t) +{ + return dm_device_name(t->md); +} +EXPORT_SYMBOL_GPL(dm_table_device_name); + void dm_table_run_md_queue_async(struct dm_table *t) { struct mapped_device *md; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 8d937754aa0c..d7bee8669f10 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -489,6 +489,7 @@ sector_t dm_table_get_size(struct dm_table *t); unsigned int dm_table_get_num_targets(struct dm_table *t); fmode_t dm_table_get_mode(struct dm_table *t); struct mapped_device *dm_table_get_md(struct dm_table *t); +const char *dm_table_device_name(struct dm_table *t); /* * Trigger an event. From ed0302e83098db9b3a8c27f36c93deeafd6963d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 9 Oct 2018 22:13:43 +0200 Subject: [PATCH 09/13] dm crypt: make workqueue names device-specific MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make cpu-usage debugging easier by naming workqueues per device. Example ps output: root 413 0.0 0.0 0 0 ? I< paź02 0:00 [kcryptd_io/253:0] root 414 0.0 0.0 0 0 ? I< paź02 0:00 [kcryptd/253:0] root 415 0.0 0.0 0 0 ? S paź02 1:10 [dmcrypt_write/253:0] root 465 0.0 0.0 0 0 ? I< paź02 0:00 [kcryptd_io/253:2] root 466 0.0 0.0 0 0 ? I< paź02 0:00 [kcryptd/253:2] root 467 0.0 0.0 0 0 ? S paź02 2:06 [dmcrypt_write/253:2] root 15359 0.2 0.0 0 0 ? I< 19:43 0:25 [kworker/u17:8-kcryptd/253:0] root 16563 0.2 0.0 0 0 ? I< 20:10 0:18 [kworker/u17:0-kcryptd/253:2] root 23205 0.1 0.0 0 0 ? I< 21:21 0:04 [kworker/u17:4-kcryptd/253:0] root 13383 0.1 0.0 0 0 ? I< 21:32 0:02 [kworker/u17:2-kcryptd/253:2] root 2610 0.1 0.0 0 0 ? I< 21:42 0:01 [kworker/u17:12-kcryptd/253:2] root 20124 0.1 0.0 0 0 ? 
I< 21:56 0:01 [kworker/u17:1-kcryptd/253:2] Signed-off-by: Michał Mirosław Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0481223b1deb..b8eec515a003 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2661,6 +2661,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; + const char *devname = dm_table_device_name(ti->table); int key_size; unsigned int align_mask; unsigned long long tmpll; @@ -2806,18 +2807,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + cc->io_queue = alloc_workqueue("kcryptd_io/%s", + WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + 1, devname); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + cc->crypt_queue = alloc_workqueue("kcryptd/%s", + WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + 1, devname); else - cc->crypt_queue = alloc_workqueue("kcryptd", + cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, - num_online_cpus()); + num_online_cpus(), devname); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; @@ -2826,7 +2831,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&cc->write_thread_lock); cc->write_tree = RB_ROOT; - cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write"); + cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write/%s", devname); if (IS_ERR(cc->write_thread)) { ret = PTR_ERR(cc->write_thread); cc->write_thread = NULL; From d857ad75edf3c0066fcd920746f9dc75382b3324 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Fri, 12 Oct 2018 20:24:25 +0200 Subject: [PATCH 10/13] dm raid: avoid bitmap with raid4/5/6 journal device With raid4/5/6, journal device and write intent bitmap are mutually exclusive. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c44925e4e481..e1dd1622a290 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2475,7 +2475,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) } /* Enable bitmap creation for RAID levels != 0 */ - mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096); + mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096); mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; if (!test_and_clear_bit(FirstUse, &rdev->flags)) { From 33c2865f8d011a2ca9f67124ddab9dc89382e9f1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 17 Oct 2018 18:05:07 +0900 Subject: [PATCH 11/13] dm zoned: fix metadata block ref counting Since the ref field of struct dmz_mblock is always used with the spinlock of struct dmz_metadata locked, there is no need to use an atomic_t type. Change the type of the ref field to an unsigne integer. 
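To illustrate the resulting pattern (a sketch only; the names below are placeholders, not dm-zoned code): since every access to the counter happens with the same spinlock held, a plain integer is sufficient and the atomic operations were only adding overhead:

  #include <linux/spinlock.h>
  #include <linux/types.h>

  /* Placeholder structures standing in for dmz_metadata/dmz_mblock. */
  struct demo_cache {
      spinlock_t lock;
  };

  struct demo_block {
      unsigned int ref;       /* only ever touched under demo_cache.lock */
  };

  static void demo_cache_init(struct demo_cache *c)
  {
      spin_lock_init(&c->lock);
  }

  static void demo_block_get(struct demo_cache *c, struct demo_block *b)
  {
      spin_lock(&c->lock);
      b->ref++;               /* serialized by the lock, no atomic needed */
      spin_unlock(&c->lock);
  }

  static bool demo_block_put(struct demo_cache *c, struct demo_block *b)
  {
      bool last;

      spin_lock(&c->lock);
      b->ref--;
      last = (b->ref == 0);   /* caller frees or re-lists the block if true */
      spin_unlock(&c->lock);
      return last;
  }

This mirrors the dmz_release_mblock()/dmz_get_mblock() hunks below, where every ref update already sits inside zmd->mblk_lock.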
Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 969954915566..67b71f6e3bda 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -99,7 +99,7 @@ struct dmz_mblock { struct rb_node node; struct list_head link; sector_t no; - atomic_t ref; + unsigned int ref; unsigned long state; struct page *page; void *data; @@ -296,7 +296,7 @@ static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd, RB_CLEAR_NODE(&mblk->node); INIT_LIST_HEAD(&mblk->link); - atomic_set(&mblk->ref, 0); + mblk->ref = 0; mblk->state = 0; mblk->no = mblk_no; mblk->data = page_address(mblk->page); @@ -397,7 +397,7 @@ static struct dmz_mblock *dmz_fetch_mblock(struct dmz_metadata *zmd, return NULL; spin_lock(&zmd->mblk_lock); - atomic_inc(&mblk->ref); + mblk->ref++; set_bit(DMZ_META_READING, &mblk->state); dmz_insert_mblock(zmd, mblk); spin_unlock(&zmd->mblk_lock); @@ -484,7 +484,8 @@ static void dmz_release_mblock(struct dmz_metadata *zmd, spin_lock(&zmd->mblk_lock); - if (atomic_dec_and_test(&mblk->ref)) { + mblk->ref--; + if (mblk->ref == 0) { if (test_bit(DMZ_META_ERROR, &mblk->state)) { rb_erase(&mblk->node, &zmd->mblk_rbtree); dmz_free_mblock(zmd, mblk); @@ -511,7 +512,8 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, mblk = dmz_lookup_mblock(zmd, mblk_no); if (mblk) { /* Cache hit: remove block from LRU list */ - if (atomic_inc_return(&mblk->ref) == 1 && + mblk->ref++; + if (mblk->ref == 1 && !test_bit(DMZ_META_DIRTY, &mblk->state)) list_del_init(&mblk->link); } @@ -753,7 +755,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) spin_lock(&zmd->mblk_lock); clear_bit(DMZ_META_DIRTY, &mblk->state); - if (atomic_read(&mblk->ref) == 0) + if (mblk->ref == 0) list_add_tail(&mblk->link, &zmd->mblk_lru_list); spin_unlock(&zmd->mblk_lock); } @@ -2308,7 +2310,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) mblk = list_first_entry(&zmd->mblk_dirty_list, struct dmz_mblock, link); dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)", - (u64)mblk->no, atomic_read(&mblk->ref)); + (u64)mblk->no, mblk->ref); list_del_init(&mblk->link); rb_erase(&mblk->node, &zmd->mblk_rbtree); dmz_free_mblock(zmd, mblk); @@ -2326,8 +2328,8 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) root = &zmd->mblk_rbtree; rbtree_postorder_for_each_entry_safe(mblk, next, root, node) { dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree", - (u64)mblk->no, atomic_read(&mblk->ref)); - atomic_set(&mblk->ref, 0); + (u64)mblk->no, mblk->ref); + mblk->ref = 0; dmz_free_mblock(zmd, mblk); } From 3d4e738311327bc4ba1d55fbe2f1da3de4a475f9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 17 Oct 2018 18:05:08 +0900 Subject: [PATCH 12/13] dm zoned: fix various dmz_get_mblock() issues dmz_fetch_mblock() called from dmz_get_mblock() has a race since the allocation of the new metadata block descriptor and its insertion in the cache rbtree with the READING state is not atomic. Two different contexts requesting the same block may end up each adding two different descriptors of the same block to the cache. Another problem for this function is that the BIO for processing the block read is allocated after the metadata block descriptor is inserted in the cache rbtree. 
If the BIO allocation fails, the metadata block descriptor is freed without first being removed from the rbtree. Fix the first problem by checking again if the requested block is not in the cache right before inserting the newly allocated descriptor, atomically under the mblk_lock spinlock. The second problem is fixed by simply allocating the BIO before inserting the new block in the cache. Finally, since dmz_fetch_mblock() also increments a block reference counter, rename the function to dmz_get_mblock_slow(). To be symmetric and clear, also rename dmz_lookup_mblock() to dmz_get_mblock_fast() and increment the block reference counter directly in that function rather than in dmz_get_mblock(). Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 66 +++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 67b71f6e3bda..fa68336560c3 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -339,10 +339,11 @@ static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) } /* - * Lookup a metadata block in the rbtree. + * Lookup a metadata block in the rbtree. If the block is found, increment + * its reference count. */ -static struct dmz_mblock *dmz_lookup_mblock(struct dmz_metadata *zmd, - sector_t mblk_no) +static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd, + sector_t mblk_no) { struct rb_root *root = &zmd->mblk_rbtree; struct rb_node *node = root->rb_node; @@ -350,8 +351,17 @@ static struct dmz_mblock *dmz_lookup_mblock(struct dmz_metadata *zmd, while (node) { mblk = container_of(node, struct dmz_mblock, node); - if (mblk->no == mblk_no) + if (mblk->no == mblk_no) { + /* + * If this is the first reference to the block, + * remove it from the LRU list. + */ + mblk->ref++; + if (mblk->ref == 1 && + !test_bit(DMZ_META_DIRTY, &mblk->state)) + list_del_init(&mblk->link); return mblk; + } node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right; } @@ -382,32 +392,47 @@ static void dmz_mblock_bio_end_io(struct bio *bio) } /* - * Read a metadata block from disk. + * Read an uncached metadata block from disk and add it to the cache. */ -static struct dmz_mblock *dmz_fetch_mblock(struct dmz_metadata *zmd, - sector_t mblk_no) +static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, + sector_t mblk_no) { - struct dmz_mblock *mblk; + struct dmz_mblock *mblk, *m; sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; struct bio *bio; - /* Get block and insert it */ + /* Get a new block and a BIO to read it */ mblk = dmz_alloc_mblock(zmd, mblk_no); if (!mblk) return NULL; - spin_lock(&zmd->mblk_lock); - mblk->ref++; - set_bit(DMZ_META_READING, &mblk->state); - dmz_insert_mblock(zmd, mblk); - spin_unlock(&zmd->mblk_lock); - bio = bio_alloc(GFP_NOIO, 1); if (!bio) { dmz_free_mblock(zmd, mblk); return NULL; } + spin_lock(&zmd->mblk_lock); + + /* + * Make sure that another context did not start reading + * the block already. 
+ */ + m = dmz_get_mblock_fast(zmd, mblk_no); + if (m) { + spin_unlock(&zmd->mblk_lock); + dmz_free_mblock(zmd, mblk); + bio_put(bio); + return m; + } + + mblk->ref++; + set_bit(DMZ_META_READING, &mblk->state); + dmz_insert_mblock(zmd, mblk); + + spin_unlock(&zmd->mblk_lock); + + /* Submit read BIO */ bio->bi_iter.bi_sector = dmz_blk2sect(block); bio_set_dev(bio, zmd->dev->bdev); bio->bi_private = mblk; @@ -509,19 +534,12 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, /* Check rbtree */ spin_lock(&zmd->mblk_lock); - mblk = dmz_lookup_mblock(zmd, mblk_no); - if (mblk) { - /* Cache hit: remove block from LRU list */ - mblk->ref++; - if (mblk->ref == 1 && - !test_bit(DMZ_META_DIRTY, &mblk->state)) - list_del_init(&mblk->link); - } + mblk = dmz_get_mblock_fast(zmd, mblk_no); spin_unlock(&zmd->mblk_lock); if (!mblk) { /* Cache miss: read the block from disk */ - mblk = dmz_fetch_mblock(zmd, mblk_no); + mblk = dmz_get_mblock_slow(zmd, mblk_no); if (!mblk) return ERR_PTR(-ENOMEM); } From da4ad3a23af3d7f357b24b33e9fec7531b59ee49 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 22 Oct 2018 10:59:52 -0400 Subject: [PATCH 13/13] dm writecache: remove disabled code in memory_entry() This dead branch was missed during review. It only makes memory_entry() more inefficient due to needless call to is_power_of_2(), etc. Reported-by: shenghui Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 5f1f80d424dd..2d50eec94cd7 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -350,10 +350,7 @@ static struct wc_memory_superblock *sb(struct dm_writecache *wc) static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) { - if (is_power_of_2(sizeof(struct wc_entry)) && 0) - return &sb(wc)->entries[e - wc->entries]; - else - return &sb(wc)->entries[e->index]; + return &sb(wc)->entries[e->index]; } static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)