From 4c7e309340ff85072e96f529582d159002c36734 Mon Sep 17 00:00:00 2001 From: Dennis Yang Date: Fri, 26 Jun 2015 15:25:48 +0100 Subject: [PATCH 01/11] dm btree remove: fix bug in redistribute3 redistribute3() shares entries out across 3 nodes. Some entries were being moved the wrong way, breaking the ordering. This manifested as a BUG() in dm-btree-remove.c:shift() when entries were removed from the btree. For additional context see: https://www.redhat.com/archives/dm-devel/2015-May/msg00113.html Signed-off-by: Dennis Yang Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/persistent-data/dm-btree-remove.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index e04cfd2d60ef..9836c0ae897c 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -309,8 +309,8 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, if (s < 0 && nr_center < -s) { /* not enough in central node */ - shift(left, center, nr_center); - s = nr_center - target; + shift(left, center, -nr_center); + s += nr_center; shift(left, right, s); nr_right += s; } else @@ -323,7 +323,7 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, if (s > 0 && nr_center < s) { /* not enough in central node */ shift(center, right, nr_center); - s = target - nr_center; + s -= nr_center; shift(left, right, s); nr_left -= s; } else From a822c83e47d97cdef38c4352e1ef62d9f46cfe98 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 3 Jul 2015 10:22:42 +0100 Subject: [PATCH 02/11] dm thin: allocate the cell_sort_array dynamically Given the pool's cell_sort_array holds 8192 pointers it triggers an order 5 allocation via kmalloc. This order 5 allocation is prone to failure as system memory gets more fragmented over time. Fix this by allocating the cell_sort_array using vmalloc. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-thin.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c33f61a4cc28..8f015d924a24 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -268,7 +269,7 @@ struct pool { process_mapping_fn process_prepared_mapping; process_mapping_fn process_prepared_discard; - struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE]; + struct dm_bio_prison_cell **cell_sort_array; }; static enum pool_mode get_pool_mode(struct pool *pool); @@ -2777,6 +2778,7 @@ static void __pool_destroy(struct pool *pool) { __pool_table_remove(pool); + vfree(pool->cell_sort_array); if (dm_pool_metadata_close(pool->pmd) < 0) DMWARN("%s: dm_pool_metadata_close() failed.", __func__); @@ -2889,6 +2891,13 @@ static struct pool *pool_create(struct mapped_device *pool_md, goto bad_mapping_pool; } + pool->cell_sort_array = vmalloc(sizeof(*pool->cell_sort_array) * CELL_SORT_ARRAY_SIZE); + if (!pool->cell_sort_array) { + *error = "Error allocating cell sort array"; + err_p = ERR_PTR(-ENOMEM); + goto bad_sort_array; + } + pool->ref_count = 1; pool->last_commit_jiffies = jiffies; pool->pool_md = pool_md; @@ -2897,6 +2906,8 @@ static struct pool *pool_create(struct mapped_device *pool_md, return pool; +bad_sort_array: + mempool_destroy(pool->mapping_pool); bad_mapping_pool: dm_deferred_set_destroy(pool->all_io_ds); bad_all_io_ds: From 1c7518794a3647eb345d59ee52844e8a40405198 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 3 Jul 2015 14:51:32 +0100 Subject: [PATCH 03/11] dm btree: silence lockdep lock inversion in dm_btree_del() Allocate memory using GFP_NOIO when deleting a btree. dm_btree_del() can be called via an ioctl and we don't want to recurse into the FS or block layer. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/persistent-data/dm-btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 200ac12a1d40..fdd3793e22f9 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -255,7 +255,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) int r; struct del_stack *s; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kmalloc(sizeof(*s), GFP_NOIO); if (!s) return -ENOMEM; s->info = info; From 621739b00e16ca2d80411dc9b111cb15b91f3ba9 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 8 Jul 2015 16:08:24 -0400 Subject: [PATCH 04/11] Revert "dm: only run the queue on completion if congested or no requests pending" This reverts commit 9a0e609e3fd8a95c96629b9fbde6b8c5b9a1456a. (Resolved a conflict during revert due to commit bfebd1cdb4 that came after) This revert is motivated by a couple failure reports on request-based DM multipath testbeds: 1) Netapp reported that their multipath fault injection test under heavy IO load can stall longer than 300 seconds. 2) IBM reported elevated lock contention in their testbed (likely due to increased back pressure due to IO not being dispatched as quickly): https://www.redhat.com/archives/dm-devel/2015-July/msg00057.html Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 4.1+ --- drivers/md/dm.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f331d888e7f5..de703778d39f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1067,13 +1067,10 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { - int nr_requests_pending; - atomic_dec(&md->pending[rw]); /* nudge anyone waiting on suspend queue */ - nr_requests_pending = md_in_flight(md); - if (!nr_requests_pending) + if (!md_in_flight(md)) wake_up(&md->wait); /* @@ -1085,8 +1082,7 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) if (run_queue) { if (md->queue->mq_ops) blk_mq_run_hw_queues(md->queue, true); - else if (!nr_requests_pending || - (nr_requests_pending >= md->queue->nr_congestion_on)) + else blk_run_queue_async(md->queue); } From b06075a98d595b761881fb2d7b8a557ea2f8b7ac Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 10 Jul 2015 17:21:43 -0400 Subject: [PATCH 05/11] dm: fix use after free crash due to incorrect cleanup sequence Linux 4.2-rc1 Commit 0f20972f7bf6 ("dm: factor out a common cleanup_mapped_device()") moved a common cleanup code to a separate function. Unfortunately, that commit incorrectly changed the order of cleanup, so that it destroys the mapped_device's srcu structure 'io_barrier' before destroying its workqueue. The function that is executed on the workqueue (dm_wq_work) uses the srcu structure, thus it may use it after being freed. That results in a crash in the LVM test suite's mirror-vgreduce-removemissing.sh test. Signed-off-by: Mikulas Patocka Fixes: 0f20972f7bf6 ("dm: factor out a common cleanup_mapped_device()") Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index de703778d39f..ab37ae114e94 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2277,8 +2277,6 @@ static void dm_init_old_md_queue(struct mapped_device *md) static void cleanup_mapped_device(struct mapped_device *md) { - cleanup_srcu_struct(&md->io_barrier); - if (md->wq) destroy_workqueue(md->wq); if (md->kworker_task) @@ -2290,6 +2288,8 @@ static void cleanup_mapped_device(struct mapped_device *md) if (md->bs) bioset_free(md->bs); + cleanup_srcu_struct(&md->io_barrier); + if (md->disk) { spin_lock(&_minor_lock); md->disk->private_data = NULL; From bcc696fac11fe13e59dda5aaec6322a25b7c9a3a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 15 Jul 2015 16:52:04 -0400 Subject: [PATCH 06/11] dm thin: stay in out-of-data-space mode once no_space_timeout expires This fixes an issue where running out of data space would cause the thin-pool's metadata to become read-only. There was no reason to make metadata read-only -- calling set_pool_mode() with PM_READ_ONLY was a misguided way to error all queued and future write IOs. We can accomplish the same by degrading from PM_OUT_OF_DATA_SPACE to PM_OUT_OF_DATA_SPACE with error_if_no_space enabled. Otherwise, the use of PM_READ_ONLY could cause a race where commit() was started before the PM_READ_ONLY transition but dm_pool_commit_metadata() would go on to fail because the block manager had transitioned to read-only. The return of -EPERM from dm_pool_commit_metadata(), due to attempting to commit while in read-only mode, caused the thin-pool to set 'needs_check' because a metadata_operation_failed(). This needless cascade of failures makes life for users more difficult than needed. Reported-by: Vivek Goyal Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 8f015d924a24..34e79531ea3f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2282,18 +2282,23 @@ static void do_waker(struct work_struct *ws) queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); } +static void notify_of_pool_mode_change_to_oods(struct pool *pool); + /* * We're holding onto IO to allow userland time to react. After the * timeout either the pool will have been resized (and thus back in - * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO. + * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space. */ static void do_no_space_timeout(struct work_struct *ws) { struct pool *pool = container_of(to_delayed_work(ws), struct pool, no_space_timeout); - if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) - set_pool_mode(pool, PM_READ_ONLY); + if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) { + pool->pf.error_if_no_space = true; + notify_of_pool_mode_change_to_oods(pool); + error_retry_list(pool); + } } /*----------------------------------------------------------------*/ @@ -2371,6 +2376,14 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) dm_device_name(pool->pool_md), new_mode); } +static void notify_of_pool_mode_change_to_oods(struct pool *pool) +{ + if (!pool->pf.error_if_no_space) + notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)"); + else + notify_of_pool_mode_change(pool, "out-of-data-space (error IO)"); +} + static bool passdown_enabled(struct pool_c *pt) { return pt->adjusted_pf.discard_passdown; @@ -2455,7 +2468,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) * frequently seeing this mode. */ if (old_mode != new_mode) - notify_of_pool_mode_change(pool, "out-of-data-space"); + notify_of_pool_mode_change_to_oods(pool); pool->process_bio = process_bio_read_only; pool->process_discard = process_discard_bio; pool->process_cell = process_cell_read_only; From e4c78e210daea17f82f12037005df225e22189b9 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 15 Jul 2015 11:40:24 -0400 Subject: [PATCH 07/11] dm thin: display 'needs_check' in status if it is set There is currently no way to see that the needs_check flag has been set in the metadata. Display 'needs_check' in the thin-pool status if it is set in the thinp metadata. Also, update thinp documentation. Signed-off-by: Mike Snitzer --- Documentation/device-mapper/thin-provisioning.txt | 9 ++++++++- drivers/md/dm-thin.c | 10 ++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index 4f67578b2954..1699a55b7b70 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -296,7 +296,7 @@ ii) Status underlying device. When this is enabled when loading the table, it can get disabled if the underlying device doesn't support it. - ro|rw + ro|rw|out_of_data_space If the pool encounters certain types of device failures it will drop into a read-only metadata mode in which no changes to the pool metadata (like allocating new blocks) are permitted. @@ -314,6 +314,13 @@ ii) Status module parameter can be used to change this timeout -- it defaults to 60 seconds but may be disabled using a value of 0. + needs_check + A metadata operation has failed, resulting in the needs_check + flag being set in the metadata's superblock. The metadata + device must be deactivated and checked/repaired before the + thin-pool can be made fully operational again. '-' indicates + needs_check is not set. + iii) Messages create_thin diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 34e79531ea3f..1c50c580215c 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3738,6 +3738,7 @@ static void emit_flags(struct pool_features *pf, char *result, * Status line is: * / * / + * */ static void pool_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -3839,6 +3840,11 @@ static void pool_status(struct dm_target *ti, status_type_t type, else DMEMIT("queue_if_no_space "); + if (dm_pool_metadata_needs_check(pool->pmd)) + DMEMIT("needs_check "); + else + DMEMIT("- "); + break; case STATUSTYPE_TABLE: @@ -3942,7 +3948,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 15, 0}, + .version = {1, 16, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4329,7 +4335,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 15, 0}, + .version = {1, 16, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, From 255eac20054e90ac7a52b3e179b61de1168a8fe6 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 15 Jul 2015 11:42:59 -0400 Subject: [PATCH 08/11] dm cache: display 'needs_check' in status if it is set There is currently no way to see that the needs_check flag has been set in the metadata. Display 'needs_check' in the cache status if it is set in the cache metadata. Also, update cache documentation. Signed-off-by: Mike Snitzer --- Documentation/device-mapper/cache.txt | 6 ++++++ drivers/md/dm-cache-target.c | 9 +++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index 82960cffbad3..785eab87aa71 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -258,6 +258,12 @@ cache metadata mode : ro if read-only, rw if read-write no further I/O will be permitted and the status will just contain the string 'Fail'. The userspace recovery tools should then be used. +needs_check : 'needs_check' if set, '-' if not set + A metadata operation has failed, resulting in the needs_check + flag being set in the metadata's superblock. The metadata + device must be deactivated and checked/repaired before the + cache can be made fully operational again. '-' indicates + needs_check is not set. Messages -------- diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1b4e1756b169..4afa34d7b8ad 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3496,7 +3496,7 @@ static void cache_resume(struct dm_target *ti) * <#demotions> <#promotions> <#dirty> * <#features> * * <#core args> - * <#policy args> * + * <#policy args> * */ static void cache_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -3582,6 +3582,11 @@ static void cache_status(struct dm_target *ti, status_type_t type, else DMEMIT("rw "); + if (dm_cache_metadata_needs_check(cache->cmd)) + DMEMIT("needs_check "); + else + DMEMIT("- "); + break; case STATUSTYPE_TABLE: @@ -3820,7 +3825,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, From 386cb7cdeeef97e0bf082a8d6bbfc07a2ccce07b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 16 Jul 2015 21:16:31 -0400 Subject: [PATCH 09/11] dm cache: do not wake_worker() in free_migration() All methods that queue work call wake_worker() as you'd expect. E.g. cell_defer, defer_bio, quiesce_migration (which is called by writeback, promote, demote_then_promote, invalidate, discard, etc). Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 4afa34d7b8ad..c8a160b37412 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -424,7 +424,6 @@ static void free_migration(struct dm_cache_migration *mg) wake_up(&cache->migration_wait); mempool_free(mg, cache->migration_pool); - wake_worker(cache); } static int prealloc_data_structs(struct cache *cache, struct prealloc *p) From e782eff591bca2d96bac30ab5d1cfa4ccd3b0f86 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 16 Jul 2015 21:26:10 -0400 Subject: [PATCH 10/11] dm cache: avoid preallocation if no work in writeback_some_dirty_blocks() Refactor writeback_some_dirty_blocks() to avoid prealloc_data_structs() if the policy doesn't have any dirty blocks ready for writeback. Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index c8a160b37412..408dd276d6c9 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2061,7 +2061,6 @@ static void process_deferred_writethrough_bios(struct cache *cache) static void writeback_some_dirty_blocks(struct cache *cache) { - int r = 0; dm_oblock_t oblock; dm_cblock_t cblock; struct prealloc structs; @@ -2071,15 +2070,11 @@ static void writeback_some_dirty_blocks(struct cache *cache) memset(&structs, 0, sizeof(structs)); while (spare_migration_bandwidth(cache)) { - if (prealloc_data_structs(cache, &structs)) - break; + if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) + break; /* no work to do */ - r = policy_writeback_work(cache->policy, &oblock, &cblock, busy); - if (r) - break; - - r = get_cell(cache, oblock, &structs, &old_ocell); - if (r) { + if (prealloc_data_structs(cache, &structs) || + get_cell(cache, oblock, &structs, &old_ocell)) { policy_set_dirty(cache->policy, oblock); break; } From 665022d72f9b5762f21b5ea02fa0503d04802849 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 16 Jul 2015 21:48:55 -0400 Subject: [PATCH 11/11] dm cache: avoid calls to prealloc_free_structs() if possible If no work was performed then prealloc_data_structs() wasn't ever called so there isn't any need to call prealloc_free_structs(). Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 408dd276d6c9..b680da5d7b93 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1946,6 +1946,7 @@ static int commit_if_needed(struct cache *cache) static void process_deferred_bios(struct cache *cache) { + bool prealloc_used = false; unsigned long flags; struct bio_list bios; struct bio *bio; @@ -1980,13 +1981,16 @@ static void process_deferred_bios(struct cache *cache) process_discard_bio(cache, &structs, bio); else process_bio(cache, &structs, bio); + prealloc_used = true; } - prealloc_free_structs(cache, &structs); + if (prealloc_used) + prealloc_free_structs(cache, &structs); } static void process_deferred_cells(struct cache *cache) { + bool prealloc_used = false; unsigned long flags; struct dm_bio_prison_cell *cell, *tmp; struct list_head cells; @@ -2014,9 +2018,11 @@ static void process_deferred_cells(struct cache *cache) } process_cell(cache, &structs, cell); + prealloc_used = true; } - prealloc_free_structs(cache, &structs); + if (prealloc_used) + prealloc_free_structs(cache, &structs); } static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) @@ -2061,6 +2067,7 @@ static void process_deferred_writethrough_bios(struct cache *cache) static void writeback_some_dirty_blocks(struct cache *cache) { + bool prealloc_used = false; dm_oblock_t oblock; dm_cblock_t cblock; struct prealloc structs; @@ -2080,9 +2087,11 @@ static void writeback_some_dirty_blocks(struct cache *cache) } writeback(cache, &structs, oblock, cblock, old_ocell); + prealloc_used = true; } - prealloc_free_structs(cache, &structs); + if (prealloc_used) + prealloc_free_structs(cache, &structs); } /*----------------------------------------------------------------