From c13b5487d9dec7189390c76c11358584e327870a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Apr 2019 18:33:34 +0200 Subject: [PATCH 01/42] dm crypt: fix endianness annotations around org_sector_of_dmreq The sector used here is a little endian value, so use the right type for it. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index dd6565798778..692cddf3fe2a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1034,11 +1034,11 @@ static u8 *org_iv_of_dmreq(struct crypt_config *cc, return iv_of_dmreq(cc, dmreq) + cc->iv_size; } -static uint64_t *org_sector_of_dmreq(struct crypt_config *cc, +static __le64 *org_sector_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size; - return (uint64_t*) ptr; + return (__le64 *) ptr; } static unsigned int *org_tag_of_dmreq(struct crypt_config *cc, @@ -1074,7 +1074,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct dm_crypt_request *dmreq; u8 *iv, *org_iv, *tag_iv, *tag; - uint64_t *sector; + __le64 *sector; int r = 0; BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); @@ -1169,7 +1169,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, struct scatterlist *sg_in, *sg_out; struct dm_crypt_request *dmreq; u8 *iv, *org_iv, *tag_iv; - uint64_t *sector; + __le64 *sector; int r = 0; /* Reject unexpected unaligned bio. */ From a3839bc6351d79cf85790f302238dfc84382429f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Apr 2019 11:12:31 +0300 Subject: [PATCH 02/42] dm zoned: Silence a static checker warning My static checker complains about this line from dmz_get_zoned_device() aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1); The problem is that "aligned_capacity" and "dev->capacity" are sector_t type (which is a u64 under most configs) but blk_queue_zone_sectors(q) returns a u32 so the higher 32 bits in aligned_capacity are cleared to zero. This patch adds a cast to address the issue. Fixes: 114e025968b5 ("dm zoned: ignore last smaller runt zone") Signed-off-by: Dan Carpenter Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-target.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 8865c1709e16..51d029bbb740 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -643,7 +643,8 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) q = bdev_get_queue(dev->bdev); dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; - aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1); + aligned_capacity = dev->capacity & + ~((sector_t)blk_queue_zone_sectors(q) - 1); if (ti->begin || ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) { ti->error = "Partial mapping not supported"; From 7aedf75ff740a98f3683439449cd91c8662d03b2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 18 Apr 2019 18:03:07 +0900 Subject: [PATCH 03/42] dm zoned: Fix zone report handling The function blkdev_report_zones() returns success even if no zone information is reported (empty report). 
Empty zone reports can only happen if the report start sector passed exceeds the device capacity. The conditions for this to happen are either a bug in the caller code, or, a change in the device that forced the low level driver to change the device capacity to a value that is lower than the report start sector. This situation includes a failed disk revalidation resulting in the disk capacity being changed to 0. If this change happens while dm-zoned is in its initialization phase executing dmz_init_zones(), this function may enter an infinite loop and hang the system. To avoid this, add a check to disallow empty zone reports and bail out early. Also fix the function dmz_update_zone() to make sure that the report for the requested zone was correctly obtained. Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Shaun Tancheff Signed-off-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index fa68336560c3..d8334cd45d7c 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1169,6 +1169,9 @@ static int dmz_init_zones(struct dmz_metadata *zmd) goto out; } + if (!nr_blkz) + break; + /* Process report */ for (i = 0; i < nr_blkz; i++) { ret = dmz_init_zone(zmd, zone, &blkz[i]); @@ -1204,6 +1207,8 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) /* Get zone information from disk */ ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), &blkz, &nr_blkz, GFP_NOIO); + if (!nr_blkz) + ret = -EIO; if (ret) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); From e28adc3bf34e434b30e8d063df4823ba0f3e0529 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Wed, 17 Apr 2019 17:19:18 +0300 Subject: [PATCH 04/42] dm cache metadata: Fix loading discard bitset Add missing dm_bitset_cursor_next() to properly advance the bitset cursor. Otherwise, the discarded state of all blocks is set according to the discarded state of the first block. Fixes: ae4a46a1f6 ("dm cache metadata: use bitset cursor api to load discard bitset") Cc: stable@vger.kernel.org Signed-off-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-metadata.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 6fc93834da44..151aa95775be 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1167,11 +1167,18 @@ static int __load_discards(struct dm_cache_metadata *cmd, if (r) return r; - for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + for (b = 0; ; b++) { r = fn(context, cmd->discard_block_size, to_dblock(b), dm_bitset_cursor_get_value(&c)); if (r) break; + + if (b >= (from_dblock(cmd->discard_nr_blocks) - 1)) + break; + + r = dm_bitset_cursor_next(&c); + if (r) + break; } dm_bitset_cursor_end(&c); From ae325dcd1905925557dfdd03ae4b97e7ac6cd6a6 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Sun, 17 Mar 2019 14:22:53 +0200 Subject: [PATCH 05/42] list: Don't use WRITE_ONCE() in hlist_add_behind() Commit 1c97be677f72b3 ("list: Use WRITE_ONCE() when adding to lists and hlists") introduced the use of WRITE_ONCE() to atomically write the list head's ->next pointer. 
hlist_add_behind() doesn't touch the hlist head's ->first pointer so there is no reason to use WRITE_ONCE() in this case. Co-developed-by: Ilias Tsitsimpis Signed-off-by: Nikos Tsironis Reviewed-by: Paul E. McKenney Signed-off-by: Mike Snitzer --- include/linux/list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/list.h b/include/linux/list.h index 58aa3adf94e6..e137b98cd31a 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -754,7 +754,7 @@ static inline void hlist_add_behind(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; - WRITE_ONCE(prev->next, n); + prev->next = n; n->pprev = &prev->next; if (n->next) From 34191ae816b04518943fb073207d259881a54315 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Sun, 17 Mar 2019 14:22:54 +0200 Subject: [PATCH 06/42] list_bl: Add hlist_bl_add_before/behind helpers Add hlist_bl_add_before/behind helpers to add an element before/after an existing element in a bl_list. Co-developed-by: Ilias Tsitsimpis Signed-off-by: Nikos Tsironis Reviewed-by: Paul E. McKenney Signed-off-by: Mike Snitzer --- include/linux/list_bl.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 3fc2cc57ba1b..ae1b541446c9 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -86,6 +86,32 @@ static inline void hlist_bl_add_head(struct hlist_bl_node *n, hlist_bl_set_first(h, n); } +static inline void hlist_bl_add_before(struct hlist_bl_node *n, + struct hlist_bl_node *next) +{ + struct hlist_bl_node **pprev = next->pprev; + + n->pprev = pprev; + n->next = next; + next->pprev = &n->next; + + /* pprev may be `first`, so be careful not to lose the lock bit */ + WRITE_ONCE(*pprev, + (struct hlist_bl_node *) + ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK))); +} + +static inline void hlist_bl_add_behind(struct hlist_bl_node *n, + struct hlist_bl_node *prev) +{ + n->next = prev->next; + n->pprev = &prev->next; + prev->next = n; + + if (n->next) + n->next->pprev = &n->next; +} + static inline void __hlist_bl_del(struct hlist_bl_node *n) { struct hlist_bl_node *next = n->next; From 65fc7c37047797a0a7b80d0ad2c063deda569337 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Sun, 17 Mar 2019 14:22:55 +0200 Subject: [PATCH 07/42] dm snapshot: Don't sleep holding the snapshot lock When completing a pending exception, pending_complete() waits for all conflicting reads to drain, before inserting the final, completed exception. Conflicting reads are snapshot reads redirected to the origin, because the relevant chunk is not remapped to the COW device the moment we receive the read. The completed exception must be inserted into the exception table after all conflicting reads drain to ensure snapshot reads don't return corrupted data. This is required because inserting the completed exception into the exception table signals that the relevant chunk is remapped and both origin writes and snapshot merging will now overwrite the chunk in origin. This wait is done holding the snapshot lock to ensure that pending_complete() doesn't starve if new snapshot reads keep coming for this chunk. In preparation for the next commit, where we use a spinlock instead of a mutex to protect the exception tables, we remove the need for holding the lock while waiting for conflicting reads to drain. We achieve this in two steps: 1. 
pending_complete() inserts the completed exception before waiting for conflicting reads to drain and removes the pending exception after all conflicting reads drain. This ensures that new snapshot reads will be redirected to the COW device, instead of the origin, and thus pending_complete() will not starve. Moreover, we use the existence of both a completed and a pending exception to signify that the COW is done but there are conflicting reads in flight. 2. In __origin_write() we check first if there is a pending exception and then if there is a completed exception. If there is a pending exception any submitted BIO is delayed on the pe->origin_bios list and DM_MAPIO_SUBMITTED is returned. This ensures that neither writes to the origin nor snapshot merging can overwrite the origin chunk, until all conflicting reads drain, and thus snapshot reads will not return corrupted data. Summarizing, we now have the following possible combinations of pending and completed exceptions for a chunk, along with their meaning: A. No exceptions exist: The chunk has not been remapped yet. B. Only a pending exception exists: The chunk is currently being copied to the COW device. C. Both a pending and a completed exception exist: COW for this chunk has completed but there are snapshot reads in flight which had been redirected to the origin before the chunk was remapped. D. Only the completed exception exists: COW has been completed and there are no conflicting reads in flight. Co-developed-by: Ilias Tsitsimpis Signed-off-by: Nikos Tsironis Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 108 +++++++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a168963b757d..051e4d076323 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1501,16 +1501,24 @@ static void pending_complete(void *context, int success) goto out; } - /* Check for conflicting reads */ - __check_for_conflicting_io(s, pe->e.old_chunk); - /* - * Add a proper exception, and remove the - * in-flight exception from the list. + * Add a proper exception. After inserting the completed exception all + * subsequent snapshot reads to this chunk will be redirected to the + * COW device. This ensures that we do not starve. Moreover, as long + * as the pending exception exists, neither origin writes nor snapshot + * merging can overwrite the chunk in origin. */ dm_insert_exception(&s->complete, e); + /* Wait for conflicting reads to drain */ + if (__chunk_is_tracked(s, pe->e.old_chunk)) { + mutex_unlock(&s->lock); + __check_for_conflicting_io(s, pe->e.old_chunk); + mutex_lock(&s->lock); + } + out: + /* Remove the in-flight exception from the list */ dm_remove_exception(&pe->e); snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); @@ -1659,6 +1667,34 @@ __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) return container_of(e, struct dm_snap_pending_exception, e); } +/* + * Inserts a pending exception into the pending table. + * + * NOTE: a write lock must be held on snap->lock before calling + * this. 
+ */ +static struct dm_snap_pending_exception * +__insert_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) +{ + pe->e.old_chunk = chunk; + bio_list_init(&pe->origin_bios); + bio_list_init(&pe->snapshot_bios); + pe->started = 0; + pe->full_bio = NULL; + + if (s->store->type->prepare_exception(s->store, &pe->e)) { + free_pending_exception(pe); + return NULL; + } + + pe->exception_sequence = s->exception_start_sequence++; + + dm_insert_exception(&s->pending, &pe->e); + + return pe; +} + /* * Looks to see if this snapshot already has a pending exception * for this chunk, otherwise it allocates a new one and inserts @@ -1679,22 +1715,7 @@ __find_pending_exception(struct dm_snapshot *s, return pe2; } - pe->e.old_chunk = chunk; - bio_list_init(&pe->origin_bios); - bio_list_init(&pe->snapshot_bios); - pe->started = 0; - pe->full_bio = NULL; - - if (s->store->type->prepare_exception(s->store, &pe->e)) { - free_pending_exception(pe); - return NULL; - } - - pe->exception_sequence = s->exception_start_sequence++; - - dm_insert_exception(&s->pending, &pe->e); - - return pe; + return __insert_pending_exception(s, pe, chunk); } static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, @@ -2107,7 +2128,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, int r = DM_MAPIO_REMAPPED; struct dm_snapshot *snap; struct dm_exception *e; - struct dm_snap_pending_exception *pe; + struct dm_snap_pending_exception *pe, *pe2; struct dm_snap_pending_exception *pe_to_start_now = NULL; struct dm_snap_pending_exception *pe_to_start_last = NULL; chunk_t chunk; @@ -2137,17 +2158,17 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, */ chunk = sector_to_chunk(snap->store, sector); - /* - * Check exception table to see if block - * is already remapped in this snapshot - * and trigger an exception if not. - */ - e = dm_lookup_exception(&snap->complete, chunk); - if (e) - goto next_snapshot; - pe = __lookup_pending_exception(snap, chunk); if (!pe) { + /* + * Check exception table to see if block is already + * remapped in this snapshot and trigger an exception + * if not. + */ + e = dm_lookup_exception(&snap->complete, chunk); + if (e) + goto next_snapshot; + mutex_unlock(&snap->lock); pe = alloc_pending_exception(snap); mutex_lock(&snap->lock); @@ -2157,16 +2178,23 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, goto next_snapshot; } - e = dm_lookup_exception(&snap->complete, chunk); - if (e) { - free_pending_exception(pe); - goto next_snapshot; - } + pe2 = __lookup_pending_exception(snap, chunk); - pe = __find_pending_exception(snap, pe, chunk); - if (!pe) { - __invalidate_snapshot(snap, -ENOMEM); - goto next_snapshot; + if (!pe2) { + e = dm_lookup_exception(&snap->complete, chunk); + if (e) { + free_pending_exception(pe); + goto next_snapshot; + } + + pe = __insert_pending_exception(snap, pe, chunk); + if (!pe) { + __invalidate_snapshot(snap, -ENOMEM); + goto next_snapshot; + } + } else { + free_pending_exception(pe); + pe = pe2; } } From 4ad8d880b6c4497e365fb6fd16bab52e9974a3f6 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Sun, 17 Mar 2019 14:22:56 +0200 Subject: [PATCH 08/42] dm snapshot: Replace mutex with rw semaphore dm-snapshot uses a single mutex to serialize every access to the snapshot state. This includes all accesses to the complete and pending exception tables, which occur at every origin write, every snapshot read/write and every exception completion. 
The lock statistics indicate that this mutex is a bottleneck (average wait time ~480 usecs for 8 processes doing random 4K writes to the origin device) preventing dm-snapshot to scale as the number of threads doing IO increases. The major contention points are __origin_write()/snapshot_map() and pending_complete(), i.e., the submission and completion of pending exceptions. Replace this mutex with a rw semaphore. We essentially revert commit ae1093be5a0ef9 ("dm snapshot: use mutex instead of rw_semaphore") and together with the next two patches we substitute the single mutex with a fine-grained locking scheme, where we use a read-write semaphore to protect the mostly read fields of the snapshot structure, e.g., valid, active, etc., and per-bucket bit spinlocks to protect accesses to the complete and pending exception tables. Co-developed-by: Ilias Tsitsimpis Signed-off-by: Nikos Tsironis Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 88 ++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 051e4d076323..5a67f408876e 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -48,7 +48,7 @@ struct dm_exception_table { }; struct dm_snapshot { - struct mutex lock; + struct rw_semaphore lock; struct dm_dev *origin; struct dm_dev *cow; @@ -457,9 +457,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) continue; - mutex_lock(&s->lock); + down_read(&s->lock); active = s->active; - mutex_unlock(&s->lock); + up_read(&s->lock); if (active) { if (snap_src) @@ -927,7 +927,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) int r; chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; - mutex_lock(&s->lock); + down_write(&s->lock); /* * Process chunks (and associated exceptions) in reverse order @@ -942,7 +942,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) b = __release_queued_bios_after_merge(s); out: - mutex_unlock(&s->lock); + up_write(&s->lock); if (b) flush_bios(b); @@ -1001,9 +1001,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) if (linear_chunks < 0) { DMERR("Read error in exception store: " "shutting down merge"); - mutex_lock(&s->lock); + down_write(&s->lock); s->merge_failed = 1; - mutex_unlock(&s->lock); + up_write(&s->lock); } goto shut; } @@ -1044,10 +1044,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) previous_count = read_pending_exceptions_done_count(); } - mutex_lock(&s->lock); + down_write(&s->lock); s->first_merging_chunk = old_chunk; s->num_merging_chunks = linear_chunks; - mutex_unlock(&s->lock); + up_write(&s->lock); /* Wait until writes to all 'linear_chunks' drain */ for (i = 0; i < linear_chunks; i++) @@ -1089,10 +1089,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) return; shut: - mutex_lock(&s->lock); + down_write(&s->lock); s->merge_failed = 1; b = __release_queued_bios_after_merge(s); - mutex_unlock(&s->lock); + up_write(&s->lock); error_bios(b); merge_shutdown(s); @@ -1191,7 +1191,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->exception_start_sequence = 0; s->exception_complete_sequence = 0; s->out_of_order_tree = RB_ROOT; - mutex_init(&s->lock); + init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); s->state_bits = 0; @@ -1357,9 +1357,9 @@ static void snapshot_dtr(struct 
dm_target *ti) /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest && (s == snap_src)) { - mutex_lock(&snap_dest->lock); + down_write(&snap_dest->lock); snap_dest->valid = 0; - mutex_unlock(&snap_dest->lock); + up_write(&snap_dest->lock); DMERR("Cancelling snapshot handover."); } up_read(&_origins_lock); @@ -1390,8 +1390,6 @@ static void snapshot_dtr(struct dm_target *ti) dm_exception_store_destroy(s->store); - mutex_destroy(&s->lock); - dm_put_device(ti, s->cow); dm_put_device(ti, s->origin); @@ -1479,7 +1477,7 @@ static void pending_complete(void *context, int success) if (!success) { /* Read/write error - snapshot is unusable */ - mutex_lock(&s->lock); + down_write(&s->lock); __invalidate_snapshot(s, -EIO); error = 1; goto out; @@ -1487,14 +1485,14 @@ static void pending_complete(void *context, int success) e = alloc_completed_exception(GFP_NOIO); if (!e) { - mutex_lock(&s->lock); + down_write(&s->lock); __invalidate_snapshot(s, -ENOMEM); error = 1; goto out; } *e = pe->e; - mutex_lock(&s->lock); + down_write(&s->lock); if (!s->valid) { free_completed_exception(e); error = 1; @@ -1512,9 +1510,9 @@ static void pending_complete(void *context, int success) /* Wait for conflicting reads to drain */ if (__chunk_is_tracked(s, pe->e.old_chunk)) { - mutex_unlock(&s->lock); + up_write(&s->lock); __check_for_conflicting_io(s, pe->e.old_chunk); - mutex_lock(&s->lock); + down_write(&s->lock); } out: @@ -1527,7 +1525,7 @@ out: full_bio->bi_end_io = pe->full_bio_end_io; increment_pending_exceptions_done_count(); - mutex_unlock(&s->lock); + up_write(&s->lock); /* Submit any pending write bios */ if (error) { @@ -1750,7 +1748,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid) return DM_MAPIO_KILL; - mutex_lock(&s->lock); + down_write(&s->lock); if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_data_dir(bio) == WRITE)) { @@ -1773,9 +1771,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == WRITE) { pe = __lookup_pending_exception(s, chunk); if (!pe) { - mutex_unlock(&s->lock); + up_write(&s->lock); pe = alloc_pending_exception(s); - mutex_lock(&s->lock); + down_write(&s->lock); if (!s->valid || s->snapshot_overflowed) { free_pending_exception(pe); @@ -1810,7 +1808,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio->bi_iter.bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { pe->started = 1; - mutex_unlock(&s->lock); + up_write(&s->lock); start_full_bio(pe, bio); goto out; } @@ -1820,7 +1818,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; - mutex_unlock(&s->lock); + up_write(&s->lock); start_copy(pe); goto out; } @@ -1830,7 +1828,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) } out_unlock: - mutex_unlock(&s->lock); + up_write(&s->lock); out: return r; } @@ -1866,7 +1864,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); - mutex_lock(&s->lock); + down_write(&s->lock); /* Full merging snapshots are redirected to the origin */ if (!s->valid) @@ -1897,12 +1895,12 @@ redirect_to_origin: bio_set_dev(bio, s->origin->bdev); if (bio_data_dir(bio) == WRITE) { - mutex_unlock(&s->lock); + up_write(&s->lock); return do_origin(s->origin, bio); } out_unlock: - mutex_unlock(&s->lock); + up_write(&s->lock); return r; } @@ 
-1934,7 +1932,7 @@ static int snapshot_preresume(struct dm_target *ti) down_read(&_origins_lock); (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - mutex_lock(&snap_src->lock); + down_read(&snap_src->lock); if (s == snap_src) { DMERR("Unable to resume snapshot source until " "handover completes."); @@ -1944,7 +1942,7 @@ static int snapshot_preresume(struct dm_target *ti) "source is suspended."); r = -EINVAL; } - mutex_unlock(&snap_src->lock); + up_read(&snap_src->lock); } up_read(&_origins_lock); @@ -1990,11 +1988,11 @@ static void snapshot_resume(struct dm_target *ti) (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - mutex_lock(&snap_src->lock); - mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); + down_write(&snap_src->lock); + down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); __handover_exceptions(snap_src, snap_dest); - mutex_unlock(&snap_dest->lock); - mutex_unlock(&snap_src->lock); + up_write(&snap_dest->lock); + up_write(&snap_src->lock); } up_read(&_origins_lock); @@ -2009,9 +2007,9 @@ static void snapshot_resume(struct dm_target *ti) /* Now we have correct chunk size, reregister */ reregister_snapshot(s); - mutex_lock(&s->lock); + down_write(&s->lock); s->active = 1; - mutex_unlock(&s->lock); + up_write(&s->lock); } static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) @@ -2051,7 +2049,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - mutex_lock(&snap->lock); + down_write(&snap->lock); if (!snap->valid) DMEMIT("Invalid"); @@ -2076,7 +2074,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, DMEMIT("Unknown"); } - mutex_unlock(&snap->lock); + up_write(&snap->lock); break; @@ -2142,7 +2140,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, if (dm_target_is_snapshot_merge(snap->ti)) continue; - mutex_lock(&snap->lock); + down_write(&snap->lock); /* Only deal with valid and active snapshots */ if (!snap->valid || !snap->active) @@ -2169,9 +2167,9 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, if (e) goto next_snapshot; - mutex_unlock(&snap->lock); + up_write(&snap->lock); pe = alloc_pending_exception(snap); - mutex_lock(&snap->lock); + down_write(&snap->lock); if (!snap->valid) { free_pending_exception(pe); @@ -2221,7 +2219,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, } next_snapshot: - mutex_unlock(&snap->lock); + up_write(&snap->lock); if (pe_to_start_now) { start_copy(pe_to_start_now); From f79ae415b64c35d9ecca159fe796cf98d2ff9e9c Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Sun, 17 Mar 2019 14:22:57 +0200 Subject: [PATCH 09/42] dm snapshot: Make exception tables scalable Use list_bl to implement the exception hash tables' buckets. This change permits concurrent access, to distinct buckets, by multiple threads. Also, implement helper functions to lock and unlock the exception tables based on the chunk number of the exception at hand. We retain the global locking, by means of down_write(), which is replaced by the next commit. Still, we must acquire the per-bucket spinlocks when accessing the hash tables, since list_bl does not allow modification on unlocked lists. 
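For context: list_bl packs a bit spinlock into bit 0 of each bucket head's ->first pointer, so a bucket is locked by spinning on that bit rather than on a separate lock word. The sketch below is illustrative only and is not part of the patch (the helper name is invented); it shows what a locked lookup of a single exception-table bucket looks like with these primitives, mirroring dm_lookup_exception() in the diff that follows:

static struct dm_exception *lookup_bucket_locked(struct hlist_bl_head *slot,
                                                 chunk_t chunk)
{
        struct hlist_bl_node *pos;
        struct dm_exception *e, *found = NULL;

        hlist_bl_lock(slot);    /* bit_spin_lock(0, ...) on the bucket head */
        hlist_bl_for_each_entry(e, pos, slot, hash_list) {
                if (chunk >= e->old_chunk &&
                    chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) {
                        found = e;
                        break;
                }
        }
        hlist_bl_unlock(slot);  /* releases the same bit */

        return found;
}

Because the lock lives inside the bucket head itself, the hash table needs no extra memory per bucket and writers touching different buckets never contend.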
Co-developed-by: Ilias Tsitsimpis Signed-off-by: Nikos Tsironis Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-exception-store.h | 3 +- drivers/md/dm-snap.c | 137 ++++++++++++++++++++++++++------ 2 files changed, 116 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 12b5216c2cfe..5a3c696c057f 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -11,6 +11,7 @@ #define _LINUX_DM_EXCEPTION_STORE #include +#include #include /* @@ -27,7 +28,7 @@ typedef sector_t chunk_t; * chunk within the device. */ struct dm_exception { - struct list_head hash_list; + struct hlist_bl_node hash_list; chunk_t old_chunk; chunk_t new_chunk; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 5a67f408876e..10bb37e27ecf 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -44,7 +45,7 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; struct dm_exception_table { uint32_t hash_mask; unsigned hash_shift; - struct list_head *table; + struct hlist_bl_head *table; }; struct dm_snapshot { @@ -618,6 +619,36 @@ static void unregister_snapshot(struct dm_snapshot *s) * The lowest hash_shift bits of the chunk number are ignored, allowing * some consecutive chunks to be grouped together. */ +static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk); + +/* Lock to protect access to the completed and pending exception hash tables. */ +struct dm_exception_table_lock { + struct hlist_bl_head *complete_slot; + struct hlist_bl_head *pending_slot; +}; + +static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, + struct dm_exception_table_lock *lock) +{ + struct dm_exception_table *complete = &s->complete; + struct dm_exception_table *pending = &s->pending; + + lock->complete_slot = &complete->table[exception_hash(complete, chunk)]; + lock->pending_slot = &pending->table[exception_hash(pending, chunk)]; +} + +static void dm_exception_table_lock(struct dm_exception_table_lock *lock) +{ + hlist_bl_lock(lock->complete_slot); + hlist_bl_lock(lock->pending_slot); +} + +static void dm_exception_table_unlock(struct dm_exception_table_lock *lock) +{ + hlist_bl_unlock(lock->pending_slot); + hlist_bl_unlock(lock->complete_slot); +} + static int dm_exception_table_init(struct dm_exception_table *et, uint32_t size, unsigned hash_shift) { @@ -625,12 +656,12 @@ static int dm_exception_table_init(struct dm_exception_table *et, et->hash_shift = hash_shift; et->hash_mask = size - 1; - et->table = dm_vcalloc(size, sizeof(struct list_head)); + et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head)); if (!et->table) return -ENOMEM; for (i = 0; i < size; i++) - INIT_LIST_HEAD(et->table + i); + INIT_HLIST_BL_HEAD(et->table + i); return 0; } @@ -638,15 +669,16 @@ static int dm_exception_table_init(struct dm_exception_table *et, static void dm_exception_table_exit(struct dm_exception_table *et, struct kmem_cache *mem) { - struct list_head *slot; - struct dm_exception *ex, *next; + struct hlist_bl_head *slot; + struct dm_exception *ex; + struct hlist_bl_node *pos, *n; int i, size; size = et->hash_mask + 1; for (i = 0; i < size; i++) { slot = et->table + i; - list_for_each_entry_safe (ex, next, slot, hash_list) + hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) kmem_cache_free(mem, ex); } @@ -660,7 +692,7 @@ static uint32_t exception_hash(struct dm_exception_table 
*et, chunk_t chunk) static void dm_remove_exception(struct dm_exception *e) { - list_del(&e->hash_list); + hlist_bl_del(&e->hash_list); } /* @@ -670,11 +702,12 @@ static void dm_remove_exception(struct dm_exception *e) static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, chunk_t chunk) { - struct list_head *slot; + struct hlist_bl_head *slot; + struct hlist_bl_node *pos; struct dm_exception *e; slot = &et->table[exception_hash(et, chunk)]; - list_for_each_entry (e, slot, hash_list) + hlist_bl_for_each_entry(e, pos, slot, hash_list) if (chunk >= e->old_chunk && chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) return e; @@ -721,7 +754,8 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) static void dm_insert_exception(struct dm_exception_table *eh, struct dm_exception *new_e) { - struct list_head *l; + struct hlist_bl_head *l; + struct hlist_bl_node *pos; struct dm_exception *e = NULL; l = &eh->table[exception_hash(eh, new_e->old_chunk)]; @@ -731,7 +765,7 @@ static void dm_insert_exception(struct dm_exception_table *eh, goto out; /* List is ordered by old_chunk */ - list_for_each_entry_reverse(e, l, hash_list) { + hlist_bl_for_each_entry(e, pos, l, hash_list) { /* Insert after an existing chunk? */ if (new_e->old_chunk == (e->old_chunk + dm_consecutive_chunk_count(e) + 1) && @@ -752,12 +786,24 @@ static void dm_insert_exception(struct dm_exception_table *eh, return; } - if (new_e->old_chunk > e->old_chunk) + if (new_e->old_chunk < e->old_chunk) break; } out: - list_add(&new_e->hash_list, e ? &e->hash_list : l); + if (!e) { + /* + * Either the table doesn't support consecutive chunks or slot + * l is empty. + */ + hlist_bl_add_head(&new_e->hash_list, l); + } else if (new_e->old_chunk < e->old_chunk) { + /* Add before an existing exception */ + hlist_bl_add_before(&new_e->hash_list, &e->hash_list); + } else { + /* Add to l's tail: e is the last exception in this slot */ + hlist_bl_add_behind(&new_e->hash_list, &e->hash_list); + } } /* @@ -766,6 +812,7 @@ out: */ static int dm_add_exception(void *context, chunk_t old, chunk_t new) { + struct dm_exception_table_lock lock; struct dm_snapshot *s = context; struct dm_exception *e; @@ -778,7 +825,17 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) /* Consecutive_count is implicitly initialised to zero */ e->new_chunk = new; + /* + * Although there is no need to lock access to the exception tables + * here, if we don't then hlist_bl_add_head(), called by + * dm_insert_exception(), will complain about accessing the + * corresponding list without locking it first. 
+ */ + dm_exception_table_lock_init(s, old, &lock); + + dm_exception_table_lock(&lock); dm_insert_exception(&s->complete, e); + dm_exception_table_unlock(&lock); return 0; } @@ -807,7 +864,7 @@ static int calc_max_buckets(void) { /* use a fixed size of 2MB */ unsigned long mem = 2 * 1024 * 1024; - mem /= sizeof(struct list_head); + mem /= sizeof(struct hlist_bl_head); return mem; } @@ -1473,13 +1530,18 @@ static void pending_complete(void *context, int success) struct bio *origin_bios = NULL; struct bio *snapshot_bios = NULL; struct bio *full_bio = NULL; + struct dm_exception_table_lock lock; int error = 0; + dm_exception_table_lock_init(s, pe->e.old_chunk, &lock); + if (!success) { /* Read/write error - snapshot is unusable */ down_write(&s->lock); __invalidate_snapshot(s, -EIO); error = 1; + + dm_exception_table_lock(&lock); goto out; } @@ -1488,11 +1550,14 @@ static void pending_complete(void *context, int success) down_write(&s->lock); __invalidate_snapshot(s, -ENOMEM); error = 1; + + dm_exception_table_lock(&lock); goto out; } *e = pe->e; down_write(&s->lock); + dm_exception_table_lock(&lock); if (!s->valid) { free_completed_exception(e); error = 1; @@ -1510,14 +1575,19 @@ static void pending_complete(void *context, int success) /* Wait for conflicting reads to drain */ if (__chunk_is_tracked(s, pe->e.old_chunk)) { + dm_exception_table_unlock(&lock); up_write(&s->lock); __check_for_conflicting_io(s, pe->e.old_chunk); down_write(&s->lock); + dm_exception_table_lock(&lock); } out: /* Remove the in-flight exception from the list */ dm_remove_exception(&pe->e); + + dm_exception_table_unlock(&lock); + snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); full_bio = pe->full_bio; @@ -1733,6 +1803,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) int r = DM_MAPIO_REMAPPED; chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; + struct dm_exception_table_lock lock; init_tracked_chunk(bio); @@ -1742,6 +1813,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) } chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); + dm_exception_table_lock_init(s, chunk, &lock); /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. 
*/ @@ -1749,6 +1821,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_KILL; down_write(&s->lock); + dm_exception_table_lock(&lock); if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_data_dir(bio) == WRITE)) { @@ -1771,9 +1844,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == WRITE) { pe = __lookup_pending_exception(s, chunk); if (!pe) { + dm_exception_table_unlock(&lock); up_write(&s->lock); pe = alloc_pending_exception(s); down_write(&s->lock); + dm_exception_table_lock(&lock); if (!s->valid || s->snapshot_overflowed) { free_pending_exception(pe); @@ -1790,13 +1865,17 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) pe = __find_pending_exception(s, pe, chunk); if (!pe) { + dm_exception_table_unlock(&lock); + if (s->store->userspace_supports_overflow) { s->snapshot_overflowed = 1; DMERR("Snapshot overflowed: Unable to allocate exception."); } else __invalidate_snapshot(s, -ENOMEM); + up_write(&s->lock); + r = DM_MAPIO_KILL; - goto out_unlock; + goto out; } } @@ -1808,6 +1887,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio->bi_iter.bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { pe->started = 1; + dm_exception_table_unlock(&lock); up_write(&s->lock); start_full_bio(pe, bio); goto out; @@ -1818,6 +1898,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; + dm_exception_table_unlock(&lock); up_write(&s->lock); start_copy(pe); goto out; @@ -1828,6 +1909,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) } out_unlock: + dm_exception_table_unlock(&lock); up_write(&s->lock); out: return r; @@ -2129,6 +2211,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, struct dm_snap_pending_exception *pe, *pe2; struct dm_snap_pending_exception *pe_to_start_now = NULL; struct dm_snap_pending_exception *pe_to_start_last = NULL; + struct dm_exception_table_lock lock; chunk_t chunk; /* Do all the snapshots on this origin */ @@ -2140,21 +2223,23 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, if (dm_target_is_snapshot_merge(snap->ti)) continue; - down_write(&snap->lock); - - /* Only deal with valid and active snapshots */ - if (!snap->valid || !snap->active) - goto next_snapshot; - /* Nothing to do if writing beyond end of snapshot */ if (sector >= dm_table_get_size(snap->ti->table)) - goto next_snapshot; + continue; /* * Remember, different snapshots can have * different chunk sizes. 
*/ chunk = sector_to_chunk(snap->store, sector); + dm_exception_table_lock_init(snap, chunk, &lock); + + down_write(&snap->lock); + dm_exception_table_lock(&lock); + + /* Only deal with valid and active snapshots */ + if (!snap->valid || !snap->active) + goto next_snapshot; pe = __lookup_pending_exception(snap, chunk); if (!pe) { @@ -2167,9 +2252,11 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, if (e) goto next_snapshot; + dm_exception_table_unlock(&lock); up_write(&snap->lock); pe = alloc_pending_exception(snap); down_write(&snap->lock); + dm_exception_table_lock(&lock); if (!snap->valid) { free_pending_exception(pe); @@ -2187,8 +2274,11 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, pe = __insert_pending_exception(snap, pe, chunk); if (!pe) { + dm_exception_table_unlock(&lock); __invalidate_snapshot(snap, -ENOMEM); - goto next_snapshot; + up_write(&snap->lock); + + continue; } } else { free_pending_exception(pe); @@ -2219,6 +2309,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, } next_snapshot: + dm_exception_table_unlock(&lock); up_write(&snap->lock); if (pe_to_start_now) { From 3f1637f2103822f9cab4c927d929db57ac4fd933 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Sun, 17 Mar 2019 14:22:58 +0200 Subject: [PATCH 10/42] dm snapshot: Use fine-grained locking scheme Substitute the global locking scheme with a fine grained one, employing the read-write semaphore and the scalable exception tables with per-bucket locks introduced by the previous two commits. Summarizing, we now use a read-write semaphore to protect the mostly read fields of the snapshot structure, e.g., valid, active, etc., and per-bucket bit spinlocks to protect accesses to the complete and pending exception tables. Finally, we use an extra spinlock (pe_allocation_lock) to serialize the allocation of new exceptions by the exception store. This allocation is really fast, so the extra spinlock doesn't hurt the performance. This scheme allows dm-snapshot to scale better, resulting in increased IOPS and reduced latency. 
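Distilled from the diff below rather than copied from it, the lock ordering the new scheme converges on (outermost first) is roughly:

        down_read(&s->lock);                    /* mostly-read snapshot fields: valid, active, ... */
        dm_exception_table_lock(&lock);         /* bit spinlocks of this chunk's complete/pending buckets */

        /* ... look up the completed or pending exception for this chunk ... */

        spin_lock(&s->pe_allocation_lock);      /* prepare_exception() and exception_start_sequence */
        /* ... prepare a new pending exception, if one is needed ... */
        spin_unlock(&s->pe_allocation_lock);

        dm_exception_table_unlock(&lock);
        up_read(&s->lock);

The semaphore is only taken for write on slow paths (snapshot invalidation, overflow handling, handover), which is what allows the common map and origin-write paths to run with it held for read.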
Following are some benchmark results using the null_blk device:

  modprobe null_blk gb=1024 bs=512 submit_queues=8 hw_queue_depth=4096 \
    queue_mode=2 irqmode=1 completion_nsec=1 nr_devices=1

* Benchmark fio_origin_randwrite_throughput_N, from the device mapper test
  suite [1] (direct IO, random 4K writes to origin device, IO engine libaio):

+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1            | 57708       | 66421      |
| 2            | 63415       | 77589      |
| 4            | 67276       | 98839      |
| 8            | 60564       | 109258     |
+--------------+-------------+------------+

* Benchmark fio_origin_randwrite_latency_N, from the device mapper test
  suite [1] (direct IO, random 4K writes to origin device, IO engine psync):

+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1            | 16.25                 | 13.27                |
| 2            | 31.65                 | 25.08                |
| 4            | 55.28                 | 41.08                |
| 8            | 121.47                | 74.44                |
+--------------+-----------------------+----------------------+

* Benchmark fio_snapshot_randwrite_throughput_N, from the device mapper test
  suite [1] (direct IO, random 4K writes to snapshot device, IO engine libaio):

+--------------+-------------+------------+
| # of workers | IOPS Before | IOPS After |
+--------------+-------------+------------+
| 1            | 72593       | 84938      |
| 2            | 97379       | 134973     |
| 4            | 90610       | 143077     |
| 8            | 90537       | 180085     |
+--------------+-------------+------------+

* Benchmark fio_snapshot_randwrite_latency_N, from the device mapper test
  suite [1] (direct IO, random 4K writes to snapshot device, IO engine psync):

+--------------+-----------------------+----------------------+
| # of workers | Latency (usec) Before | Latency (usec) After |
+--------------+-----------------------+----------------------+
| 1            | 12.53                 | 10.6                 |
| 2            | 19.78                 | 14.89                |
| 4            | 40.37                 | 23.47                |
| 8            | 89.32                 | 48.48                |
+--------------+-----------------------+----------------------+

[1] https://github.com/jthornber/device-mapper-test-suite

Co-developed-by: Ilias Tsitsimpis Signed-off-by: Nikos Tsironis Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 84 +++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 10bb37e27ecf..3107f2b1988b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -77,7 +77,9 @@ struct dm_snapshot { atomic_t pending_exceptions_count; - /* Protected by "lock" */ + spinlock_t pe_allocation_lock; + + /* Protected by "pe_allocation_lock" */ sector_t exception_start_sequence; /* Protected by kcopyd single-threaded callback */ @@ -1245,6 +1247,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->snapshot_overflowed = 0; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); + spin_lock_init(&s->pe_allocation_lock); s->exception_start_sequence = 0; s->exception_complete_sequence = 0; s->out_of_order_tree = RB_ROOT; @@ -1522,6 +1525,13 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) dm_table_event(s->ti->table); } +static void invalidate_snapshot(struct dm_snapshot *s, int err) +{ + down_write(&s->lock); + __invalidate_snapshot(s, err); + up_write(&s->lock); +} + static void pending_complete(void *context, int success) { struct dm_snap_pending_exception *pe = context; @@ -1537,8 +1547,7 @@ static void pending_complete(void *context, int success) if (!success) { /* Read/write
error - snapshot is unusable */ - down_write(&s->lock); - __invalidate_snapshot(s, -EIO); + invalidate_snapshot(s, -EIO); error = 1; dm_exception_table_lock(&lock); @@ -1547,8 +1556,7 @@ static void pending_complete(void *context, int success) e = alloc_completed_exception(GFP_NOIO); if (!e) { - down_write(&s->lock); - __invalidate_snapshot(s, -ENOMEM); + invalidate_snapshot(s, -ENOMEM); error = 1; dm_exception_table_lock(&lock); @@ -1556,11 +1564,13 @@ static void pending_complete(void *context, int success) } *e = pe->e; - down_write(&s->lock); + down_read(&s->lock); dm_exception_table_lock(&lock); if (!s->valid) { + up_read(&s->lock); free_completed_exception(e); error = 1; + goto out; } @@ -1572,13 +1582,12 @@ static void pending_complete(void *context, int success) * merging can overwrite the chunk in origin. */ dm_insert_exception(&s->complete, e); + up_read(&s->lock); /* Wait for conflicting reads to drain */ if (__chunk_is_tracked(s, pe->e.old_chunk)) { dm_exception_table_unlock(&lock); - up_write(&s->lock); __check_for_conflicting_io(s, pe->e.old_chunk); - down_write(&s->lock); dm_exception_table_lock(&lock); } @@ -1595,8 +1604,6 @@ out: full_bio->bi_end_io = pe->full_bio_end_io; increment_pending_exceptions_done_count(); - up_write(&s->lock); - /* Submit any pending write bios */ if (error) { if (full_bio) @@ -1738,8 +1745,8 @@ __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) /* * Inserts a pending exception into the pending table. * - * NOTE: a write lock must be held on snap->lock before calling - * this. + * NOTE: a write lock must be held on the chunk's pending exception table slot + * before calling this. */ static struct dm_snap_pending_exception * __insert_pending_exception(struct dm_snapshot *s, @@ -1751,12 +1758,15 @@ __insert_pending_exception(struct dm_snapshot *s, pe->started = 0; pe->full_bio = NULL; + spin_lock(&s->pe_allocation_lock); if (s->store->type->prepare_exception(s->store, &pe->e)) { + spin_unlock(&s->pe_allocation_lock); free_pending_exception(pe); return NULL; } pe->exception_sequence = s->exception_start_sequence++; + spin_unlock(&s->pe_allocation_lock); dm_insert_exception(&s->pending, &pe->e); @@ -1768,8 +1778,8 @@ __insert_pending_exception(struct dm_snapshot *s, * for this chunk, otherwise it allocates a new one and inserts * it into the pending table. * - * NOTE: a write lock must be held on snap->lock before calling - * this. + * NOTE: a write lock must be held on the chunk's pending exception table slot + * before calling this. 
*/ static struct dm_snap_pending_exception * __find_pending_exception(struct dm_snapshot *s, @@ -1820,7 +1830,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid) return DM_MAPIO_KILL; - down_write(&s->lock); + down_read(&s->lock); dm_exception_table_lock(&lock); if (!s->valid || (unlikely(s->snapshot_overflowed) && @@ -1845,17 +1855,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) pe = __lookup_pending_exception(s, chunk); if (!pe) { dm_exception_table_unlock(&lock); - up_write(&s->lock); pe = alloc_pending_exception(s); - down_write(&s->lock); dm_exception_table_lock(&lock); - if (!s->valid || s->snapshot_overflowed) { - free_pending_exception(pe); - r = DM_MAPIO_KILL; - goto out_unlock; - } - e = dm_lookup_exception(&s->complete, chunk); if (e) { free_pending_exception(pe); @@ -1866,10 +1868,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) pe = __find_pending_exception(s, pe, chunk); if (!pe) { dm_exception_table_unlock(&lock); + up_read(&s->lock); + + down_write(&s->lock); if (s->store->userspace_supports_overflow) { - s->snapshot_overflowed = 1; - DMERR("Snapshot overflowed: Unable to allocate exception."); + if (s->valid && !s->snapshot_overflowed) { + s->snapshot_overflowed = 1; + DMERR("Snapshot overflowed: Unable to allocate exception."); + } } else __invalidate_snapshot(s, -ENOMEM); up_write(&s->lock); @@ -1887,8 +1894,10 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio->bi_iter.bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { pe->started = 1; + dm_exception_table_unlock(&lock); - up_write(&s->lock); + up_read(&s->lock); + start_full_bio(pe, bio); goto out; } @@ -1896,10 +1905,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio_list_add(&pe->snapshot_bios, bio); if (!pe->started) { - /* this is protected by snap->lock */ + /* this is protected by the exception table lock */ pe->started = 1; + dm_exception_table_unlock(&lock); - up_write(&s->lock); + up_read(&s->lock); + start_copy(pe); goto out; } @@ -1910,7 +1921,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) out_unlock: dm_exception_table_unlock(&lock); - up_write(&s->lock); + up_read(&s->lock); out: return r; } @@ -2234,7 +2245,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, chunk = sector_to_chunk(snap->store, sector); dm_exception_table_lock_init(snap, chunk, &lock); - down_write(&snap->lock); + down_read(&snap->lock); dm_exception_table_lock(&lock); /* Only deal with valid and active snapshots */ @@ -2253,16 +2264,9 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, goto next_snapshot; dm_exception_table_unlock(&lock); - up_write(&snap->lock); pe = alloc_pending_exception(snap); - down_write(&snap->lock); dm_exception_table_lock(&lock); - if (!snap->valid) { - free_pending_exception(pe); - goto next_snapshot; - } - pe2 = __lookup_pending_exception(snap, chunk); if (!pe2) { @@ -2275,9 +2279,9 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, pe = __insert_pending_exception(snap, pe, chunk); if (!pe) { dm_exception_table_unlock(&lock); - __invalidate_snapshot(snap, -ENOMEM); - up_write(&snap->lock); + up_read(&snap->lock); + invalidate_snapshot(snap, -ENOMEM); continue; } } else { @@ -2310,7 +2314,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, next_snapshot: dm_exception_table_unlock(&lock); - up_write(&snap->lock); + up_read(&snap->lock); if (pe_to_start_now) { 
start_copy(pe_to_start_now); From 09f2d6563055b8ff0948cefb8911a4de0d559963 Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Fri, 12 Apr 2019 11:27:18 -0400 Subject: [PATCH 11/42] dm writecache: remove needless dereferences in __writecache_writeback_pmem() bio is already available so there is no need to access it in terms of the wb pointer. Signed-off-by: Huaisheng Ye Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index f7822875589e..5b4d1c11eff1 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1478,9 +1478,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set); wb = container_of(bio, struct writeback_struct, bio); wb->wc = wc; - wb->bio.bi_end_io = writecache_writeback_endio; - bio_set_dev(&wb->bio, wc->dev->bdev); - wb->bio.bi_iter.bi_sector = read_original_sector(wc, e); + bio->bi_end_io = writecache_writeback_endio; + bio_set_dev(bio, wc->dev->bdev); + bio->bi_iter.bi_sector = read_original_sector(wc, e); wb->page_offset = PAGE_SIZE; if (max_pages <= WB_LIST_INLINE || unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), @@ -1507,12 +1507,12 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba wb->wc_list[wb->wc_list_n++] = f; e = f; } - bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); + bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); if (writecache_has_error(wc)) { bio->bi_status = BLK_STS_IOERR; - bio_endio(&wb->bio); + bio_endio(bio); } else { - submit_bio(&wb->bio); + submit_bio(bio); } __writeback_throttle(wc, wbl); From 84420b1e5d7254e58f551e32244d13c8e124bdcc Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Fri, 12 Apr 2019 11:28:14 -0400 Subject: [PATCH 12/42] dm writecache: add unlikely for returned value of rb_next/prev In functions writecache_discard() and writecache_find_entry() there is a high probablity that the pointer of structure rb_node won't equal NULL. Add unlikely for the pointer node NULL. Signed-off-by: Huaisheng Ye Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 5b4d1c11eff1..cfbbfbc028f9 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -571,7 +571,7 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, node = rb_prev(&e->rb_node); else node = rb_next(&e->rb_node); - if (!node) + if (unlikely(!node)) return e; e2 = container_of(node, struct wc_entry, rb_node); if (read_original_sector(wc, e2) != block) @@ -804,7 +804,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_ writecache_free_entry(wc, e); } - if (!node) + if (unlikely(!node)) break; e = container_of(node, struct wc_entry, rb_node); From c6e086e0c9b2133f38563707c96b22fd0906f570 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 22 Mar 2019 16:12:22 -0400 Subject: [PATCH 13/42] dm space map common: zero entire ll_disk Otherwise, memory that is allocated (and potentially not previously zeroed) will get written to disk as part of the space maps. 
Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-space-map-common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 0a3b8ae4a29c..b8a62188f6be 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -190,6 +190,8 @@ static int sm_find_free(void *addr, unsigned begin, unsigned end, static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) { + memset(ll, 0, sizeof(struct ll_disk)); + ll->tm = tm; ll->bitmap_info.tm = tm; From a1ed4d9e937678d15990694c32e1d104e54e97a3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 15 Apr 2019 16:40:08 -0400 Subject: [PATCH 14/42] dm thin metadata: check __commit_transaction()'s return Fix __reserve_metadata_snap() to return early if __commit_transaction() fails. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index ed3caceaed07..7381e477a945 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1225,7 +1225,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) * We commit to ensure the btree roots which we increment in a * moment are up to date. */ - __commit_transaction(pmd); + r = __commit_transaction(pmd); + if (r < 0) { + DMWARN("%s: __commit_transaction() failed, error = %d", + __func__, r); + return r; + } /* * Copy the superblock. From 6a1b1ddc6a2cfb32da8f5e75f1aa053280682a05 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 15 Apr 2019 16:54:36 -0400 Subject: [PATCH 15/42] dm thin metadata: add wrappers for managing write locking of metadata No functional change, but this prepares to hook off of pmd_write_lock() with additional functionality (as provided in next commit). 
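As a rough sketch of what this prepares for, based on the next patch's description rather than on code shown here, routing every metadata-mutating caller through pmd_write_lock() makes the wrapper the single place to mark the pool as having been put in service:

static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
        __pmd_write_lock(pmd);
        pmd->in_service = true; /* the hook the next commit adds (sketch) */
}

pmd_write_lock_in_core() stays a plain __pmd_write_lock() for in-core-only changes and for code, such as commit, that should not put the pool in service on its own.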
Suggested-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 108 ++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 44 deletions(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 7381e477a945..fd63485b27e9 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -367,6 +367,26 @@ static int subtree_equal(void *context, const void *value1_le, const void *value /*----------------------------------------------------------------*/ +static inline void __pmd_write_lock(struct dm_pool_metadata *pmd) + __acquires(pmd->root_lock) +{ + down_write(&pmd->root_lock); +} +#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd)) + +static inline void pmd_write_lock(struct dm_pool_metadata *pmd) +{ + __pmd_write_lock(pmd); +} + +static inline void pmd_write_unlock(struct dm_pool_metadata *pmd) + __releases(pmd->root_lock) +{ + up_write(&pmd->root_lock); +} + +/*----------------------------------------------------------------*/ + static int superblock_lock_zero(struct dm_pool_metadata *pmd, struct dm_block **sblock) { @@ -1032,10 +1052,10 @@ int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __create_thin(pmd, dev); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1123,10 +1143,10 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __create_snap(pmd, dev, origin); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1166,10 +1186,10 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __delete_device(pmd, dev); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1180,7 +1200,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (pmd->fail_io) goto out; @@ -1194,7 +1214,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, r = 0; out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1288,10 +1308,10 @@ int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __reserve_metadata_snap(pmd); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1336,10 +1356,10 @@ int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __release_metadata_snap(pmd); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1382,19 +1402,19 @@ int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); if (!pmd->fail_io) r = __open_device(pmd, dev, 0, td); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } int dm_pool_close_thin_device(struct dm_thin_device *td) { - down_write(&td->pmd->root_lock); + pmd_write_lock_in_core(td->pmd); __close_device(td); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return 0; } @@ -1575,10 +1595,10 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, { int r = -EINVAL; - down_write(&td->pmd->root_lock); + 
pmd_write_lock(td->pmd); if (!td->pmd->fail_io) r = __insert(td, block, data_block); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return r; } @@ -1662,10 +1682,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) { int r = -EINVAL; - down_write(&td->pmd->root_lock); + pmd_write_lock(td->pmd); if (!td->pmd->fail_io) r = __remove(td, block); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return r; } @@ -1675,10 +1695,10 @@ int dm_thin_remove_range(struct dm_thin_device *td, { int r = -EINVAL; - down_write(&td->pmd->root_lock); + pmd_write_lock(td->pmd); if (!td->pmd->fail_io) r = __remove_range(td, begin, end); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return r; } @@ -1701,13 +1721,13 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ { int r = 0; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); for (; b != e; b++) { r = dm_sm_inc_block(pmd->data_sm, b); if (r) break; } - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1716,13 +1736,13 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ { int r = 0; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); for (; b != e; b++) { r = dm_sm_dec_block(pmd->data_sm, b); if (r) break; } - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1770,10 +1790,10 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = dm_sm_new_block(pmd->data_sm, result); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1782,7 +1802,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (pmd->fail_io) goto out; @@ -1795,7 +1815,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) */ r = __begin_transaction(pmd); out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1811,7 +1831,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (pmd->fail_io) goto out; @@ -1822,7 +1842,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) pmd->fail_io = true; out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1953,10 +1973,10 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __resize_space_map(pmd->data_sm, new_count); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1965,29 +1985,29 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) { r = __resize_space_map(pmd->metadata_sm, new_count); if (!r) __set_metadata_reserve(pmd); } - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) { - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); dm_bm_set_read_only(pmd->bm); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); } void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) { - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); dm_bm_set_read_write(pmd->bm); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); } int dm_pool_register_metadata_threshold(struct dm_pool_metadata 
*pmd, @@ -1997,9 +2017,9 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, { int r; - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -2010,7 +2030,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) struct dm_block *sblock; struct thin_disk_superblock *disk_super; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; r = superblock_lock(pmd, &sblock); @@ -2024,7 +2044,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) dm_bm_unlock(sblock); out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } From 873f258becca87f4dd973fe0ba09b88b737c9b14 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 18 Apr 2019 10:29:48 -0400 Subject: [PATCH 16/42] dm thin metadata: do not write metadata if no changes occurred Otherwise, just activating a thin-pool and thin device and then deactivating them will cause the thin-pool metadata to be changed (e.g. superblock written) -- even without any metadata being changed. Add 'in_service' flag to struct dm_pool_metadata and set it in pmd_write_lock() because all on-disk metadata changes must take a write lock of pmd->root_lock. Once 'in_service' is set it is never cleared. __commit_transaction() will return 0 if 'in_service' is not set. dm_pool_commit_metadata() is updated to use __pmd_write_lock() so that it isn't the sole reason for putting a thin-pool in service. Also fix dm_pool_commit_metadata() to open the next transaction if the return from __commit_transaction() is 0. Not seeing why the early return ever made since for a return of 0 given that dm-io's async_io(), as used by bufio, always returns 0. Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index fd63485b27e9..7f0840601737 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -201,6 +201,13 @@ struct dm_pool_metadata { */ bool fail_io:1; + /* + * Set once a thin-pool has been accessed through one of the interfaces + * that imply the pool is in-service (e.g. thin devices created/deleted, + * thin-pool message, metadata snapshots, etc). + */ + bool in_service:1; + /* * Reading the space map roots can fail, so we read it into these * buffers before the superblock is locked and updated. @@ -367,6 +374,10 @@ static int subtree_equal(void *context, const void *value1_le, const void *value /*----------------------------------------------------------------*/ +/* + * Variant that is used for in-core only changes or code that + * shouldn't put the pool in service on its own (e.g. commit). 
+ */ static inline void __pmd_write_lock(struct dm_pool_metadata *pmd) __acquires(pmd->root_lock) { @@ -377,6 +388,8 @@ static inline void __pmd_write_lock(struct dm_pool_metadata *pmd) static inline void pmd_write_lock(struct dm_pool_metadata *pmd) { __pmd_write_lock(pmd); + if (unlikely(!pmd->in_service)) + pmd->in_service = true; } static inline void pmd_write_unlock(struct dm_pool_metadata *pmd) @@ -810,6 +823,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) */ BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); + if (unlikely(!pmd->in_service)) + return 0; + r = __write_changed_details(pmd); if (r < 0) return r; @@ -873,6 +889,7 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, pmd->time = 0; INIT_LIST_HEAD(&pmd->thin_devices); pmd->fail_io = false; + pmd->in_service = false; pmd->bdev = bdev; pmd->data_block_size = data_block_size; @@ -923,7 +940,6 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) DMWARN("%s: __commit_transaction() failed, error = %d", __func__, r); } - if (!pmd->fail_io) __destroy_persistent_data_objects(pmd); @@ -1802,12 +1818,16 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) { int r = -EINVAL; - pmd_write_lock(pmd); + /* + * Care is taken to not have commit be what + * triggers putting the thin-pool in-service. + */ + __pmd_write_lock(pmd); if (pmd->fail_io) goto out; r = __commit_transaction(pmd); - if (r <= 0) + if (r < 0) goto out; /* From 5de719e3d01b4abe0de0d7b857148a880ff2a90b Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 24 Apr 2019 23:19:05 +0800 Subject: [PATCH 17/42] dm mpath: fix missing call of path selector type->end_io After commit 396eaf21ee17 ("blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request feedback"), map_request() will requeue the tio when issued clone request return BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE. Thus, if device driver status is error, a tio may be requeued multiple times until the return value is not DM_MAPIO_REQUEUE. That means type->start_io may be called multiple times, while type->end_io is only called when IO complete. In fact, even without commit 396eaf21ee17, setup_clone() failure can also cause tio requeue and associated missed call to type->end_io. The service-time path selector selects path based on in_flight_size, which is increased by st_start_io() and decreased by st_end_io(). Missed calls to st_end_io() can lead to in_flight_size count error and will cause the selector to make the wrong choice. In addition, queue-length path selector will also be affected. To fix the problem, call type->end_io in ->release_clone_rq before tio requeue. map_info is passed to ->release_clone_rq() for map_request() error path that result in requeue. 
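A toy userspace model of the accounting problem described above; it simplifies in_flight_size to a plain counter, and the path names are made up for the example. Once one start_io() is left without a matching end_io(), the selector avoids that path forever even though it is idle:

#include <stdio.h>

struct path {
	const char *name;
	long in_flight;	/* grown by start_io(), shrunk by end_io() */
};

static void start_io(struct path *p) { p->in_flight++; }
static void end_io(struct path *p)   { p->in_flight--; }

/* service-time style choice: prefer the path with the least in-flight work */
static struct path *select_path(struct path *paths, int n)
{
	struct path *best = &paths[0];
	int i;

	for (i = 1; i < n; i++)
		if (paths[i].in_flight < best->in_flight)
			best = &paths[i];
	return best;
}

int main(void)
{
	struct path paths[2] = { { "path0", 0 }, { "path1", 0 } };
	int i;

	/* a normal request on path1: start_io() balanced by end_io() */
	start_io(&paths[1]);
	end_io(&paths[1]);

	/* a requeued request on path0: start_io() ran, end_io() never did */
	start_io(&paths[0]);

	/* path0 now looks permanently busier than it is, so it is never chosen */
	for (i = 0; i < 4; i++)
		printf("selected %s\n", select_path(paths, 2)->name);
	return 0;
}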
Fixes: 396eaf21ee17 ("blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request feedback") Cc: stable@vger.kernl.org Signed-off-by: Yufen Yu Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 17 ++++++++++++++++- drivers/md/dm-rq.c | 8 ++++---- drivers/md/dm-target.c | 3 ++- include/linux/device-mapper.h | 3 ++- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 2ee5e357a0a7..07cfef67c0ba 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -544,8 +544,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_REMAPPED; } -static void multipath_release_clone(struct request *clone) +static void multipath_release_clone(struct request *clone, + union map_info *map_context) { + if (unlikely(map_context)) { + /* + * non-NULL map_context means caller is still map + * method; must undo multipath_clone_and_map() + */ + struct dm_mpath_io *mpio = get_mpio(map_context); + struct pgpath *pgpath = mpio->pgpath; + + if (pgpath && pgpath->pg->ps.type->end_io) + pgpath->pg->ps.type->end_io(&pgpath->pg->ps, + &pgpath->path, + mpio->nr_bytes); + } + blk_put_request(clone); } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b66745bd08bb..5f7063f05ae0 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -168,7 +168,7 @@ static void dm_end_request(struct request *clone, blk_status_t error) struct request *rq = tio->orig; blk_rq_unprep_clone(clone); - tio->ti->type->release_clone_rq(clone); + tio->ti->type->release_clone_rq(clone, NULL); rq_end_stats(md, rq); blk_mq_end_request(rq, error); @@ -201,7 +201,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ rq_end_stats(md, rq); if (tio->clone) { blk_rq_unprep_clone(tio->clone); - tio->ti->type->release_clone_rq(tio->clone); + tio->ti->type->release_clone_rq(tio->clone, NULL); } dm_mq_delay_requeue_request(rq, delay_ms); @@ -398,7 +398,7 @@ static int map_request(struct dm_rq_target_io *tio) case DM_MAPIO_REMAPPED: if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { /* -ENOMEM */ - ti->type->release_clone_rq(clone); + ti->type->release_clone_rq(clone, &tio->info); return DM_MAPIO_REQUEUE; } @@ -408,7 +408,7 @@ static int map_request(struct dm_rq_target_io *tio) ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_rq_unprep_clone(clone); - tio->ti->type->release_clone_rq(clone); + tio->ti->type->release_clone_rq(clone, &tio->info); tio->clone = NULL; return DM_MAPIO_REQUEUE; } diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 314d17ca6466..64dd0b34fcf4 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -136,7 +136,8 @@ static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, return DM_MAPIO_KILL; } -static void io_err_release_clone_rq(struct request *clone) +static void io_err_release_clone_rq(struct request *clone, + union map_info *map_context) { } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index b0672756d056..e1f51d607cc5 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -62,7 +62,8 @@ typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti, struct request *rq, union map_info *map_context, struct request **clone); -typedef void (*dm_release_clone_request_fn) (struct request *clone); +typedef void (*dm_release_clone_request_fn) (struct request *clone, + union map_info *map_context); /* * Returns: From 
514cf4f881dc82507d87d2ccd5e7478fd36632fa Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Thu, 18 Apr 2019 21:59:19 +0800 Subject: [PATCH 18/42] dm: only initialize md->dax_dev if CONFIG_DAX_DRIVER is enabled md->dax_dev defaults to NULL and there is no need to initialize it if CONFIG_DAX_DRIVER is disabled. Signed-off-by: Peng Wang Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 043f0761e4a0..56c34a0a9cd9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1906,7 +1906,6 @@ static void cleanup_mapped_device(struct mapped_device *md) static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); - struct dax_device *dax_dev = NULL; struct mapped_device *md; void *old_md; @@ -1969,11 +1968,10 @@ static struct mapped_device *alloc_dev(int minor) sprintf(md->disk->disk_name, "dm-%d", minor); if (IS_ENABLED(CONFIG_DAX_DRIVER)) { - dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); - if (!dax_dev) + md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); + if (!md->dax_dev) goto bad; } - md->dax_dev = dax_dev; add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); From 81bc6d150ace6250503b825d9d0c10f7bbd24095 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 25 Apr 2019 12:07:54 -0400 Subject: [PATCH 19/42] dm delay: fix a crash when invalid device is specified When the target line contains an invalid device, delay_ctr() will call delay_dtr() with a NULL workqueue. Attempting to destroy the NULL workqueue causes a crash. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index fddffe251bf6..f496213f8b67 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -121,7 +121,8 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - destroy_workqueue(dc->kdelayd_wq); + if (dc->kdelayd_wq) + destroy_workqueue(dc->kdelayd_wq); if (dc->read.dev) dm_put_device(ti, dc->read.dev); From 08a8e804620bbb41d06d144441d19c589bb17aa4 Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Thu, 25 Apr 2019 21:31:19 +0800 Subject: [PATCH 20/42] dm writecache: remove unused member page_offset in writeback_struct The structure member page_offset in writeback_struct has never actually been used. Remove it.
Signed-off-by: Huaisheng Ye Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index cfbbfbc028f9..ddf1732dae50 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -190,7 +190,6 @@ struct writeback_struct { struct dm_writecache *wc; struct wc_entry **wc_list; unsigned wc_list_n; - unsigned page_offset; struct page *page; struct wc_entry *wc_list_inline[WB_LIST_INLINE]; struct bio bio; @@ -1481,7 +1480,6 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba bio->bi_end_io = writecache_writeback_endio; bio_set_dev(bio, wc->dev->bdev); bio->bi_iter.bi_sector = read_original_sector(wc, e); - wb->page_offset = PAGE_SIZE; if (max_pages <= WB_LIST_INLINE || unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), GFP_NOIO | __GFP_NORETRY | From f8011d334426cee77276a1038b627b5cb0470258 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 26 Apr 2019 09:59:24 -0400 Subject: [PATCH 21/42] dm writecache: avoid unnecessary lookups in writecache_find_entry() This is a small optimization in writecache_find_entry(). If we go past the condition "if (unlikely(!node))", we can be certain that there is no entry in the tree that has the block equal to the "block" variable. Consequently, we can return the next entry directly, we don't need to go to the second part of the function that finds the entry with lowest or highest seq number that matches the "block" variable. Also, add some whitespace and cleanup needless braces. Suggested-by: Huaisheng Ye Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index ddf1732dae50..1cb137f0ef9d 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -545,21 +545,20 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, e = container_of(node, struct wc_entry, rb_node); if (read_original_sector(wc, e) == block) break; + node = (read_original_sector(wc, e) >= block ? e->rb_node.rb_left : e->rb_node.rb_right); if (unlikely(!node)) { - if (!(flags & WFE_RETURN_FOLLOWING)) { + if (!(flags & WFE_RETURN_FOLLOWING)) return NULL; - } if (read_original_sector(wc, e) >= block) { - break; + return e; } else { node = rb_next(&e->rb_node); - if (unlikely(!node)) { + if (unlikely(!node)) return NULL; - } e = container_of(node, struct wc_entry, rb_node); - break; + return e; } } } From e4f3fabd67480bf2ad3f71aa6126ffb8bb7dc712 Mon Sep 17 00:00:00 2001 From: Bryan Gurney Date: Thu, 7 Mar 2019 15:42:39 -0500 Subject: [PATCH 22/42] dm: add dust target Add the dm-dust target, which simulates the behavior of bad sectors at arbitrary locations, and the ability to enable the emulation of the read failures at an arbitrary time. This target behaves similarly to a linear target. At a given time, the user can send a message to the target to start failing read requests on specific blocks. When the failure behavior is enabled, reads of blocks configured "bad" will fail with EIO. Writes of blocks configured "bad" will result in the following: 1. Remove the block from the "bad block list". 2. Successfully complete the write. After this point, the block will successfully contain the written data, and will service reads and writes normally. 
This emulates the behavior of a "remapped sector" on a hard disk drive. dm-dust provides logging of which blocks have been added or removed to the "bad block list", as well as logging when a block has been removed from the bad block list. These messages can be used alongside the messages from the driver using a dm-dust device to analyze the driver's behavior when a read fails at a given time. (This logging can be reduced via a "quiet" mode, if desired.) NOTE: If the block size is larger than 512 bytes, only the first sector of each "dust block" is detected. Placing a limiting layer above a dust target, to limit the minimum I/O size to the dust block size, will ensure proper emulation of the given large block size. Signed-off-by: Bryan Gurney Co-developed-by: Joe Shimkus Co-developed-by: John Dorminy Co-developed-by: John Pittman Co-developed-by: Thomas Jaskiewicz Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-dust.txt | 272 +++++++++++++ drivers/md/Kconfig | 9 + drivers/md/Makefile | 1 + drivers/md/dm-dust.c | 515 ++++++++++++++++++++++++ 4 files changed, 797 insertions(+) create mode 100644 Documentation/device-mapper/dm-dust.txt create mode 100644 drivers/md/dm-dust.c diff --git a/Documentation/device-mapper/dm-dust.txt b/Documentation/device-mapper/dm-dust.txt new file mode 100644 index 000000000000..954d402a1f6a --- /dev/null +++ b/Documentation/device-mapper/dm-dust.txt @@ -0,0 +1,272 @@ +dm-dust +======= + +This target emulates the behavior of bad sectors at arbitrary +locations, and the ability to enable the emulation of the failures +at an arbitrary time. + +This target behaves similarly to a linear target. At a given time, +the user can send a message to the target to start failing read +requests on specific blocks (to emulate the behavior of a hard disk +drive with bad sectors). + +When the failure behavior is enabled (i.e.: when the output of +"dmsetup status" displays "fail_read_on_bad_block"), reads of blocks +in the "bad block list" will fail with EIO ("Input/output error"). + +Writes of blocks in the "bad block list will result in the following: + +1. Remove the block from the "bad block list". +2. Successfully complete the write. + +This emulates the "remapped sector" behavior of a drive with bad +sectors. + +Normally, a drive that is encountering bad sectors will most likely +encounter more bad sectors, at an unknown time or location. +With dm-dust, the user can use the "addbadblock" and "removebadblock" +messages to add arbitrary bad blocks at new locations, and the +"enable" and "disable" messages to modulate the state of whether the +configured "bad blocks" will be treated as bad, or bypassed. +This allows the pre-writing of test data and metadata prior to +simulating a "failure" event where bad sectors start to appear. + +Table parameters: +----------------- + + +Mandatory parameters: + : path to the block device. 
+ : offset to data area from start of device_path + : block size in bytes + (minimum 512, maximum 1073741824, must be a power of 2) + +Usage instructions: +------------------- + +First, find the size (in 512-byte sectors) of the device to be used: + +$ sudo blockdev --getsz /dev/vdb1 +33552384 + +Create the dm-dust device: +(For a device with a block size of 512 bytes) +$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512' + +(For a device with a block size of 4096 bytes) +$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096' + +Check the status of the read behavior ("bypass" indicates that all I/O +will be passed through to the underlying device): +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 bypass + +$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct +128+0 records in +128+0 records out + +$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct +128+0 records in +128+0 records out + +Adding and removing bad blocks: +------------------------------- + +At any time (i.e.: whether the device has the "bad block" emulation +enabled or disabled), bad blocks may be added or removed from the +device via the "addbadblock" and "removebadblock" messages: + +$ sudo dmsetup message dust1 0 addbadblock 60 +kernel: device-mapper: dust: badblock added at block 60 + +$ sudo dmsetup message dust1 0 addbadblock 67 +kernel: device-mapper: dust: badblock added at block 67 + +$ sudo dmsetup message dust1 0 addbadblock 72 +kernel: device-mapper: dust: badblock added at block 72 + +These bad blocks will be stored in the "bad block list". +While the device is in "bypass" mode, reads and writes will succeed: + +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 bypass + +Enabling block read failures: +----------------------------- + +To enable the "fail read on bad block" behavior, send the "enable" message: + +$ sudo dmsetup message dust1 0 enable +kernel: device-mapper: dust: enabling read failures on bad sectors + +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 fail_read_on_bad_block + +With the device in "fail read on bad block" mode, attempting to read a +block will encounter an "Input/output error": + +$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct +dd: error reading '/dev/mapper/dust1': Input/output error +0+0 records in +0+0 records out +0 bytes copied, 0.00040651 s, 0.0 kB/s + +...and writing to the bad blocks will remove the blocks from the list, +therefore emulating the "remap" behavior of hard disk drives: + +$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct +128+0 records in +128+0 records out + +kernel: device-mapper: dust: block 60 removed from badblocklist by write +kernel: device-mapper: dust: block 67 removed from badblocklist by write +kernel: device-mapper: dust: block 72 removed from badblocklist by write +kernel: device-mapper: dust: block 87 removed from badblocklist by write + +Bad block add/remove error handling: +------------------------------------ + +Attempting to add a bad block that already exists in the list will +result in an "Invalid argument" error, as well as a helpful message: + +$ sudo dmsetup message dust1 0 addbadblock 88 +device-mapper: message ioctl on dust1 failed: Invalid argument +kernel: device-mapper: dust: block 88 already in badblocklist + +Attempting to remove a bad block that doesn't exist in the list will +result in an "Invalid argument" error, as well as a helpful message: + +$ sudo dmsetup message dust1 0 removebadblock 
87 +device-mapper: message ioctl on dust1 failed: Invalid argument +kernel: device-mapper: dust: block 87 not found in badblocklist + +Counting the number of bad blocks in the bad block list: +-------------------------------------------------------- + +To count the number of bad blocks configured in the device, run the +following message command: + +$ sudo dmsetup message dust1 0 countbadblocks + +A message will print with the number of bad blocks currently +configured on the device: + +kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found + +Querying for specific bad blocks: +--------------------------------- + +To find out if a specific block is in the bad block list, run the +following message command: + +$ sudo dmsetup message dust1 0 queryblock 72 + +The following message will print if the block is in the list: +device-mapper: dust: queryblock: block 72 found in badblocklist + +The following message will print if the block is in the list: +device-mapper: dust: queryblock: block 72 not found in badblocklist + +The "queryblock" message command will work in both the "enabled" +and "disabled" modes, allowing the verification of whether a block +will be treated as "bad" without having to issue I/O to the device, +or having to "enable" the bad block emulation. + +Clearing the bad block list: +---------------------------- + +To clear the bad block list (without needing to individually run +a "removebadblock" message command for every block), run the +following message command: + +$ sudo dmsetup message dust1 0 clearbadblocks + +After clearing the bad block list, the following message will appear: + +kernel: device-mapper: dust: clearbadblocks: badblocks cleared + +If there were no bad blocks to clear, the following message will +appear: + +kernel: device-mapper: dust: clearbadblocks: no badblocks found + +Message commands list: +---------------------- + +Below is a list of the messages that can be sent to a dust device: + +Operations on blocks (requires a argument): + +addbadblock +queryblock +removebadblock + +...where is a block number within range of the device + (corresponding to the block size of the device.) + +Single argument message commands: + +countbadblocks +clearbadblocks +disable +enable +quiet + +Device removal: +--------------- + +When finished, remove the device via the "dmsetup remove" command: + +$ sudo dmsetup remove dust1 + +Quiet mode: +----------- + +On test runs with many bad blocks, it may be desirable to avoid +excessive logging (from bad blocks added, removed, or "remapped"). +This can be done by enabling "quiet mode" via the following message: + +$ sudo dmsetup message dust1 0 quiet + +This will suppress log messages from add / remove / removed by write +operations. Log messages from "countbadblocks" or "queryblock" +message commands will still print in quiet mode. + +The status of quiet mode can be seen by running "dmsetup status": + +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 fail_read_on_bad_block quiet + +To disable quiet mode, send the "quiet" message again: + +$ sudo dmsetup message dust1 0 quiet + +$ sudo dmsetup status dust1 +0 33552384 dust 252:17 fail_read_on_bad_block verbose + +(The presence of "verbose" indicates normal logging.) + +"Why not...?" +------------- + +scsi_debug has a "medium error" mode that can fail reads on one +specified sector (sector 0x1234, hardcoded in the source code), but +it uses RAM for the persistent storage, which drastically decreases +the potential device size. 
+ +dm-flakey fails all I/O from all block locations at a specified time +frequency, and not a given point in time. + +When a bad sector occurs on a hard disk drive, reads to that sector +are failed by the device, usually resulting in an error code of EIO +("I/O error") or ENODATA ("No data available"). However, a write to +the sector may succeed, and result in the sector becoming readable +after the device controller no longer experiences errors reading the +sector (or after a reallocation of the sector). However, there may +be bad sectors that occur on the device in the future, in a different, +unpredictable location. + +This target seeks to provide a device that can exhibit the behavior +of a bad sector at a known sector location, at a known time, based +on a large storage device (at least tens of gigabytes, not occupying +system memory). diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2557f198e175..db269a348b20 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -436,6 +436,15 @@ config DM_DELAY If unsure, say N. +config DM_DUST + tristate "Bad sector simulation target" + depends on BLK_DEV_DM + ---help--- + A target that simulates bad sector behavior. + Useful for testing. + + If unsure, say N. + config DM_INIT bool "DM \"dm-mod.create=\" parameter support" depends on BLK_DEV_DM=y diff --git a/drivers/md/Makefile b/drivers/md/Makefile index a52b703e588e..be7a6eb92abc 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o +obj-$(CONFIG_DM_DUST) += dm-dust.o obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c new file mode 100644 index 000000000000..178587bdc626 --- /dev/null +++ b/drivers/md/dm-dust.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Red Hat, Inc. + * + * This is a test "dust" device, which fails reads on specified + * sectors, emulating the behavior of a hard disk drive sending + * a "Read Medium Error" sense. 
+ * + */ + +#include +#include +#include + +#define DM_MSG_PREFIX "dust" + +struct badblock { + struct rb_node node; + sector_t bb; +}; + +struct dust_device { + struct dm_dev *dev; + struct rb_root badblocklist; + unsigned long long badblock_count; + spinlock_t dust_lock; + unsigned int blksz; + unsigned int sect_per_block; + sector_t start; + bool fail_read_on_bb:1; + bool quiet_mode:1; +}; + +static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct badblock *bblk = rb_entry(node, struct badblock, node); + + if (bblk->bb > blk) + node = node->rb_left; + else if (bblk->bb < blk) + node = node->rb_right; + else + return bblk; + } + + return NULL; +} + +static bool dust_rb_insert(struct rb_root *root, struct badblock *new) +{ + struct badblock *bblk; + struct rb_node **link = &root->rb_node, *parent = NULL; + sector_t value = new->bb; + + while (*link) { + parent = *link; + bblk = rb_entry(parent, struct badblock, node); + + if (bblk->bb > value) + link = &(*link)->rb_left; + else if (bblk->bb < value) + link = &(*link)->rb_right; + else + return false; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, root); + + return true; +} + +static int dust_remove_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + + if (bblock == NULL) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu not found in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + return -EINVAL; + } + + rb_erase(&bblock->node, &dd->badblocklist); + dd->badblock_count--; + if (!dd->quiet_mode) + DMINFO("%s: badblock removed at block %llu", __func__, block); + kfree(bblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_add_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + bblock = kmalloc(sizeof(*bblock), GFP_KERNEL); + if (bblock == NULL) { + if (!dd->quiet_mode) + DMERR("%s: badblock allocation failed", __func__); + return -ENOMEM; + } + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock->bb = block * dd->sect_per_block; + if (!dust_rb_insert(&dd->badblocklist, bblock)) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu already in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + kfree(bblock); + return -EINVAL; + } + + dd->badblock_count++; + if (!dd->quiet_mode) + DMINFO("%s: badblock added at block %llu", __func__, block); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_query_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + if (bblock != NULL) + DMINFO("%s: block %llu found in badblocklist", __func__, block); + else + DMINFO("%s: block %llu not found in badblocklist", __func__, block); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int __dust_map_read(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk) + return DM_MAPIO_KILL; + + return DM_MAPIO_REMAPPED; +} + +static int dust_map_read(struct dust_device *dd, sector_t thisblock, + 
bool fail_read_on_bb) +{ + unsigned long flags; + int ret = DM_MAPIO_REMAPPED; + + if (fail_read_on_bb) { + spin_lock_irqsave(&dd->dust_lock, flags); + ret = __dust_map_read(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return ret; +} + +static void __dust_map_write(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk) { + rb_erase(&bblk->node, &dd->badblocklist); + dd->badblock_count--; + kfree(bblk); + if (!dd->quiet_mode) { + sector_div(thisblock, dd->sect_per_block); + DMINFO("block %llu removed from badblocklist by write", + (unsigned long long)thisblock); + } + } +} + +static int dust_map_write(struct dust_device *dd, sector_t thisblock, + bool fail_read_on_bb) +{ + unsigned long flags; + + if (fail_read_on_bb) { + spin_lock_irqsave(&dd->dust_lock, flags); + __dust_map_write(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return DM_MAPIO_REMAPPED; +} + +static int dust_map(struct dm_target *ti, struct bio *bio) +{ + struct dust_device *dd = ti->private; + int ret; + + bio_set_dev(bio, dd->dev->bdev); + bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (bio_data_dir(bio) == READ) + ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + else + ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + + return ret; +} + +static bool __dust_clear_badblocks(struct rb_root *tree, + unsigned long long count) +{ + struct rb_node *node = NULL, *nnode = NULL; + + nnode = rb_first(tree); + if (nnode == NULL) { + BUG_ON(count != 0); + return false; + } + + while (nnode) { + node = nnode; + nnode = rb_next(node); + rb_erase(node, tree); + count--; + kfree(node); + } + BUG_ON(count != 0); + BUG_ON(tree->rb_node != NULL); + + return true; +} + +static int dust_clear_badblocks(struct dust_device *dd) +{ + unsigned long flags; + struct rb_root badblocklist; + unsigned long long badblock_count; + + spin_lock_irqsave(&dd->dust_lock, flags); + badblocklist = dd->badblocklist; + badblock_count = dd->badblock_count; + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_unlock_irqrestore(&dd->dust_lock, flags); + + if (!__dust_clear_badblocks(&badblocklist, badblock_count)) + DMINFO("%s: no badblocks found", __func__); + else + DMINFO("%s: badblocks cleared", __func__); + + return 0; +} + +/* + * Target parameters: + * + * + * + * device_path: path to the block device + * offset: offset to data area from start of device_path + * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2) + */ +static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dust_device *dd; + unsigned long long tmp; + char dummy; + unsigned int blksz; + unsigned int sect_per_block; + sector_t DUST_MAX_BLKSZ_SECTORS = 2097152; + sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS); + + if (argc != 3) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + if (kstrtouint(argv[2], 10, &blksz) || !blksz) { + ti->error = "Invalid block size parameter"; + return -EINVAL; + } + + if (blksz < 512) { + ti->error = "Block size must be at least 512"; + return -EINVAL; + } + + if (!is_power_of_2(blksz)) { + ti->error = "Block size must be a power of 2"; + return -EINVAL; + } + + if (to_sector(blksz) > max_block_sectors) { + ti->error = "Block size is too large"; + return -EINVAL; + } + + sect_per_block = (blksz >> SECTOR_SHIFT); + + if (sscanf(argv[1], "%llu%c", &tmp, 
&dummy) != 1 || tmp != (sector_t)tmp) { + ti->error = "Invalid device offset sector"; + return -EINVAL; + } + + dd = kzalloc(sizeof(struct dust_device), GFP_KERNEL); + if (dd == NULL) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) { + ti->error = "Device lookup failed"; + kfree(dd); + return -EINVAL; + } + + dd->sect_per_block = sect_per_block; + dd->blksz = blksz; + dd->start = tmp; + + /* + * Whether to fail a read on a "bad" block. + * Defaults to false; enabled later by message. + */ + dd->fail_read_on_bb = false; + + /* + * Initialize bad block list rbtree. + */ + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_lock_init(&dd->dust_lock); + + dd->quiet_mode = false; + + BUG_ON(dm_set_target_max_io_len(ti, dd->sect_per_block) != 0); + + ti->num_discard_bios = 1; + ti->num_flush_bios = 1; + ti->private = dd; + + return 0; +} + +static void dust_dtr(struct dm_target *ti) +{ + struct dust_device *dd = ti->private; + + __dust_clear_badblocks(&dd->badblocklist, dd->badblock_count); + dm_put_device(ti, dd->dev); + kfree(dd); +} + +static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result_buf, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; + bool invalid_msg = false; + int result = -EINVAL; + unsigned long long tmp, block; + unsigned long flags; + char dummy; + + if (argc == 1) { + if (!strcasecmp(argv[0], "addbadblock") || + !strcasecmp(argv[0], "removebadblock") || + !strcasecmp(argv[0], "queryblock")) { + DMERR("%s requires an additional argument", argv[0]); + } else if (!strcasecmp(argv[0], "disable")) { + DMINFO("disabling read failures on bad sectors"); + dd->fail_read_on_bb = false; + result = 0; + } else if (!strcasecmp(argv[0], "enable")) { + DMINFO("enabling read failures on bad sectors"); + dd->fail_read_on_bb = true; + result = 0; + } else if (!strcasecmp(argv[0], "countbadblocks")) { + spin_lock_irqsave(&dd->dust_lock, flags); + DMINFO("countbadblocks: %llu badblock(s) found", + dd->badblock_count); + spin_unlock_irqrestore(&dd->dust_lock, flags); + result = 0; + } else if (!strcasecmp(argv[0], "clearbadblocks")) { + result = dust_clear_badblocks(dd); + } else if (!strcasecmp(argv[0], "quiet")) { + if (!dd->quiet_mode) + dd->quiet_mode = true; + else + dd->quiet_mode = false; + result = 0; + } else { + invalid_msg = true; + } + } else if (argc == 2) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return result; + + block = tmp; + sector_div(size, dd->sect_per_block); + if (block > size || block < 0) { + DMERR("selected block value out of range"); + return result; + } + + if (!strcasecmp(argv[0], "addbadblock")) + result = dust_add_block(dd, block); + else if (!strcasecmp(argv[0], "removebadblock")) + result = dust_remove_block(dd, block); + else if (!strcasecmp(argv[0], "queryblock")) + result = dust_query_block(dd, block); + else + invalid_msg = true; + + } else + DMERR("invalid number of arguments '%d'", argc); + + if (invalid_msg) + DMERR("unrecognized message '%s' received", argv[0]); + + return result; +} + +static void dust_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %s %s", dd->dev->name, + dd->fail_read_on_bb ? 
"fail_read_on_bad_block" : "bypass", + dd->quiet_mode ? "quiet" : "verbose"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u", dd->dev->name, + (unsigned long long)dd->start, dd->blksz); + break; + } +} + +static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct dust_device *dd = ti->private; + struct dm_dev *dev = dd->dev; + + *bdev = dev->bdev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (dd->start || + ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + return 1; + + return 0; +} + +static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, + void *data) +{ + struct dust_device *dd = ti->private; + + return fn(ti, dd->dev, dd->start, ti->len, data); +} + +static struct target_type dust_target = { + .name = "dust", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = dust_ctr, + .dtr = dust_dtr, + .iterate_devices = dust_iterate_devices, + .map = dust_map, + .message = dust_message, + .status = dust_status, + .prepare_ioctl = dust_prepare_ioctl, +}; + +int __init dm_dust_init(void) +{ + int result = dm_register_target(&dust_target); + + if (result < 0) + DMERR("dm_register_target failed %d", result); + + return result; +} + +void __exit dm_dust_exit(void) +{ + dm_unregister_target(&dust_target); +} + +module_init(dm_dust_init); +module_exit(dm_dust_exit); + +MODULE_DESCRIPTION(DM_NAME " dust test target"); +MODULE_AUTHOR("Bryan Gurney "); +MODULE_LICENSE("GPL"); From 8e890c1ab1b1e0f765cd8da82c4dee011698a5e8 Mon Sep 17 00:00:00 2001 From: Helen Koike Date: Fri, 26 Apr 2019 17:09:55 -0300 Subject: [PATCH 23/42] dm init: fix max devices/targets checks dm-init should allow up to DM_MAX_{DEVICES,TARGETS} for devices/targets, and not DM_MAX_{DEVICES,TARGETS} - 1. Fix the checks and also fix the error message when the number of devices is surpassed. Fixes: 6bbc923dfcf57d ("dm: add support to directly boot to a mapped device") Cc: stable@vger.kernel.org Signed-off-by: Helen Koike Signed-off-by: Mike Snitzer --- drivers/md/dm-init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index 4b76f84424c3..352e803f566e 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -160,7 +160,7 @@ static int __init dm_parse_table(struct dm_device *dev, char *str) while (table_entry) { DMDEBUG("parsing table \"%s\"", str); - if (++dev->dmi.target_count >= DM_MAX_TARGETS) { + if (++dev->dmi.target_count > DM_MAX_TARGETS) { DMERR("too many targets %u > %d", dev->dmi.target_count, DM_MAX_TARGETS); return -EINVAL; @@ -242,9 +242,9 @@ static int __init dm_parse_devices(struct list_head *devices, char *str) return -ENOMEM; list_add_tail(&dev->list, devices); - if (++ndev >= DM_MAX_DEVICES) { - DMERR("too many targets %u > %d", - dev->dmi.target_count, DM_MAX_TARGETS); + if (++ndev > DM_MAX_DEVICES) { + DMERR("too many devices %lu > %d", + ndev, DM_MAX_DEVICES); return -EINVAL; } From 940bc471780b004a5277c1931f52af363c2fc9da Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 29 Apr 2019 11:48:15 +0200 Subject: [PATCH 24/42] dm mpath: always free attached_handler_name in parse_path() Commit b592211c33f7 ("dm mpath: fix attached_handler_name leak and dangling hw_handler_name pointer") fixed a memory leak for the case where setup_scsi_dh() returns failure. But setup_scsi_dh may return success and not "use" attached_handler_name if the retain_attached_hwhandler flag is not set on the map. 
As setup_scsi_dh() properly "steals" the pointer by nullifying it, freeing it unconditionally in parse_path() is safe. Fixes: b592211c33f7 ("dm mpath: fix attached_handler_name leak and dangling hw_handler_name pointer") Cc: stable@vger.kernel.org Reported-by: Yufen Yu Signed-off-by: Martin Wilck Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 07cfef67c0ba..dbcc1e41cd57 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -897,6 +897,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps if (attached_handler_name || m->hw_handler_name) { INIT_DELAYED_WORK(&p->activate_path, activate_path_work); r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error); + kfree(attached_handler_name); if (r) { dm_put_device(ti, p->path.dev); goto bad; @@ -911,7 +912,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps return p; bad: - kfree(attached_handler_name); free_pgpath(p); return ERR_PTR(r); } From cacddeab563be5850ec12ba6a1396e120f94a529 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 1 May 2019 13:57:17 +0100 Subject: [PATCH 25/42] dm dust: remove redundant unsigned comparison to less than zero Variable block is an unsigned long long, hence the less than zero comparison is always false; it is redundant and can be removed. Addresses-Coverity: ("Unsigned compared against 0") Signed-off-by: Colin Ian King Reviewed-by: Bryan Gurney Signed-off-by: Mike Snitzer --- drivers/md/dm-dust.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 178587bdc626..e739092bfc65 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -411,7 +411,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, block = tmp; sector_div(size, dd->sect_per_block); - if (block > size || block < 0) { + if (block > size) { DMERR("selected block value out of range"); return result; } From 9ccce5a0fb703bbae425d2f38513e30f62eee282 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 4 May 2019 18:30:36 +0800 Subject: [PATCH 26/42] dm dust: Make dm_dust_init and dm_dust_exit static Fix sparse warnings: drivers/md/dm-dust.c:495:12: warning: symbol 'dm_dust_init' was not declared. Should it be static? drivers/md/dm-dust.c:505:13: warning: symbol 'dm_dust_exit' was not declared. Should it be static?
Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Mike Snitzer --- drivers/md/dm-dust.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index e739092bfc65..845f376a72d9 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -492,7 +492,7 @@ static struct target_type dust_target = { .prepare_ioctl = dust_prepare_ioctl, }; -int __init dm_dust_init(void) +static int __init dm_dust_init(void) { int result = dm_register_target(&dust_target); @@ -502,7 +502,7 @@ int __init dm_dust_init(void) return result; } -void __exit dm_dust_exit(void) +static void __exit dm_dust_exit(void) { dm_unregister_target(&dust_target); } From 30bba430ddf737978e40561198693ba91386dac1 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 7 May 2019 14:28:35 -0400 Subject: [PATCH 27/42] dm integrity: correctly calculate the size of metadata area When we use separate devices for data and metadata, dm-integrity would incorrectly calculate the size of the metadata device as if it had 512-byte block size - and it would refuse activation with larger block size and smaller metadata device. Fix this so that it takes actual block size into account, which fixes the following reported issue: https://gitlab.com/cryptsetup/cryptsetup/issues/450 Fixes: 356d9d52e122 ("dm integrity: allow separate metadata device") Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 7c678f50aaa3..7848ef019880 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2568,7 +2568,7 @@ static int calculate_device_limits(struct dm_integrity_c *ic) if (last_sector < ic->start || last_sector >= ic->meta_device_sectors) return -EINVAL; } else { - __u64 meta_size = ic->provided_data_sectors * ic->tag_size; + __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1)) >> (ic->log2_buffer_sectors + SECTOR_SHIFT); meta_size <<= ic->log2_buffer_sectors; @@ -3439,7 +3439,7 @@ try_smaller_buffer: DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); DEBUG_print(" journal_entries %u\n", ic->journal_entries); DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); - DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors); + DEBUG_print(" data_device_sectors 0x%llx\n", (unsigned long long)ic->data_device_sectors); DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); From 97abfde17ae011525755f50242ed447ecebdbab5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 29 Apr 2019 14:57:17 +0200 Subject: [PATCH 28/42] dm integrity: don't check null pointer before kvfree and vfree The functions kfree, vfree and kvfree do nothing if we pass a NULL pointer to them. So we don't need to test the pointer for NULL. 
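The same guarantee exists for plain free() in ISO C, which allows a one-file illustration of why such guards are redundant (this is only an analogy; the kernel functions named above behave the same way for a NULL pointer):

#include <stdlib.h>

int main(void)
{
	char *p = NULL;

	if (p)		/* redundant: free(NULL) is defined to do nothing */
		free(p);

	free(p);	/* equivalent, with one branch fewer */
	return 0;
}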
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 7848ef019880..1e73422c04bc 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3548,10 +3548,8 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->writer_wq); if (ic->recalc_wq) destroy_workqueue(ic->recalc_wq); - if (ic->recalc_buffer) - vfree(ic->recalc_buffer); - if (ic->recalc_tags) - kvfree(ic->recalc_tags); + vfree(ic->recalc_buffer); + kvfree(ic->recalc_tags); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); mempool_exit(&ic->journal_io_mempool); From 893e3c395b2b7ea224c3d954bf9ba468745253f2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 29 Apr 2019 14:57:18 +0200 Subject: [PATCH 29/42] dm integrity: don't report unused options If we are not journaling, don't report journaling options in the table status. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 1e73422c04bc..1ff02683f6ec 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2468,10 +2468,12 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; watermark_percentage += ic->journal_entries / 2; do_div(watermark_percentage, ic->journal_entries); - arg_count = 5; + arg_count = 3; arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); + arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'J'; arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -2486,8 +2488,10 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); - DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); - DMEMIT(" commit_time:%u", ic->autocommit_msec); + if (ic->mode == 'J') { + DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); + DMEMIT(" commit_time:%u", ic->autocommit_msec); + } #define EMIT_ALG(a, n) \ do { \ From 88ad5d1eb147a73ad000c658dff0e5166819e6f2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 29 Apr 2019 14:57:23 +0200 Subject: [PATCH 30/42] dm integrity: update documentation Update documentation with the "meta_device" parameter and flags. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-integrity.txt | 10 +++++++++- drivers/md/dm-integrity.c | 4 +++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt index 297251b0d2d5..7dc9180cdeac 100644 --- a/Documentation/device-mapper/dm-integrity.txt +++ b/Documentation/device-mapper/dm-integrity.txt @@ -79,6 +79,10 @@ interleave_sectors:number a power of two. If the device is already formatted, the value from the superblock is used. +meta_device:device + Don't interleave the data and metadata on on device. Use a + separate device for metadata. 
+ buffer_sectors:number The number of sectors in one buffer. The value is rounded down to a power of two. @@ -167,7 +171,11 @@ The layout of the formatted block device: provides (i.e. the size of the device minus the size of all metadata and padding). The user of this target should not send bios that access data beyond the "provided data sectors" limit. - * flags - a flag is set if journal_mac is used + * flags + SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used + SB_FLAG_RECALCULATING - recalculating is in progress + * log2(sectors per block) + * a position where recalculating finished * journal The journal is divided into sections, each section contains: * metadata area (4kiB), it contains journal entries diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 1ff02683f6ec..4c3bc16d3750 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3081,10 +3081,12 @@ bad: * buffer_sectors * journal_watermark * commit_time + * meta_device + * block_size * internal_hash * journal_crypt * journal_mac - * block_size + * recalculate */ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) { From 981e8a980dc25a980188b157988d8651c03adc5c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 29 Apr 2019 14:57:19 +0200 Subject: [PATCH 31/42] dm integrity: introduce rw_journal_sectors() Introduce a function rw_journal_sectors() that takes sector and length as its arguments instead of a section and the number of sections. This functions will be used in further patches. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 4c3bc16d3750..8bc1849a6c8d 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -761,12 +761,12 @@ static void complete_journal_io(unsigned long error, void *context) complete_journal_op(comp); } -static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, - unsigned n_sections, struct journal_completion *comp) +static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags, + unsigned sector, unsigned n_sectors, struct journal_completion *comp) { struct dm_io_request io_req; struct dm_io_region io_loc; - unsigned sector, n_sectors, pl_index, pl_offset; + unsigned pl_index, pl_offset; int r; if (unlikely(dm_integrity_failed(ic))) { @@ -775,9 +775,6 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned return; } - sector = section * ic->journal_section_sectors; - n_sectors = n_sections * ic->journal_section_sectors; - pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); @@ -810,6 +807,17 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned } } +static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + unsigned sector, n_sectors; + + sector = section * ic->journal_section_sectors; + n_sectors = n_sections * ic->journal_section_sectors; + + rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp); +} + static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) { struct journal_completion io_comp; From d5027e0345c2f014e1328b53e8d86a293edf1caf Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 29 Apr 2019 
14:57:20 +0200 Subject: [PATCH 32/42] dm integrity: pass size to dm_integrity_alloc_page_list() Pass size to dm_integrity_alloc_page_list(). This is needed so following commits can pass a size that is different from ic->journal_pages. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 8bc1849a6c8d..ffd0d156c24e 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2677,37 +2677,37 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) blk_queue_max_integrity_segments(disk->queue, UINT_MAX); } -static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl) +static void dm_integrity_free_page_list(struct page_list *pl) { unsigned i; if (!pl) return; - for (i = 0; i < ic->journal_pages; i++) - if (pl[i].page) - __free_page(pl[i].page); + for (i = 0; pl[i].page; i++) + __free_page(pl[i].page); kvfree(pl); } -static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic) +static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages) { - size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list); struct page_list *pl; unsigned i; - pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO); + pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO); if (!pl) return NULL; - for (i = 0; i < ic->journal_pages; i++) { + for (i = 0; i < n_pages; i++) { pl[i].page = alloc_page(GFP_KERNEL); if (!pl[i].page) { - dm_integrity_free_page_list(ic, pl); + dm_integrity_free_page_list(pl); return NULL; } if (i) pl[i - 1].next = &pl[i]; } + pl[i].page = NULL; + pl[i].next = NULL; return pl; } @@ -2860,7 +2860,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) } ic->journal_pages = journal_pages; - ic->journal = dm_integrity_alloc_page_list(ic); + ic->journal = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal) { *error = "Could not allocate memory for journal"; r = -ENOMEM; @@ -2892,7 +2892,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) DEBUG_print("cipher %s, block size %u iv size %u\n", ic->journal_crypt_alg.alg_string, blocksize, ivsize); - ic->journal_io = dm_integrity_alloc_page_list(ic); + ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal_io) { *error = "Could not allocate memory for journal io"; r = -ENOMEM; @@ -2916,7 +2916,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) goto bad; } - ic->journal_xor = dm_integrity_alloc_page_list(ic); + ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal_xor) { *error = "Could not allocate memory for journal xor"; r = -ENOMEM; @@ -3573,9 +3573,9 @@ static void dm_integrity_dtr(struct dm_target *ti) dm_put_device(ti, ic->dev); if (ic->meta_dev) dm_put_device(ti, ic->meta_dev); - dm_integrity_free_page_list(ic, ic->journal); - dm_integrity_free_page_list(ic, ic->journal_io); - dm_integrity_free_page_list(ic, ic->journal_xor); + dm_integrity_free_page_list(ic->journal); + dm_integrity_free_page_list(ic->journal_io); + dm_integrity_free_page_list(ic->journal_xor); if (ic->journal_scatterlist) dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); if (ic->journal_io_scatterlist) From 4f43446ddff056df237a8ee9257ec94baeed909d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date:
Mon, 29 Apr 2019 14:57:21 +0200 Subject: [PATCH 33/42] dm integrity: allow large ranges to be described Change n_sectors data type from unsigned to sector_t. Following commits will need to lock large ranges. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ffd0d156c24e..0dcced588c42 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -246,7 +246,7 @@ struct dm_integrity_c { struct dm_integrity_range { sector_t logical_sector; - unsigned n_sectors; + sector_t n_sectors; bool waiting; union { struct rb_node node; @@ -1695,7 +1695,7 @@ retry: unsigned ws, we, range_sectors; dio->range.n_sectors = min(dio->range.n_sectors, - ic->free_sectors << ic->sb->log2_sectors_per_block); + (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block); if (unlikely(!dio->range.n_sectors)) { if (from_map) goto offload_to_thread; @@ -2153,7 +2153,7 @@ next_chunk: get_area_and_offset(ic, range.logical_sector, &area, &offset); range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); if (!ic->meta_dev) - range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); + range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); if (unlikely(!add_new_range(ic, &range, true))) wait_and_add_new_range(ic, &range); From 8b3bbd490d880db1377c71daf9c929c8446c8375 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 29 Apr 2019 14:57:22 +0200 Subject: [PATCH 34/42] dm integrity: introduce a function add_new_range_and_wait() Introduce a function add_new_range_and_wait() in order to avoid repetitive code. It will be used in the following commit. 
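For illustration only, this is the call-site pattern the new helper collapses; the names match the functions in the patch below, and the surrounding locking is shown only schematically:

	/* before: every caller open-codes the retry under endio_wait.lock */
	spin_lock_irq(&ic->endio_wait.lock);
	if (unlikely(!add_new_range(ic, &range, true)))
		wait_and_add_new_range(ic, &range);
	spin_unlock_irq(&ic->endio_wait.lock);

	/* after: callers invoke the helper while holding the same lock */
	spin_lock_irq(&ic->endio_wait.lock);
	add_new_range_and_wait(ic, &range);
	spin_unlock_irq(&ic->endio_wait.lock);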
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 0dcced588c42..fb8935d80842 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1001,6 +1001,12 @@ static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrit } while (unlikely(new_range->waiting)); } +static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + if (unlikely(!add_new_range(ic, new_range, true))) + wait_and_add_new_range(ic, new_range); +} + static void init_journal_node(struct journal_node *node) { RB_CLEAR_NODE(&node->node); @@ -1995,8 +2001,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; spin_lock_irq(&ic->endio_wait.lock); - if (unlikely(!add_new_range(ic, &io->range, true))) - wait_and_add_new_range(ic, &io->range); + add_new_range_and_wait(ic, &io->range); if (likely(!from_replay)) { struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; @@ -2155,8 +2160,7 @@ next_chunk: if (!ic->meta_dev) range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); - if (unlikely(!add_new_range(ic, &range, true))) - wait_and_add_new_range(ic, &range); + add_new_range_and_wait(ic, &range); spin_unlock_irq(&ic->endio_wait.lock); From 468dfca38b1a6fbdccd195d875599cb7c8875cd9 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 29 Apr 2019 14:57:24 +0200 Subject: [PATCH 35/42] dm integrity: add a bitmap mode Introduce an alternate mode of operation where dm-integrity uses a bitmap instead of a journal. If a bit in the bitmap is 1, the corresponding region's data and integrity tags are not synchronized - if the machine crashes, the unsynchronized regions will be recalculated. The bitmap mode is faster than the journal mode, because we don't have to write the data twice, but it is also less reliable, because if data corruption happens when the machine crashes, it may not be detected. Benchmark results for an SSD connected to a SATA300 port, when doing large linear writes with dd: buffered I/O: raw device throughput - 245MB/s dm-integrity with journaling - 120MB/s dm-integrity with bitmap - 238MB/s direct I/O with 1MB block size: raw device throughput - 248MB/s dm-integrity with journaling - 123MB/s dm-integrity with bitmap - 223MB/s For more info see dm-integrity in Documentation/device-mapper/ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-integrity.txt | 22 + drivers/md/dm-integrity.c | 534 +++++++++++++++++-- 2 files changed, 524 insertions(+), 32 deletions(-) diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt index 7dc9180cdeac..d63d78ffeb73 100644 --- a/Documentation/device-mapper/dm-integrity.txt +++ b/Documentation/device-mapper/dm-integrity.txt @@ -21,6 +21,13 @@ mode it calculates and verifies the integrity tag internally. In this mode, the dm-integrity target can be used to detect silent data corruption on the disk or in the I/O path. +There's an alternate mode of operation where dm-integrity uses bitmap +instead of a journal. 
If a bit in the bitmap is 1, the corresponding +region's data and integrity tags are not synchronized - if the machine +crashes, the unsynchronized regions will be recalculated. The bitmap mode +is faster than the journal mode, because we don't have to write the data +twice, but it is also less reliable, because if data corruption happens +when the machine crashes, it may not be detected. When loading the target for the first time, the kernel driver will format the device. But it will only format the device if the superblock contains @@ -59,6 +66,10 @@ Target arguments: either both data and tag or none of them are written. The journaled mode degrades write throughput twice because the data have to be written twice. + B - bitmap mode - data and metadata are written without any + synchronization, the driver maintains a bitmap of dirty + regions where data and metadata don't match. This mode can + only be used with internal hash. R - recovery mode - in this mode, journal is not replayed, checksums are not checked and writes to the device are not allowed. This mode is useful for data recovery if the @@ -150,6 +161,15 @@ block_size:number Supported values are 512, 1024, 2048 and 4096 bytes. If not specified the default block size is 512 bytes. +sectors_per_bit:number + In the bitmap mode, this parameter specifies the number of + 512-byte sectors that corresponds to one bitmap bit. + +bitmap_flush_interval:number + The bitmap flush interval in milliseconds. The metadata buffers + are synchronized when this interval expires. + + The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can be changed when reloading the target (load an inactive table and swap the tables with suspend and resume). The other arguments should not be changed @@ -174,6 +194,8 @@ The layout of the formatted block device: * flags SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used SB_FLAG_RECALCULATING - recalculating is in progress + SB_FLAG_DIRTY_BITMAP - journal area contains the bitmap of dirty + blocks * log2(sectors per block) * a position where recalculating finished * journal diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index fb8935d80842..54b3fe1403a8 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -24,6 +24,7 @@ #define DEFAULT_INTERLEAVE_SECTORS 32768 #define DEFAULT_JOURNAL_SIZE_FACTOR 7 +#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768 #define DEFAULT_BUFFER_SECTORS 128 #define DEFAULT_JOURNAL_WATERMARK 50 #define DEFAULT_SYNC_MSEC 10000 @@ -33,6 +34,8 @@ #define METADATA_WORKQUEUE_MAX_ACTIVE 16 #define RECALC_SECTORS 8192 #define RECALC_WRITE_SUPER 16 +#define BITMAP_BLOCK_SIZE 4096 /* don't change it */ +#define BITMAP_FLUSH_INTERVAL (10 * HZ) /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -48,6 +51,7 @@ #define SB_MAGIC "integrt" #define SB_VERSION_1 1 #define SB_VERSION_2 2 +#define SB_VERSION_3 3 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -60,12 +64,14 @@ struct superblock { __u64 provided_data_sectors; /* userspace uses this value */ __u32 flags; __u8 log2_sectors_per_block; - __u8 pad[3]; + __u8 log2_blocks_per_bitmap_bit; + __u8 pad[2]; __u64 recalc_sector; }; #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 +#define SB_FLAG_DIRTY_BITMAP 0x4 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -155,9 +161,16 @@ struct dm_integrity_c { struct workqueue_struct *metadata_wq; struct superblock *sb; unsigned journal_pages; + unsigned n_bitmap_blocks; + struct page_list *journal; struct page_list 
*journal_io; struct page_list *journal_xor; + struct page_list *recalc_bitmap; + struct page_list *may_write_bitmap; + struct bitmap_block_status *bbs; + unsigned bitmap_flush_interval; + struct delayed_work bitmap_flush_work; struct crypto_skcipher *journal_crypt; struct scatterlist **journal_scatterlist; @@ -184,6 +197,7 @@ struct dm_integrity_c { __s8 log2_metadata_run; __u8 log2_buffer_sectors; __u8 sectors_per_block; + __u8 log2_blocks_per_bitmap_bit; unsigned char mode; int suspending; @@ -236,6 +250,7 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; + bool recalculate_flag; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; @@ -292,6 +307,16 @@ struct journal_io { struct journal_completion *comp; }; +struct bitmap_block_status { + struct work_struct work; + struct dm_integrity_c *ic; + unsigned idx; + unsigned long *bitmap; + struct bio_list bio_queue; + spinlock_t bio_queue_lock; + +}; + static struct kmem_cache *journal_io_cache; #define JOURNAL_IO_MEMPOOL 32 @@ -427,7 +452,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) + ic->sb->version = SB_VERSION_3; + else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ic->sb->version = SB_VERSION_2; else ic->sb->version = SB_VERSION_1; @@ -451,6 +478,135 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) return dm_io(&io_req, 1, &io_loc, NULL); } +#define BITMAP_OP_TEST_ALL_SET 0 +#define BITMAP_OP_TEST_ALL_CLEAR 1 +#define BITMAP_OP_SET 2 +#define BITMAP_OP_CLEAR 3 + +static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, sector_t sector, sector_t n_sectors, int mode) +{ + unsigned long bit, end_bit, this_end_bit, page, end_page; + unsigned long *data; + + if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) { + DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)\n", + (unsigned long long)sector, + (unsigned long long)n_sectors, + ic->sb->log2_sectors_per_block, + ic->log2_blocks_per_bitmap_bit, + mode); + BUG(); + } + + if (unlikely(!n_sectors)) + return true; + + bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + end_bit = (sector + n_sectors - 1) >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + + page = bit / (PAGE_SIZE * 8); + bit %= PAGE_SIZE * 8; + + end_page = end_bit / (PAGE_SIZE * 8); + end_bit %= PAGE_SIZE * 8; + +repeat: + if (page < end_page) { + this_end_bit = PAGE_SIZE * 8 - 1; + } else { + this_end_bit = end_bit; + } + + data = lowmem_page_address(bitmap[page].page); + + if (mode == BITMAP_OP_TEST_ALL_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != -1) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (!test_bit(bit, data)) + return false; + bit++; + } + } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != 0) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (test_bit(bit, data)) + return false; + bit++; + } + } 
else if (mode == BITMAP_OP_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = -1; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __set_bit(bit, data); + bit++; + } + } else if (mode == BITMAP_OP_CLEAR) { + if (!bit && this_end_bit == PAGE_SIZE * 8 - 1) + clear_page(data); + else while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = 0; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __clear_bit(bit, data); + bit++; + } + } else { + BUG(); + } + + if (unlikely(page < end_page)) { + bit = 0; + page++; + goto repeat; + } + + return true; +} + +static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src) +{ + unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE); + unsigned i; + + for (i = 0; i < n_bitmap_pages; i++) { + unsigned long *dst_data = lowmem_page_address(dst[i].page); + unsigned long *src_data = lowmem_page_address(src[i].page); + copy_page(dst_data, src_data); + } +} + +static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector) +{ + unsigned bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8); + + BUG_ON(bitmap_block >= ic->n_bitmap_blocks); + return &ic->bbs[bitmap_block]; +} + static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, bool e, const char *function) { @@ -1784,6 +1940,20 @@ offload_to_thread: goto journal_read_write; } + if (ic->mode == 'B' && dio->write) { + if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + struct bitmap_block_status *bbs = sector_to_bitmap_block(ic, dio->range.logical_sector); + + spin_lock(&bbs->bio_queue_lock); + bio_list_add(&bbs->bio_queue, bio); + spin_unlock(&bbs->bio_queue_lock); + + queue_work(ic->writer_wq, &bbs->work); + + return; + } + } + dio->in_flight = (atomic_t)ATOMIC_INIT(2); if (need_sync_io) { @@ -1810,10 +1980,14 @@ offload_to_thread: if (need_sync_io) { wait_for_completion_io(&read_comp); - if (unlikely(ic->recalc_wq != NULL) && - ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector)) goto skip_check; + if (ic->mode == 'B') { + if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) + goto skip_check; + } + if (likely(!bio->bi_status)) integrity_metadata(&dio->work); else @@ -1851,8 +2025,22 @@ static void pad_uncommitted(struct dm_integrity_c *ic) wraparound_section(ic, &ic->free_section); ic->n_uncommitted_sections++; } - WARN_ON(ic->journal_sections * ic->journal_section_entries != - (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors); + if (WARN_ON(ic->journal_sections * ic->journal_section_entries != + (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors)) { + printk(KERN_CRIT "dm-integrity: " + "journal_sections %u, " + "journal_section_entries %u, " + "n_uncommitted_sections %u, " + "n_committed_sections %u, " + 
"journal_section_entries %u, " + "free_sectors %u\n", + ic->journal_sections, + ic->journal_section_entries, + ic->n_uncommitted_sections, + ic->n_committed_sections, + ic->journal_section_entries, + ic->free_sectors); + } } static void integrity_commit(struct work_struct *w) @@ -2139,11 +2327,14 @@ static void integrity_recalc(struct work_struct *w) sector_t area, offset; sector_t metadata_block; unsigned metadata_offset; + sector_t logical_sector, n_sectors; __u8 *t; unsigned i; int r; unsigned super_counter = 0; + DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector)); + spin_lock_irq(&ic->endio_wait.lock); next_chunk: @@ -2152,8 +2343,13 @@ next_chunk: goto unlock_ret; range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); - if (unlikely(range.logical_sector >= ic->provided_data_sectors)) + if (unlikely(range.logical_sector >= ic->provided_data_sectors)) { + if (ic->mode == 'B') { + DEBUG_print("queue_delayed_work: bitmap_flush_work\n"); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + } goto unlock_ret; + } get_area_and_offset(ic, range.logical_sector, &area, &offset); range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); @@ -2161,11 +2357,33 @@ next_chunk: range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); add_new_range_and_wait(ic, &range); - spin_unlock_irq(&ic->endio_wait.lock); + logical_sector = range.logical_sector; + n_sectors = range.n_sectors; + + if (ic->mode == 'B') { + if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) { + goto advance_and_next; + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + logical_sector += ic->sectors_per_block; + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block, ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + get_area_and_offset(ic, logical_sector, &area, &offset); + } + + DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors); if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { recalc_write_super(ic); + if (ic->mode == 'B') { + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); + } super_counter = 0; } @@ -2180,7 +2398,7 @@ next_chunk: io_req.client = ic->io; io_loc.bdev = ic->dev->bdev; io_loc.sector = get_data_sector(ic, area, offset); - io_loc.count = range.n_sectors; + io_loc.count = n_sectors; r = dm_io(&io_req, 1, &io_loc, NULL); if (unlikely(r)) { @@ -2189,8 +2407,8 @@ next_chunk: } t = ic->recalc_tags; - for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + for (i = 0; i < n_sectors; i += ic->sectors_per_block) { + integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); t += ic->tag_size; } @@ -2202,6 +2420,9 @@ next_chunk: goto err; } +advance_and_next: + cond_resched(); + spin_lock_irq(&ic->endio_wait.lock); remove_range_unlocked(ic, &range); ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors); @@ -2217,6 +2438,89 @@ unlock_ret: recalc_write_super(ic); } +static void bitmap_block_work(struct work_struct *w) +{ + struct bitmap_block_status *bbs = container_of(w, struct 
bitmap_block_status, work); + struct dm_integrity_c *ic = bbs->ic; + struct bio *bio; + struct bio_list bio_queue; + struct bio_list waiting; + + bio_list_init(&waiting); + + spin_lock(&bbs->bio_queue_lock); + bio_queue = bbs->bio_queue; + bio_list_init(&bbs->bio_queue); + spin_unlock(&bbs->bio_queue_lock); + + while ((bio = bio_list_pop(&bio_queue))) { + struct dm_integrity_io *dio; + + dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + } else { + block_bitmap_op(ic, ic->journal, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_SET); + bio_list_add(&waiting, bio); + } + } + + if (bio_list_empty(&waiting)) + return; + + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); + + while ((bio = bio_list_pop(&waiting))) { + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_SET); + + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + } + + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); +} + +static void bitmap_flush_work(struct work_struct *work) +{ + struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work); + struct dm_integrity_range range; + unsigned long limit; + + dm_integrity_flush_buffers(ic); + + range.logical_sector = 0; + range.n_sectors = ic->provided_data_sectors; + + spin_lock_irq(&ic->endio_wait.lock); + add_new_range_and_wait(ic, &range); + spin_unlock_irq(&ic->endio_wait.lock); + + dm_integrity_flush_buffers(ic); + if (ic->meta_dev) + blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL); + + limit = ic->provided_data_sectors; + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + limit = le64_to_cpu(ic->sb->recalc_sector) + >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit) + << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + } + DEBUG_print("zeroing journal\n"); + block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR); + + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + + remove_range(ic, &range); +} + + static void init_journal(struct dm_integrity_c *ic, unsigned start_section, unsigned n_sections, unsigned char commit_seq) { @@ -2416,6 +2720,7 @@ clear_journal: static void dm_integrity_postsuspend(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + int r; del_timer_sync(&ic->autocommit_timer); @@ -2424,6 +2729,9 @@ static void dm_integrity_postsuspend(struct dm_target *ti) if (ic->recalc_wq) drain_workqueue(ic->recalc_wq); + if (ic->mode == 'B') + cancel_delayed_work_sync(&ic->bitmap_flush_work); + queue_work(ic->commit_wq, &ic->commit_work); drain_workqueue(ic->commit_wq); @@ -2434,6 +2742,17 @@ static void dm_integrity_postsuspend(struct dm_target *ti) dm_integrity_flush_buffers(ic); } + if (ic->mode == 'B') { + dm_integrity_flush_buffers(ic); +#if 1 + init_journal(ic, 0, ic->journal_sections, 0); + ic->sb->flags 
&= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); +#endif + } + WRITE_ONCE(ic->suspending, 0); BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); @@ -2444,11 +2763,65 @@ static void dm_integrity_postsuspend(struct dm_target *ti) static void dm_integrity_resume(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + int r; + DEBUG_print("resume\n"); - replay_journal(ic); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) { + DEBUG_print("resume dirty_bitmap\n"); + rw_journal_sectors(ic, REQ_OP_READ, 0, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + if (ic->mode == 'B') { + if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { + block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal); + block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal); + if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR)) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n", ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit && + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + init_journal(ic, 0, ic->journal_sections, 0); + replay_journal(ic); + ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + } + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + } else { + replay_journal(ic); + if (ic->mode == 'B') { + int mode; + ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); - if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + mode = ic->recalculate_flag ? 
BITMAP_OP_SET : BITMAP_OP_CLEAR; + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + } + } + + DEBUG_print("testing recalc: %x\n", ic->sb->flags); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); + DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors); if (recalc_pos < ic->provided_data_sectors) { queue_work(ic->recalc_wq, &ic->recalc_work); } else if (recalc_pos > ic->provided_data_sectors) { @@ -2486,6 +2859,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); arg_count += ic->mode == 'J'; arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'B'; + arg_count += ic->mode == 'B'; arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -2495,7 +2870,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" meta_device:%s", ic->meta_dev->name); if (ic->sectors_per_block != 1) DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); - if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + if (ic->recalculate_flag) DMEMIT(" recalculate"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); @@ -2504,6 +2879,10 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); } + if (ic->mode == 'B') { + DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); + DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); + } #define EMIT_ALG(a, n) \ do { \ @@ -3085,7 +3464,7 @@ bad: * device * offset from the start of the device * tag size - * D - direct writes, J - journal writes, R - recovery mode + * D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode * number of optional arguments * optional arguments: * journal_sectors @@ -3095,6 +3474,8 @@ bad: * commit_time * meta_device * block_size + * sectors_per_bit + * bitmap_flush_interval * internal_hash * journal_crypt * journal_mac @@ -3111,10 +3492,13 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) {0, 9, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; - bool recalculate; bool should_write_sb; __u64 threshold; unsigned long long start; + __s8 log2_sectors_per_bitmap_bit = -1; + __s8 log2_blocks_per_bitmap_bit; + __u64 bits_in_journal; + __u64 n_bitmap_bits; #define DIRECT_ARGUMENTS 4 @@ -3138,6 +3522,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) init_waitqueue_head(&ic->copy_to_journal_wait); init_completion(&ic->crypto_backoff); atomic64_set(&ic->number_of_mismatches, 0); + ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL; r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); if (r) { @@ -3160,10 +3545,10 @@ static int dm_integrity_ctr(struct 
dm_target *ti, unsigned argc, char **argv) } } - if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) { ic->mode = argv[3][0]; - else { - ti->error = "Invalid mode (expecting J, D, R)"; + } else { + ti->error = "Invalid mode (expecting J, B, D, R)"; r = -EINVAL; goto bad; } @@ -3173,7 +3558,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) buffer_sectors = DEFAULT_BUFFER_SECTORS; journal_watermark = DEFAULT_JOURNAL_WATERMARK; sync_msec = DEFAULT_SYNC_MSEC; - recalculate = false; ic->sectors_per_block = 1; as.argc = argc - DIRECT_ARGUMENTS; @@ -3185,6 +3569,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) while (extra_args--) { const char *opt_string; unsigned val; + unsigned long long llval; opt_string = dm_shift_arg(&as); if (!opt_string) { r = -EINVAL; @@ -3220,6 +3605,14 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } ic->sectors_per_block = val >> SECTOR_SHIFT; + } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) { + log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval); + } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) { + if (val >= (uint64_t)UINT_MAX * 1000 / HZ) { + r = -EINVAL; + ti->error = "Invalid bitmap_flush_interval argument"; + } + ic->bitmap_flush_interval = msecs_to_jiffies(val); } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, "Invalid internal_hash argument"); @@ -3236,7 +3629,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto bad; } else if (!strcmp(opt_string, "recalculate")) { - recalculate = true; + ic->recalculate_flag = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3287,6 +3680,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) else ic->log2_tag_size = -1; + if (ic->mode == 'B' && !ic->internal_hash) { + r = -EINVAL; + ti->error = "Bitmap mode can be only used with internal hash"; + goto bad; + } + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); ic->autocommit_msec = sync_msec; timer_setup(&ic->autocommit_timer, autocommit_fn, 0); @@ -3332,7 +3731,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } INIT_WORK(&ic->commit_work, integrity_commit); - if (ic->mode == 'J') { + if (ic->mode == 'J' || ic->mode == 'B') { ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); if (!ic->writer_wq) { ti->error = "Cannot allocate workqueue"; @@ -3373,7 +3772,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_2) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_3) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -3433,6 +3832,27 @@ try_smaller_buffer: ti->error = "The device is too small"; goto bad; } + + if (log2_sectors_per_bitmap_bit < 0) + log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT); + if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block) + log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block; + + bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3); + if (bits_in_journal > UINT_MAX) + bits_in_journal = 
UINT_MAX; + while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit) + log2_sectors_per_bitmap_bit++; + + log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block; + ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + if (should_write_sb) { + ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + } + n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) + + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit; + ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8); + if (!ic->meta_dev) ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run)); @@ -3457,25 +3877,21 @@ try_smaller_buffer: DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); DEBUG_print(" journal_entries %u\n", ic->journal_entries); DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); - DEBUG_print(" data_device_sectors 0x%llx\n", (unsigned long long)ic->data_device_sectors); + DEBUG_print(" data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT); DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, (unsigned long long)ic->provided_data_sectors); DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); + DEBUG_print(" bits_in_journal %llu\n", (unsigned long long)bits_in_journal); - if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { + if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); ic->sb->recalc_sector = cpu_to_le64(0); } - if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { - if (!ic->internal_hash) { - r = -EINVAL; - ti->error = "Recalculate is only valid with internal hash"; - goto bad; - } + if (ic->internal_hash) { ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1); if (!ic->recalc_wq ) { ti->error = "Cannot allocate workqueue"; @@ -3512,6 +3928,45 @@ try_smaller_buffer: r = create_journal(ic, &ti->error); if (r) goto bad; + + } + + if (ic->mode == 'B') { + unsigned i; + unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE); + + ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); + if (!ic->recalc_bitmap) { + r = -ENOMEM; + goto bad; + } + ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); + if (!ic->may_write_bitmap) { + r = -ENOMEM; + goto bad; + } + ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); + if (!ic->bbs) { + r = -ENOMEM; + goto bad; + } + INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work); + for (i = 0; i < ic->n_bitmap_blocks; i++) { + struct bitmap_block_status *bbs = &ic->bbs[i]; + unsigned sector, pl_index, pl_offset; + + INIT_WORK(&bbs->work, bitmap_block_work); + bbs->ic = ic; + bbs->idx = i; + bio_list_init(&bbs->bio_queue); + spin_lock_init(&bbs->bio_queue_lock); + + sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT); + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + bbs->bitmap = 
lowmem_page_address(ic->journal[pl_index].page) + pl_offset; + } } if (should_write_sb) { @@ -3536,6 +3991,17 @@ try_smaller_buffer: if (r) goto bad; } + if (ic->mode == 'B') { + unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8); + if (!max_io_len) + max_io_len = 1U << 31; + DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len); + if (!ti->max_io_len || ti->max_io_len > max_io_len) { + r = dm_set_target_max_io_len(ti, max_io_len); + if (r) + goto bad; + } + } if (!ic->internal_hash) dm_integrity_set(ti, ic); @@ -3544,6 +4010,7 @@ try_smaller_buffer: ti->flush_supported = true; return 0; + bad: dm_integrity_dtr(ti); return r; @@ -3568,6 +4035,7 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->recalc_wq); vfree(ic->recalc_buffer); kvfree(ic->recalc_tags); + kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); mempool_exit(&ic->journal_io_mempool); @@ -3580,6 +4048,8 @@ static void dm_integrity_dtr(struct dm_target *ti) dm_integrity_free_page_list(ic->journal); dm_integrity_free_page_list(ic->journal_io); dm_integrity_free_page_list(ic->journal_xor); + dm_integrity_free_page_list(ic->recalc_bitmap); + dm_integrity_free_page_list(ic->may_write_bitmap); if (ic->journal_scatterlist) dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); if (ic->journal_io_scatterlist) @@ -3617,7 +4087,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, From 1f5a77591b13a302b60db0dcda57940f3e5d5214 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 29 Apr 2019 14:57:25 +0200 Subject: [PATCH 36/42] dm integrity: handle machine reboot in bitmap mode When in bitmap mode the bitmap must be cleared when rebooting. This commit adds the reboot hook. 
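Condensed from the patch below (registration and error handling omitted), the hook is a standard reboot notifier from <linux/reboot.h> whose callback forces one final bitmap flush before the machine goes down:

	static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x)
	{
		struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier);

		if (ic->mode == 'B') {
			/* push the dirty bitmap out so a clean bitmap is on disk at reboot */
			cancel_delayed_work_sync(&ic->bitmap_flush_work);
			queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
			flush_workqueue(ic->commit_wq);
		}
		return NOTIFY_DONE;
	}

The notifier is registered in dm_integrity_resume() with priority INT_MAX - 1, so it runs after md but before hardware drivers, and it is unregistered again in dm_integrity_postsuspend().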
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 54b3fe1403a8..42be03bbfafa 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -15,6 +15,7 @@ #include #include #include +#include <linux/reboot.h> #include #include #include @@ -257,6 +258,8 @@ struct dm_integrity_c { struct alg_spec journal_mac_alg; atomic64_t number_of_mismatches; + + struct notifier_block reboot_notifier; }; struct dm_integrity_range { @@ -2717,11 +2720,27 @@ clear_journal: init_journal_node(&ic->journal_tree[i]); } +static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier); + + if (ic->mode == 'B') { + DEBUG_print("dm_integrity_reboot\n"); + cancel_delayed_work_sync(&ic->bitmap_flush_work); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + flush_workqueue(ic->commit_wq); + } + + return NOTIFY_DONE; +} + static void dm_integrity_postsuspend(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; int r; + WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier)); + del_timer_sync(&ic->autocommit_timer); WRITE_ONCE(ic->suspending, 1); @@ -2829,6 +2848,11 @@ static void dm_integrity_resume(struct dm_target *ti) recalc_write_super(ic); } } + + ic->reboot_notifier.notifier_call = dm_integrity_reboot; + ic->reboot_notifier.next = NULL; + ic->reboot_notifier.priority = INT_MAX - 1; /* be notified after md and before hardware drivers */ + WARN_ON(register_reboot_notifier(&ic->reboot_notifier)); } static void dm_integrity_status(struct dm_target *ti, status_type_t type, From 482714932ecee063884b5d0ceddadbfafe89ae2b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 29 Apr 2019 14:57:26 +0200 Subject: [PATCH 37/42] dm integrity: implement synchronous mode for reboot handling Unfortunately, there may be bios coming even after the reboot notifier was called. We don't want these bios to make the bitmap dirty again. To address this, implement a synchronous mode - when a bio is about to be terminated, we clean the bitmap and terminate the bio after the clean operation succeeds. This obviously slows down bio processing, but it makes sure that when all bios are finished, the bitmap will be clean.
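The core of the change, condensed from the patch below: once synchronous_mode is set, do_endio() defers completion of writes onto a list and kicks an immediate bitmap flush, and bitmap_flush_work() ends the queued bios only after the on-disk bitmap has been cleared:

	/* sketch of the new path in do_endio() */
	if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) {
		unsigned long flags;

		spin_lock_irqsave(&ic->endio_wait.lock, flags);
		bio_list_add(&ic->synchronous_bios, bio);
		queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
		spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
		return;	/* the bio is ended later, from bitmap_flush_work() */
	}
	bio_endio(bio);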
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 43 ++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 42be03bbfafa..1968b0b1b280 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -171,6 +171,8 @@ struct dm_integrity_c { struct page_list *may_write_bitmap; struct bitmap_block_status *bbs; unsigned bitmap_flush_interval; + int synchronous_mode; + struct bio_list synchronous_bios; struct delayed_work bitmap_flush_work; struct crypto_skcipher *journal_crypt; @@ -1382,6 +1384,14 @@ static void do_endio(struct dm_integrity_c *ic, struct bio *bio) int r = dm_integrity_failed(ic); if (unlikely(r) && !bio->bi_status) bio->bi_status = errno_to_blk_status(r); + if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) { + unsigned long flags; + spin_lock_irqsave(&ic->endio_wait.lock, flags); + bio_list_add(&ic->synchronous_bios, bio); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); + return; + } bio_endio(bio); } @@ -2494,6 +2504,7 @@ static void bitmap_flush_work(struct work_struct *work) struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work); struct dm_integrity_range range; unsigned long limit; + struct bio *bio; dm_integrity_flush_buffers(ic); @@ -2514,13 +2525,20 @@ static void bitmap_flush_work(struct work_struct *work) >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit) << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); } - DEBUG_print("zeroing journal\n"); + /*DEBUG_print("zeroing journal\n");*/ block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR); block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR); rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); - remove_range(ic, &range); + spin_lock_irq(&ic->endio_wait.lock); + remove_range_unlocked(ic, &range); + while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) { + bio_endio(bio); + spin_unlock_irq(&ic->endio_wait.lock); + spin_lock_irq(&ic->endio_wait.lock); + } + spin_unlock_irq(&ic->endio_wait.lock); } @@ -2720,16 +2738,27 @@ clear_journal: init_journal_node(&ic->journal_tree[i]); } -static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x) +static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic) { - struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier); + DEBUG_print("dm_integrity_enter_synchronous_mode\n"); if (ic->mode == 'B') { - DEBUG_print("dm_integrity_reboot\n"); + ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1; + ic->synchronous_mode = 1; + cancel_delayed_work_sync(&ic->bitmap_flush_work); queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); flush_workqueue(ic->commit_wq); } +} + +static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier); + + DEBUG_print("dm_integrity_reboot\n"); + + dm_integrity_enter_synchronous_mode(ic); return NOTIFY_DONE; } @@ -2853,6 +2882,10 @@ static void dm_integrity_resume(struct dm_target *ti) ic->reboot_notifier.next = NULL; ic->reboot_notifier.priority = INT_MAX - 1; /* be notified after md and before hardware drivers */ 
WARN_ON(register_reboot_notifier(&ic->reboot_notifier)); + +#if 0 + dm_integrity_enter_synchronous_mode(ic); +#endif } static void dm_integrity_status(struct dm_target *ti, status_type_t type, From 05d6909ea9d62bb357846177a84842e09fc15914 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 9 May 2019 15:25:49 -0400 Subject: [PATCH 38/42] dm integrity: whitespace, coding style and dead code cleanup Just some things that stood out like a sore thumb. Also, converted some printk(KERN_CRIT, ...) to DMCRIT(...) Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 104 ++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 43 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 1968b0b1b280..9af98a990079 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -488,13 +488,14 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) #define BITMAP_OP_SET 2 #define BITMAP_OP_CLEAR 3 -static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, sector_t sector, sector_t n_sectors, int mode) +static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, + sector_t sector, sector_t n_sectors, int mode) { unsigned long bit, end_bit, this_end_bit, page, end_page; unsigned long *data; if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) { - DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)\n", + DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)", (unsigned long long)sector, (unsigned long long)n_sectors, ic->sb->log2_sectors_per_block, @@ -507,7 +508,8 @@ static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, return true; bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); - end_bit = (sector + n_sectors - 1) >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + end_bit = (sector + n_sectors - 1) >> + (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); page = bit / (PAGE_SIZE * 8); bit %= PAGE_SIZE * 8; @@ -620,8 +622,8 @@ static void access_journal_check(struct dm_integrity_c *ic, unsigned section, un if (unlikely(section >= ic->journal_sections) || unlikely(offset >= limit)) { - printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", - function, section, offset, ic->journal_sections, limit); + DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)", + function, section, offset, ic->journal_sections, limit); BUG(); } #endif @@ -1666,7 +1668,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) else wanted_tag_size *= ic->tag_size; if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { - DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); + DMERR("Invalid integrity data size %u, expected %u", + bip->bip_iter.bi_size, wanted_tag_size); return DM_MAPIO_KILL; } } @@ -1954,15 +1957,15 @@ offload_to_thread: } if (ic->mode == 'B' && dio->write) { - if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { - struct bitmap_block_status *bbs = sector_to_bitmap_block(ic, dio->range.logical_sector); + if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + struct bitmap_block_status *bbs; + bbs = sector_to_bitmap_block(ic, dio->range.logical_sector); spin_lock(&bbs->bio_queue_lock); bio_list_add(&bbs->bio_queue, bio); spin_unlock(&bbs->bio_queue_lock); - 
queue_work(ic->writer_wq, &bbs->work); - return; } } @@ -1997,7 +2000,8 @@ offload_to_thread: dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector)) goto skip_check; if (ic->mode == 'B') { - if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) + if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) goto skip_check; } @@ -2039,20 +2043,14 @@ static void pad_uncommitted(struct dm_integrity_c *ic) ic->n_uncommitted_sections++; } if (WARN_ON(ic->journal_sections * ic->journal_section_entries != - (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors)) { - printk(KERN_CRIT "dm-integrity: " - "journal_sections %u, " - "journal_section_entries %u, " - "n_uncommitted_sections %u, " - "n_committed_sections %u, " - "journal_section_entries %u, " - "free_sectors %u\n", - ic->journal_sections, - ic->journal_section_entries, - ic->n_uncommitted_sections, - ic->n_committed_sections, - ic->journal_section_entries, - ic->free_sectors); + (ic->n_uncommitted_sections + ic->n_committed_sections) * + ic->journal_section_entries + ic->free_sectors)) { + DMCRIT("journal_sections %u, journal_section_entries %u, " + "n_uncommitted_sections %u, n_committed_sections %u, " + "journal_section_entries %u, free_sectors %u", + ic->journal_sections, ic->journal_section_entries, + ic->n_uncommitted_sections, ic->n_committed_sections, + ic->journal_section_entries, ic->free_sectors); } } @@ -2378,12 +2376,14 @@ next_chunk: if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) { goto advance_and_next; } - while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { logical_sector += ic->sectors_per_block; n_sectors -= ic->sectors_per_block; cond_resched(); } - while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block, ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { n_sectors -= ic->sectors_per_block; cond_resched(); } @@ -2471,12 +2471,14 @@ static void bitmap_block_work(struct work_struct *w) dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); - if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { remove_range(ic, &dio->range); INIT_WORK(&dio->work, integrity_bio_wait); queue_work(ic->wait_wq, &dio->work); } else { - block_bitmap_op(ic, ic->journal, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->journal, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); bio_list_add(&waiting, bio); } } @@ -2484,12 +2486,15 @@ static void bitmap_block_work(struct work_struct *w) if (bio_list_empty(&waiting)) return; - rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, + bbs->idx * (BITMAP_BLOCK_SIZE >> 
SECTOR_SHIFT), + BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); while ((bio = bio_list_pop(&waiting))) { struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); - block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); remove_range(ic, &dio->range); INIT_WORK(&dio->work, integrity_bio_wait); @@ -2529,7 +2534,8 @@ static void bitmap_flush_work(struct work_struct *work) block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR); block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR); - rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); spin_lock_irq(&ic->endio_wait.lock); remove_range_unlocked(ic, &range); @@ -2793,6 +2799,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti) if (ic->mode == 'B') { dm_integrity_flush_buffers(ic); #if 1 + /* set to 0 to test bitmap replay code */ init_journal(ic, 0, ic->journal_sections, 0); ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); @@ -2816,22 +2823,26 @@ static void dm_integrity_resume(struct dm_target *ti) if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) { DEBUG_print("resume dirty_bitmap\n"); - rw_journal_sectors(ic, REQ_OP_READ, 0, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + rw_journal_sectors(ic, REQ_OP_READ, 0, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); if (ic->mode == 'B') { if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal); block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal); - if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR)) { + if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, + BITMAP_OP_TEST_ALL_CLEAR)) { ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); ic->sb->recalc_sector = cpu_to_le64(0); } } else { - DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n", ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit); + DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n", + ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit); ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET); - rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); ic->sb->recalc_sector = cpu_to_le64(0); } @@ -2862,7 +2873,8 @@ static void dm_integrity_resume(struct dm_target *ti) block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode); block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode); block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode); - rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 
0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); } } @@ -2884,6 +2896,7 @@ static void dm_integrity_resume(struct dm_target *ti) WARN_ON(register_reboot_notifier(&ic->reboot_notifier)); #if 0 + /* set to 1 to stress test synchronous mode */ dm_integrity_enter_synchronous_mode(ic); #endif } @@ -3160,7 +3173,8 @@ static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, str kvfree(sl); } -static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl) +static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, + struct page_list *pl) { struct scatterlist **sl; unsigned i; @@ -3179,7 +3193,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int unsigned idx; page_list_location(ic, i, 0, &start_index, &start_offset); - page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset); + page_list_location(ic, i, ic->journal_section_sectors - 1, + &end_index, &end_offset); n_pages = (end_index - start_index + 1); @@ -3380,7 +3395,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error) sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); memset(crypt_iv, 0x00, ivsize); - skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); + skcipher_request_set_crypt(req, sg, sg, + PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); init_completion(&comp.comp); comp.in_flight = (atomic_t)ATOMIC_INIT(1); if (do_crypt(true, req, &comp)) @@ -3602,7 +3618,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } } - if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) { + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || + !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) { ic->mode = argv[3][0]; } else { ti->error = "Invalid mode (expecting J, B, D, R)"; @@ -3648,7 +3665,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) dm_put_device(ti, ic->meta_dev); ic->meta_dev = NULL; } - r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev); + r = dm_get_device(ti, strchr(opt_string, ':') + 1, + dm_table_get_mode(ti->table), &ic->meta_dev); if (r) { ti->error = "Device lookup failed"; goto bad; @@ -3702,7 +3720,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (!journal_sectors) { journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, - ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); } if (!buffer_sectors) From 0f41fcf78849c902ddca564f99a8e23ccfc80333 Mon Sep 17 00:00:00 2001 From: Helen Koike Date: Wed, 15 May 2019 13:50:54 -0300 Subject: [PATCH 39/42] dm ioctl: fix hang in early create error condition The dm_early_create() function (which deals with "dm-mod.create=" kernel command line option) calls dm_hash_insert() who gets an extra reference to the md object. In case of failure, this reference wasn't being released, causing dm_destroy() to hang, thus hanging the whole boot process. Fix this by calling __hash_remove() in the error path. 
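To make the reference flow easier to follow, here is a minimal, self-contained illustration of the
pattern being fixed (plain userspace C with made-up names, not the dm code itself): a registry that
takes its own reference on insert must have that reference dropped on the error path, otherwise the
final teardown waits for a count that can never reach zero, which is the boot hang described above.

    #include <stdlib.h>

    struct obj {
            int refcount;
    };

    static void obj_get(struct obj *o)
    {
            o->refcount++;
    }

    static void obj_put(struct obj *o)
    {
            if (--o->refcount == 0)
                    free(o);
    }

    /* Stands in for dm_hash_insert(): the registry takes its own reference. */
    static int registry_insert(struct obj *o)
    {
            obj_get(o);
            return 0;
    }

    /* Stands in for __hash_remove(): drops the registry's reference. */
    static void registry_remove(struct obj *o)
    {
            obj_put(o);
    }

    static int early_create(void)
    {
            struct obj *o;
            int r;

            o = calloc(1, sizeof(*o));
            if (!o)
                    return -1;
            obj_get(o);                     /* creator's reference */

            r = registry_insert(o);         /* registry now holds a second reference */
            if (r)
                    goto err_put;

            r = -1;                         /* pretend a later step (table create) failed */
            if (r)
                    goto err_registry_remove;

            return 0;

    err_registry_remove:
            registry_remove(o);             /* the previously missing step: drop the registry's ref */
    err_put:
            obj_put(o);                     /* drop the creator's ref; count reaches zero */
            return r;
    }

    int main(void)
    {
            return early_create() == -1 ? 0 : 1;
    }

In dm_early_create() the corresponding drop is the __hash_remove()/dm_put() pair added by the hunk
below.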
Fixes: 6bbc923dfcf57d ("dm: add support to directly boot to a mapped device") Cc: stable@vger.kernel.org Signed-off-by: Helen Koike Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index c740153b4e52..1e03bc89e20f 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -2069,7 +2069,7 @@ int __init dm_early_create(struct dm_ioctl *dmi, /* alloc table */ r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md); if (r) - goto err_destroy_dm; + goto err_hash_remove; /* add targets */ for (i = 0; i < dmi->target_count; i++) { @@ -2116,6 +2116,10 @@ int __init dm_early_create(struct dm_ioctl *dmi, err_destroy_table: dm_table_destroy(t); +err_hash_remove: + (void) __hash_remove(__get_name_cell(dmi->name)); + /* release reference from __get_name_cell */ + dm_put(md); err_destroy_dm: dm_put(md); dm_destroy(md); From 7a1cd7238fde6ab367384a4a2998cba48330c398 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 15 May 2019 16:23:43 +0200 Subject: [PATCH 40/42] dm crypt: move detailed message into debug level The information about tag size should not be printed without debug info set. Also print device major:minor in the error message to identify the device instance. Also use rate limiting and debug level for info about used crypto API implementaton. This is important because during online reencryption the existing message saturates syslog (because we are moving hotzone across the whole device). Cc: stable@vger.kernel.org Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 692cddf3fe2a..3b1a786d65c2 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -949,6 +949,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) { #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); + struct mapped_device *md = dm_table_get_md(ti->table); /* From now we require underlying device with our integrity profile */ if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { @@ -968,7 +969,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) if (crypt_integrity_aead(cc)) { cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; - DMINFO("Integrity AEAD, tag size %u, IV size %u.", + DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md), cc->integrity_tag_size, cc->integrity_iv_size); if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) { @@ -976,7 +977,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) return -EINVAL; } } else if (cc->integrity_iv_size) - DMINFO("Additional per-sector space %u bytes for IV.", + DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md), cc->integrity_iv_size); if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { @@ -1891,7 +1892,7 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) * algorithm implementation is used. Help people debug performance * problems by logging the ->cra_driver_name. 
*/ - DMINFO("%s using implementation \"%s\"", ciphermode, + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name); return 0; } @@ -1911,7 +1912,7 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) return err; } - DMINFO("%s using implementation \"%s\"", ciphermode, + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name); return 0; } From f710126cfc89c8df477002a26dee8407eb0b4acd Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 15 May 2019 16:22:30 +0200 Subject: [PATCH 41/42] dm crypt: print device name in integrity error message This message should better identify the DM device with the integrity failure. Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3b1a786d65c2..3fd8253ea2df 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1147,9 +1147,11 @@ static int crypt_convert_block_aead(struct crypt_config *cc, r = crypto_aead_decrypt(req); } - if (r == -EBADMSG) - DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + if (r == -EBADMSG) { + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b), (unsigned long long)le64_to_cpu(*sector)); + } if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) r = cc->iv_gen_ops->post(cc, org_iv, dmreq); @@ -1793,7 +1795,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); if (error == -EBADMSG) { - DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b), (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); io->error = BLK_STS_PROTECTION; } else if (error < 0) From 8454fca4f53bbe5e0a71613192674c8ce5c52318 Mon Sep 17 00:00:00 2001 From: Sheetal Singala <2396sheetal@gmail.com> Date: Fri, 10 May 2019 23:18:37 +0530 Subject: [PATCH 42/42] dm: fix a couple brace coding style issues Signed-off-by: Sheetal Singala <2396sheetal@gmail.com> Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 56c34a0a9cd9..1fb1333fefec 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -781,7 +781,8 @@ static void close_table_device(struct table_device *td, struct mapped_device *md } static struct table_device *find_table_device(struct list_head *l, dev_t dev, - fmode_t mode) { + fmode_t mode) +{ struct table_device *td; list_for_each_entry(td, l, list) @@ -792,7 +793,8 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev, } int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, - struct dm_dev **result) { + struct dm_dev **result) +{ int r; struct table_device *td;
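/*
 * Editor's sketch (illustration only, not part of the patches above):
 * patches 40 and 41 converge on one logging pattern for dm-crypt --
 * identify the device instance in every message, keep per-instance
 * configuration details at debug level, and rate-limit anything that can
 * fire once per bio. The sketch below recombines only calls that already
 * appear in those patches; the wrapper function itself is hypothetical.
 */
static void crypt_log_example(struct crypt_config *cc, struct dm_target *ti,
			      struct convert_context *ctx, __le64 *sector, int r)
{
	struct mapped_device *md = dm_table_get_md(ti->table);

	/* one-time constructor detail: debug level, tagged with the dm device name */
	DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.",
		dm_device_name(md), cc->integrity_tag_size, cc->integrity_iv_size);

	/* per-bio integrity failure: rate-limited, tagged with the bio's bdev name */
	if (r == -EBADMSG) {
		char b[BDEVNAME_SIZE];

		DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu",
			    bio_devname(ctx->bio_in, b),
			    (unsigned long long)le64_to_cpu(*sector));
	}
}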