From 892c7a77f6c821d3ecaf8f5e45cc3ca372f653c3 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 15 Dec 2020 13:31:25 -0800 Subject: [PATCH 01/31] dm dust: remove h from printk format specifier See Documentation/core-api/printk-formats.rst. commit cbacb5ab0aa0 ("docs: printk-formats: Stop encouraging use of unnecessary %h[xudi] and %hh[xudi]") Standard integer promotion is already done and %hx and %hhx is useless so do not encourage the use of %hh[xudi] or %h[xudi]. Signed-off-by: Tom Rix Signed-off-by: Mike Snitzer --- drivers/md/dm-dust.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 072ea913cebc..cbe1058ee589 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -130,7 +130,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block, dd->badblock_count++; if (!dd->quiet_mode) { - DMINFO("%s: badblock added at block %llu with write fail count %hhu", + DMINFO("%s: badblock added at block %llu with write fail count %u", __func__, block, wr_fail_cnt); } spin_unlock_irqrestore(&dd->dust_lock, flags); From 74d1da3988f677daf2919dbb6bcd3bb13f094960 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 16 Dec 2020 14:15:42 +0100 Subject: [PATCH 02/31] dm crypt: Spelling s/cihper/cipher/ Fix a misspelling of "cipher". Signed-off-by: Geert Uytterhoeven Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 5a55617a08e6..a45d23d31b0a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -133,7 +133,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, DM_CRYPT_WRITE_INLINE }; enum cipher_flags { - CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ + CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */ CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */ }; From 23c4ecbc3e6af3e2c5d7ce2134a39b73b81947d0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 11 Jan 2021 09:25:55 +0000 Subject: [PATCH 03/31] dm integrity: fix spelling mistake "flusing" -> "flushing" There is a spelling mistake in a dm_integrity_io_error error message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index b64fede032dc..7e48639e4036 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1399,7 +1399,7 @@ static void flush_notify(unsigned long error, void *fr_) { struct flush_request *fr = fr_; if (unlikely(error != 0)) - dm_integrity_io_error(fr->ic, "flusing disk cache", -EIO); + dm_integrity_io_error(fr->ic, "flushing disk cache", -EIO); complete(&fr->comp); } From 62f263178c16df300f92098c1a6edca0be7d204d Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Tue, 12 Jan 2021 13:52:00 +0800 Subject: [PATCH 04/31] dm: cleanup of front padding calculation Add two helper macros calculating the offset of bio in struct dm_io and struct dm_target_io respectively. Besides, simplify the front padding calculation in dm_alloc_md_mempools(). Signed-off-by: Jeffle Xu Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 479ec5bea09e..65b21159a7a6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -105,12 +105,16 @@ struct dm_io { struct dm_target_io tio; }; +#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) +#define DM_IO_BIO_OFFSET \ + (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) + void *dm_per_bio_data(struct bio *bio, size_t data_size) { struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); if (!tio->inside_dm_io) - return (char *)bio - offsetof(struct dm_target_io, clone) - data_size; - return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size; + return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; + return (char *)bio - DM_IO_BIO_OFFSET - data_size; } EXPORT_SYMBOL_GPL(dm_per_bio_data); @@ -118,9 +122,9 @@ struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) { struct dm_io *io = (struct dm_io *)((char *)data + data_size); if (io->magic == DM_IO_MAGIC) - return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone)); + return (struct bio *)((char *)io + DM_IO_BIO_OFFSET); BUG_ON(io->magic != DM_TIO_MAGIC); - return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone)); + return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET); } EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); @@ -2849,8 +2853,8 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); - front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); - io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio); + front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; + io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); if (ret) goto out; From 4c9e9883c20a3ad5384e689bdbb1d0677da4094c Mon Sep 17 00:00:00 2001 From: Jinoh Kang Date: Sun, 17 Jan 2021 11:49:33 +0000 Subject: [PATCH 05/31] dm persistent data: fix return type of shadow_root() shadow_root() truncates 64-bit dm_block_t into 32-bit int. This is not an issue in practice, since dm metadata as of v5.11 can only hold at most 4161600 blocks (255 index entries * ~16k metadata blocks). Nevertheless, this can confuse users debugging some specific data corruption scenarios. Also, DM_SM_METADATA_MAX_BLOCKS may be bumped in the future, or persistent-data may find its use in other places. Therefore, switch the return type of shadow_root from int to dm_block_t. Signed-off-by: Jinoh Kang Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-btree-internal.h | 2 +- drivers/md/persistent-data/dm-btree-spine.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 564896659dd4..fe073d92f01e 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -100,7 +100,7 @@ struct dm_block *shadow_parent(struct shadow_spine *s); int shadow_has_parent(struct shadow_spine *s); -int shadow_root(struct shadow_spine *s); +dm_block_t shadow_root(struct shadow_spine *s); /* * Some inlines. diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index e03cb9e48773..8a2bfbfb218b 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -235,7 +235,7 @@ int shadow_has_parent(struct shadow_spine *s) return s->count >= 2; } -int shadow_root(struct shadow_spine *s) +dm_block_t shadow_root(struct shadow_spine *s) { return s->root; } From 09d85f8d8909ec8baa07479ba5777bbca24961f3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 21 Jan 2021 10:09:32 -0500 Subject: [PATCH 06/31] dm integrity: introduce the "fix_hmac" argument The "fix_hmac" argument improves security of internal_hash and journal_mac: - the section number is mixed to the mac, so that an attacker can't copy sectors from one journal section to another journal section - the superblock is protected by journal_mac - a 16-byte salt stored in the superblock is mixed to the mac, so that the attacker can't detect that two disks have the same hmac key and also to disallow the attacker to move sectors from one disk to another Signed-off-by: Mikulas Patocka Reported-by: Daniel Glockner Signed-off-by: Lukas Bulwahn # ReST fix Tested-by: Milan Broz Signed-off-by: Mike Snitzer --- .../device-mapper/dm-integrity.rst | 11 ++ drivers/md/dm-integrity.c | 138 ++++++++++++++++-- 2 files changed, 136 insertions(+), 13 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst index 2cc5488acbd9..ef762857da95 100644 --- a/Documentation/admin-guide/device-mapper/dm-integrity.rst +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -186,6 +186,17 @@ fix_padding space-efficient. If this option is not present, large padding is used - that is for compatibility with older kernels. +fix_hmac + Improve security of internal_hash and journal_mac: + + - the section number is mixed to the mac, so that an attacker can't + copy sectors from one journal section to another journal section + - the superblock is protected by journal_mac + - a 16-byte salt stored in the superblock is mixed to the mac, so + that the attacker can't detect that two disks have the same hmac + key and also to disallow the attacker to move sectors from one + disk to another + legacy_recalculate Allow recalculating of volumes with HMAC keys. This is disabled by default for security reasons - an attacker could modify the volume, diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 7e48639e4036..46b5d542b8fe 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -40,6 +40,7 @@ #define BITMAP_BLOCK_SIZE 4096 /* don't change it */ #define BITMAP_FLUSH_INTERVAL (10 * HZ) #define DISCARD_FILLER 0xf6 +#define SALT_SIZE 16 /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -57,6 +58,7 @@ #define SB_VERSION_2 2 #define SB_VERSION_3 3 #define SB_VERSION_4 4 +#define SB_VERSION_5 5 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -72,12 +74,15 @@ struct superblock { __u8 log2_blocks_per_bitmap_bit; __u8 pad[2]; __u64 recalc_sector; + __u8 pad2[8]; + __u8 salt[SALT_SIZE]; }; #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 #define SB_FLAG_DIRTY_BITMAP 0x4 #define SB_FLAG_FIXED_PADDING 0x8 +#define SB_FLAG_FIXED_HMAC 0x10 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -259,6 +264,7 @@ struct dm_integrity_c { bool recalculate_flag; bool discard; bool fix_padding; + bool fix_hmac; bool legacy_recalculate; struct alg_spec internal_hash_alg; @@ -389,8 +395,11 @@ static int dm_integrity_failed(struct dm_integrity_c *ic) static bool dm_integrity_disable_recalculate(struct dm_integrity_c *ic) { - if ((ic->internal_hash_alg.key || ic->journal_mac_alg.key) && - !ic->legacy_recalculate) + if (ic->legacy_recalculate) + return false; + if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) ? + ic->internal_hash_alg.key || ic->journal_mac_alg.key : + ic->internal_hash_alg.key && !ic->journal_mac_alg.key) return true; return false; } @@ -477,7 +486,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) + ic->sb->version = SB_VERSION_5; + else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) ic->sb->version = SB_VERSION_4; else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) ic->sb->version = SB_VERSION_3; @@ -487,10 +498,58 @@ static void sb_set_version(struct dm_integrity_c *ic) ic->sb->version = SB_VERSION_1; } +static int sb_mac(struct dm_integrity_c *ic, bool wr) +{ + SHASH_DESC_ON_STACK(desc, ic->journal_mac); + int r; + unsigned size = crypto_shash_digestsize(ic->journal_mac); + + if (sizeof(struct superblock) + size > 1 << SECTOR_SHIFT) { + dm_integrity_io_error(ic, "digest is too long", -EINVAL); + return -EINVAL; + } + + desc->tfm = ic->journal_mac; + + r = crypto_shash_init(desc); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + return r; + } + + r = crypto_shash_update(desc, (__u8 *)ic->sb, (1 << SECTOR_SHIFT) - size); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + return r; + } + + if (likely(wr)) { + r = crypto_shash_final(desc, (__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + return r; + } + } else { + __u8 result[HASH_MAX_DIGESTSIZE]; + r = crypto_shash_final(desc, result); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + return r; + } + if (memcmp((__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size, result, size)) { + dm_integrity_io_error(ic, "superblock mac", -EILSEQ); + return -EILSEQ; + } + } + + return 0; +} + static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) { struct dm_io_request io_req; struct dm_io_region io_loc; + int r; io_req.bi_op = op; io_req.bi_op_flags = op_flags; @@ -502,10 +561,28 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) io_loc.sector = ic->start; io_loc.count = SB_SECTORS; - if (op == REQ_OP_WRITE) + if (op == REQ_OP_WRITE) { sb_set_version(ic); + if (ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + r = sb_mac(ic, true); + if (unlikely(r)) + return r; + } + } - return dm_io(&io_req, 1, &io_loc, NULL); + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) + return r; + + if (op == REQ_OP_READ) { + if (ic->mode != 'R' && ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + r = sb_mac(ic, false); + if (unlikely(r)) + return r; + } + } + + return 0; } #define BITMAP_OP_TEST_ALL_SET 0 @@ -722,15 +799,32 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result desc->tfm = ic->journal_mac; r = crypto_shash_init(desc); - if (unlikely(r)) { + if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_init", r); goto err; } + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + uint64_t section_le; + + r = crypto_shash_update(desc, (__u8 *)&ic->sb->salt, SALT_SIZE); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + + section_le = cpu_to_le64(section); + r = crypto_shash_update(desc, (__u8 *)§ion_le, sizeof section_le); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + } + for (j = 0; j < ic->journal_section_entries; j++) { struct journal_entry *je = access_journal_entry(ic, section, j); r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector); - if (unlikely(r)) { + if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_update", r); goto err; } @@ -740,7 +834,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result if (likely(size <= JOURNAL_MAC_SIZE)) { r = crypto_shash_final(desc, result); - if (unlikely(r)) { + if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_final", r); goto err; } @@ -753,7 +847,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result goto err; } r = crypto_shash_final(desc, digest); - if (unlikely(r)) { + if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_final", r); goto err; } @@ -1556,6 +1650,14 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector goto failed; } + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + r = crypto_shash_update(req, (__u8 *)&ic->sb->salt, SALT_SIZE); + if (unlikely(r < 0)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto failed; + } + } + r = crypto_shash_update(req, (const __u8 *)§or_le, sizeof sector_le); if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_update", r); @@ -3149,6 +3251,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0; + arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0; arg_count += ic->legacy_recalculate; DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start, ic->tag_size, ic->mode, arg_count); @@ -3173,6 +3276,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, } if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) DMEMIT(" fix_padding"); + if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0) + DMEMIT(" fix_hmac"); if (ic->legacy_recalculate) DMEMIT(" legacy_recalculate"); @@ -3310,6 +3415,11 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec if (!journal_sections) journal_sections = 1; + if (ic->fix_hmac && (ic->internal_hash_alg.alg_string || ic->journal_mac_alg.alg_string)) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_HMAC); + get_random_bytes(ic->sb->salt, SALT_SIZE); + } + if (!ic->meta_dev) { if (ic->fix_padding) ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING); @@ -3804,7 +3914,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) unsigned extra_args; struct dm_arg_set as; static const struct dm_arg _args[] = { - {0, 16, "Invalid number of feature args"}, + {0, 17, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; bool should_write_sb; @@ -3942,7 +4052,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto bad; } else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) { - r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, + r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error, "Invalid journal_mac argument"); if (r) goto bad; @@ -3952,6 +4062,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ic->discard = true; } else if (!strcmp(opt_string, "fix_padding")) { ic->fix_padding = true; + } else if (!strcmp(opt_string, "fix_hmac")) { + ic->fix_hmac = true; } else if (!strcmp(opt_string, "legacy_recalculate")) { ic->legacy_recalculate = true; } else { @@ -4110,7 +4222,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_4) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_5) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -4442,7 +4554,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 6, 0}, + .version = {1, 7, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, From cb728484a7710c202f02b96aa0962ce9b07aa5c2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sat, 23 Jan 2021 09:19:56 -0500 Subject: [PATCH 07/31] dm writecache: fix performance degradation in ssd mode Fix a thinko in ssd_commit_superblock. region.count is in sectors, not bytes. This bug doesn't corrupt data, but it causes performance degradation. Signed-off-by: Mikulas Patocka Fixes: dc8a01ae1dbd ("dm writecache: optimize superblock write") Cc: stable@vger.kernel.org # v5.7+ Reported-by: J. Bruce Fields Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d5223a0e5cc5..1769653c3d6b 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -523,7 +523,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc) region.bdev = wc->ssd_dev->bdev; region.sector = 0; - region.count = PAGE_SIZE; + region.count = PAGE_SIZE >> SECTOR_SHIFT; if (unlikely(region.sector + region.count > wc->metadata_sectors)) region.count = wc->metadata_sectors - region.sector; From 21ec672ecf18dd80e58936b0687da9098913c810 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Tue, 26 Jan 2021 10:40:02 +0800 Subject: [PATCH 08/31] dm writecache: fix unnecessary NULL check warnings Remove NULL checks before vfree() to fix these warnings: ./drivers/md/dm-writecache.c:2008:2-7: WARNING: NULL check before some freeing functions is not needed. ./drivers/md/dm-writecache.c:2024:2-7: WARNING: NULL check before some freeing functions is not needed. Signed-off-by: Tian Tao Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 1769653c3d6b..fd531d5a39c2 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -2004,8 +2004,7 @@ static void writecache_dtr(struct dm_target *ti) if (wc->ssd_dev) dm_put_device(ti, wc->ssd_dev); - if (wc->entries) - vfree(wc->entries); + vfree(wc->entries); if (wc->memory_map) { if (WC_MODE_PMEM(wc)) @@ -2020,8 +2019,7 @@ static void writecache_dtr(struct dm_target *ti) if (wc->dm_io) dm_io_client_destroy(wc->dm_io); - if (wc->dirty_bitmap) - vfree(wc->dirty_bitmap); + vfree(wc->dirty_bitmap); kfree(wc); } From 831475cc0b40f41c886ceb7b25de2598719a5478 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Fri, 22 Jan 2021 09:43:20 +0100 Subject: [PATCH 09/31] dm crypt: replaced #if defined with IS_ENABLED IS_ENABLED(CONFIG_ENCRYPTED_KEYS) is true whether the option is built-in or a module, so use it instead of #if defined checking for each separately. The other #if was to avoid a static function defined, but unused warning. As we now always build the callsite when the function is defined, we can remove that first #if guard. Suggested-by: Arnd Bergmann Signed-off-by: Ahmad Fatoum Acked-by: Dmitry Baryshkov Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a45d23d31b0a..1b3b1a806377 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2436,7 +2436,6 @@ static int set_key_user(struct crypt_config *cc, struct key *key) return 0; } -#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) static int set_key_encrypted(struct crypt_config *cc, struct key *key) { const struct encrypted_key_payload *ekp; @@ -2452,7 +2451,6 @@ static int set_key_encrypted(struct crypt_config *cc, struct key *key) return 0; } -#endif /* CONFIG_ENCRYPTED_KEYS */ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) { @@ -2482,11 +2480,10 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string } else if (!strncmp(key_string, "user:", key_desc - key_string + 1)) { type = &key_type_user; set_key = set_key_user; -#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) - } else if (!strncmp(key_string, "encrypted:", key_desc - key_string + 1)) { + } else if (IS_ENABLED(CONFIG_ENCRYPTED_KEYS) && + !strncmp(key_string, "encrypted:", key_desc - key_string + 1)) { type = &key_type_encrypted; set_key = set_key_encrypted; -#endif } else { return -EINVAL; } From 363880c4eb36bd2a70104c165fbc7a6d49858a91 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Fri, 22 Jan 2021 09:43:21 +0100 Subject: [PATCH 10/31] dm crypt: support using trusted keys Commit 27f5411a718c ("dm crypt: support using encrypted keys") extended dm-crypt to allow use of "encrypted" keys along with "user" and "logon". Along the same lines, teach dm-crypt to support "trusted" keys as well. Signed-off-by: Ahmad Fatoum Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-crypt.rst | 2 +- drivers/md/Kconfig | 1 + drivers/md/dm-crypt.c | 23 ++++++++++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst index 1a6753b76dbb..aa2d04d95df6 100644 --- a/Documentation/admin-guide/device-mapper/dm-crypt.rst +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -67,7 +67,7 @@ Parameters:: the value passed in . - Either 'logon', 'user' or 'encrypted' kernel key type. + Either 'logon', 'user', 'encrypted' or 'trusted' kernel key type. The kernel keyring key description crypt target should look for diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 9e44c09f6410..f2014385d48b 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -270,6 +270,7 @@ config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) + depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1b3b1a806377..ee20b586f526 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -2452,6 +2453,22 @@ static int set_key_encrypted(struct crypt_config *cc, struct key *key) return 0; } +static int set_key_trusted(struct crypt_config *cc, struct key *key) +{ + const struct trusted_key_payload *tkp; + + tkp = key->payload.data[0]; + if (!tkp) + return -EKEYREVOKED; + + if (cc->key_size != tkp->key_len) + return -EINVAL; + + memcpy(cc->key, tkp->key, cc->key_size); + + return 0; +} + static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) { char *new_key_string, *key_desc; @@ -2484,6 +2501,10 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string !strncmp(key_string, "encrypted:", key_desc - key_string + 1)) { type = &key_type_encrypted; set_key = set_key_encrypted; + } else if (IS_ENABLED(CONFIG_TRUSTED_KEYS) && + !strncmp(key_string, "trusted:", key_desc - key_string + 1)) { + type = &key_type_trusted; + set_key = set_key_trusted; } else { return -EINVAL; } @@ -3555,7 +3576,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 22, 0}, + .version = {1, 23, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From 054bee16163df023e2589db09fd27d81f7ad9e72 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 4 Feb 2021 05:20:52 -0500 Subject: [PATCH 11/31] dm writecache: return the exact table values that were set LVM doesn't like it when the target returns different values from what was set in the constructor. Fix dm-writecache so that the returned table values are exactly the same as requested values. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org # v4.18+ Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 54 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index fd531d5a39c2..e8382d73c73c 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -159,14 +159,22 @@ struct dm_writecache { bool overwrote_committed:1; bool memory_vmapped:1; + bool start_sector_set:1; bool high_wm_percent_set:1; bool low_wm_percent_set:1; bool max_writeback_jobs_set:1; bool autocommit_blocks_set:1; bool autocommit_time_set:1; + bool max_age_set:1; bool writeback_fua_set:1; bool flush_on_suspend:1; bool cleaner:1; + bool cleaner_set:1; + + unsigned high_wm_percent_value; + unsigned low_wm_percent_value; + unsigned autocommit_time_value; + unsigned max_age_value; unsigned writeback_all; struct workqueue_struct *writeback_wq; @@ -2203,6 +2211,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) goto invalid_optional; wc->start_sector = start_sector; + wc->start_sector_set = true; if (wc->start_sector != start_sector || wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) goto invalid_optional; @@ -2212,6 +2221,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto invalid_optional; if (high_wm_percent < 0 || high_wm_percent > 100) goto invalid_optional; + wc->high_wm_percent_value = high_wm_percent; wc->high_wm_percent_set = true; } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { string = dm_shift_arg(&as), opt_params--; @@ -2219,6 +2229,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto invalid_optional; if (low_wm_percent < 0 || low_wm_percent > 100) goto invalid_optional; + wc->low_wm_percent_value = low_wm_percent; wc->low_wm_percent_set = true; } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { string = dm_shift_arg(&as), opt_params--; @@ -2238,6 +2249,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) if (autocommit_msecs > 3600000) goto invalid_optional; wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); + wc->autocommit_time_value = autocommit_msecs; wc->autocommit_time_set = true; } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { unsigned max_age_msecs; @@ -2247,7 +2259,10 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) if (max_age_msecs > 86400000) goto invalid_optional; wc->max_age = msecs_to_jiffies(max_age_msecs); + wc->max_age_set = true; + wc->max_age_value = max_age_msecs; } else if (!strcasecmp(string, "cleaner")) { + wc->cleaner_set = true; wc->cleaner = true; } else if (!strcasecmp(string, "fua")) { if (WC_MODE_PMEM(wc)) { @@ -2453,7 +2468,6 @@ static void writecache_status(struct dm_target *ti, status_type_t type, struct dm_writecache *wc = ti->private; unsigned extra_args; unsigned sz = 0; - uint64_t x; switch (type) { case STATUSTYPE_INFO: @@ -2465,11 +2479,11 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's', wc->dev->name, wc->ssd_dev->name, wc->block_size); extra_args = 0; - if (wc->start_sector) + if (wc->start_sector_set) extra_args += 2; - if (wc->high_wm_percent_set && !wc->cleaner) + if (wc->high_wm_percent_set) extra_args += 2; - if (wc->low_wm_percent_set && !wc->cleaner) + if (wc->low_wm_percent_set) extra_args += 2; if (wc->max_writeback_jobs_set) extra_args += 2; @@ -2477,37 +2491,29 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args += 2; if (wc->autocommit_time_set) extra_args += 2; - if (wc->max_age != MAX_AGE_UNSPECIFIED) + if (wc->max_age_set) extra_args += 2; - if (wc->cleaner) + if (wc->cleaner_set) extra_args++; if (wc->writeback_fua_set) extra_args++; DMEMIT("%u", extra_args); - if (wc->start_sector) + if (wc->start_sector_set) DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); - if (wc->high_wm_percent_set && !wc->cleaner) { - x = (uint64_t)wc->freelist_high_watermark * 100; - x += wc->n_blocks / 2; - do_div(x, (size_t)wc->n_blocks); - DMEMIT(" high_watermark %u", 100 - (unsigned)x); - } - if (wc->low_wm_percent_set && !wc->cleaner) { - x = (uint64_t)wc->freelist_low_watermark * 100; - x += wc->n_blocks / 2; - do_div(x, (size_t)wc->n_blocks); - DMEMIT(" low_watermark %u", 100 - (unsigned)x); - } + if (wc->high_wm_percent_set) + DMEMIT(" high_watermark %u", wc->high_wm_percent_value); + if (wc->low_wm_percent_set) + DMEMIT(" low_watermark %u", wc->low_wm_percent_value); if (wc->max_writeback_jobs_set) DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs); if (wc->autocommit_blocks_set) DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); if (wc->autocommit_time_set) - DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies)); - if (wc->max_age != MAX_AGE_UNSPECIFIED) - DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age)); - if (wc->cleaner) + DMEMIT(" autocommit_time %u", wc->autocommit_time_value); + if (wc->max_age_set) + DMEMIT(" max_age %u", wc->max_age_value); + if (wc->cleaner_set) DMEMIT(" cleaner"); if (wc->writeback_fua_set) DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); @@ -2517,7 +2523,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, static struct target_type writecache_target = { .name = "writecache", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = writecache_ctr, .dtr = writecache_dtr, From a4c8dd9c2d0987cf542a2a0c42684c9c6d78a04e Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Tue, 2 Feb 2021 11:35:28 +0800 Subject: [PATCH 12/31] dm table: fix iterate_devices based device capability checks According to the definition of dm_iterate_devices_fn: * This function must iterate through each section of device used by the * target until it encounters a non-zero return code, which it then returns. * Returns zero if no callout returned non-zero. For some target type (e.g. dm-stripe), one call of iterate_devices() may iterate multiple underlying devices internally, in which case a non-zero return code returned by iterate_devices_callout_fn will stop the iteration in advance. No iterate_devices_callout_fn should return non-zero unless device iteration should stop. Rename dm_table_requires_stable_pages() to dm_table_any_dev_attr() and elevate it for reuse to stop iterating (and return non-zero) on the first device that causes iterate_devices_callout_fn to return non-zero. Use dm_table_any_dev_attr() to properly iterate through devices. Rename device_is_nonrot() to device_is_rotational() and invert logic accordingly to fix improper disposition. Fixes: c3c4555edd10 ("dm table: clear add_random unless all devices have it set") Fixes: 4693c9668fdc ("dm table: propagate non rotational flag") Cc: stable@vger.kernel.org Signed-off-by: Jeffle Xu Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 97 +++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 46 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4acf2342f7ad..3e211e64f348 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1295,6 +1295,46 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } +/* + * type->iterate_devices() should be called when the sanity check needs to + * iterate and check all underlying data devices. iterate_devices() will + * iterate all underlying data devices until it encounters a non-zero return + * code, returned by whether the input iterate_devices_callout_fn, or + * iterate_devices() itself internally. + * + * For some target type (e.g. dm-stripe), one call of iterate_devices() may + * iterate multiple underlying devices internally, in which case a non-zero + * return code returned by iterate_devices_callout_fn will stop the iteration + * in advance. + * + * Cases requiring _any_ underlying device supporting some kind of attribute, + * should use the iteration structure like dm_table_any_dev_attr(), or call + * it directly. @func should handle semantics of positive examples, e.g. + * capable of something. + * + * Cases requiring _all_ underlying devices supporting some kind of attribute, + * should use the iteration structure like dm_table_supports_nowait() or + * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that + * uses an @anti_func that handle semantics of counter examples, e.g. not + * capable of something. So: return !dm_table_any_dev_attr(t, anti_func); + */ +static bool dm_table_any_dev_attr(struct dm_table *t, + iterate_devices_callout_fn func) +{ + struct dm_target *ti; + unsigned int i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, func, NULL)) + return true; + } + + return false; +} + static int count_device(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1595,12 +1635,12 @@ static int dm_table_supports_dax_write_cache(struct dm_table *t) return false; } -static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && blk_queue_nonrot(q); + return q && !blk_queue_nonrot(q); } static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, @@ -1611,23 +1651,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, return q && !blk_queue_add_random(q); } -static bool dm_table_all_devices_attribute(struct dm_table *t, - iterate_devices_callout_fn func) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, func, NULL)) - return false; - } - - return true; -} - static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1779,27 +1802,6 @@ static int device_requires_stable_pages(struct dm_target *ti, return q && blk_queue_stable_writes(q); } -/* - * If any underlying device requires stable pages, a table must require - * them as well. Only targets that support iterate_devices are considered: - * don't want error, zero, etc to require stable pages. - */ -static bool dm_table_requires_stable_pages(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_requires_stable_pages, NULL)) - return true; - } - - return false; -} - void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { @@ -1849,10 +1851,10 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, dax_write_cache(t->md->dax_dev, true); /* Ensure that all underlying devices are non-rotational. */ - if (dm_table_all_devices_attribute(t, device_is_nonrot)) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else + if (dm_table_any_dev_attr(t, device_is_rotational)) blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + else + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; @@ -1864,8 +1866,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, /* * Some devices don't use blk_integrity but still want stable pages * because they do their own checksumming. + * If any underlying device requires stable pages, a table must require + * them as well. Only targets that support iterate_devices are considered: + * don't want error, zero, etc to require stable pages. */ - if (dm_table_requires_stable_pages(t)) + if (dm_table_any_dev_attr(t, device_requires_stable_pages)) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); else blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); @@ -1876,7 +1881,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not * have it set. */ - if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) + if (blk_queue_add_random(q) && dm_table_any_dev_attr(t, device_is_not_random)) blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); /* From 5b0fab508992c2e120971da658ce80027acbc405 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 8 Feb 2021 22:34:36 -0500 Subject: [PATCH 13/31] dm table: fix DAX iterate_devices based device capability checks Fix dm_table_supports_dax() and invert logic of both iterate_devices_callout_fn so that all devices' DAX capabilities are properly checked. Fixes: 545ed20e6df6 ("dm: add infrastructure for DAX support") Cc: stable@vger.kernel.org Signed-off-by: Jeffle Xu Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 37 ++++++++++--------------------------- drivers/md/dm.c | 2 +- drivers/md/dm.h | 2 +- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3e211e64f348..71f7ec508cf7 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -820,24 +820,24 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) EXPORT_SYMBOL_GPL(dm_table_set_type); /* validate the dax capability of the target device span */ -int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, +int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { int blocksize = *(int *) data, id; bool rc; id = dax_read_lock(); - rc = dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len); + rc = !dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len); dax_read_unlock(id); return rc; } /* Check devices support synchronous DAX */ -static int device_dax_synchronous(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - return dev->dax_dev && dax_synchronous(dev->dax_dev); + return !dev->dax_dev || !dax_synchronous(dev->dax_dev); } bool dm_table_supports_dax(struct dm_table *t, @@ -854,7 +854,7 @@ bool dm_table_supports_dax(struct dm_table *t, return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, iterate_fn, blocksize)) + ti->type->iterate_devices(ti, iterate_fn, blocksize)) return false; } @@ -925,7 +925,7 @@ static int dm_table_determine_type(struct dm_table *t) verify_bio_based: /* We must use this table as bio-based */ t->type = DM_TYPE_BIO_BASED; - if (dm_table_supports_dax(t, device_supports_dax, &page_size) || + if (dm_table_supports_dax(t, device_not_dax_capable, &page_size) || (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { t->type = DM_TYPE_DAX_BIO_BASED; } @@ -1618,23 +1618,6 @@ static int device_dax_write_cache_enabled(struct dm_target *ti, return false; } -static int dm_table_supports_dax_write_cache(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, - device_dax_write_cache_enabled, NULL)) - return true; - } - - return false; -} - static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1839,15 +1822,15 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); - if (dm_table_supports_dax(t, device_supports_dax, &page_size)) { + if (dm_table_supports_dax(t, device_not_dax_capable, &page_size)) { blk_queue_flag_set(QUEUE_FLAG_DAX, q); - if (dm_table_supports_dax(t, device_dax_synchronous, NULL)) + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable, NULL)) set_dax_synchronous(t->md->dax_dev); } else blk_queue_flag_clear(QUEUE_FLAG_DAX, q); - if (dm_table_supports_dax_write_cache(t)) + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled)) dax_write_cache(t->md->dax_dev, true); /* Ensure that all underlying devices are non-rotational. */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 65b21159a7a6..64fda1429e39 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1133,7 +1133,7 @@ static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bd if (!map) goto out; - ret = dm_table_supports_dax(map, device_supports_dax, &blocksize); + ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize); out: dm_put_live_table(md, srcu_idx); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index fffe1e289c53..b441ad772c18 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -73,7 +73,7 @@ void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn, int *blocksize); -int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, +int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data); void dm_lock_md_type(struct mapped_device *md); From 24f6b6036c9eec21191646930ad42808e6180510 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 8 Feb 2021 22:46:38 -0500 Subject: [PATCH 14/31] dm table: fix zoned iterate_devices based device capability checks Fix dm_table_supports_zoned_model() and invert logic of both iterate_devices_callout_fn so that all devices' zoned capabilities are properly checked. Add one more parameter to dm_table_any_dev_attr(), which is actually used as the @data parameter of iterate_devices_callout_fn, so that dm_table_matches_zone_sectors() can be replaced by dm_table_any_dev_attr(). Fixes: dd88d313bef02 ("dm table: add zoned block devices validation") Cc: stable@vger.kernel.org Signed-off-by: Jeffle Xu Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 48 +++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 71f7ec508cf7..77086db8b920 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1316,10 +1316,10 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) * should use the iteration structure like dm_table_supports_nowait() or * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that * uses an @anti_func that handle semantics of counter examples, e.g. not - * capable of something. So: return !dm_table_any_dev_attr(t, anti_func); + * capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data); */ static bool dm_table_any_dev_attr(struct dm_table *t, - iterate_devices_callout_fn func) + iterate_devices_callout_fn func, void *data) { struct dm_target *ti; unsigned int i; @@ -1328,7 +1328,7 @@ static bool dm_table_any_dev_attr(struct dm_table *t, ti = dm_table_get_target(t, i); if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, func, NULL)) + ti->type->iterate_devices(ti, func, data)) return true; } @@ -1371,13 +1371,13 @@ bool dm_table_has_no_data_devices(struct dm_table *table) return true; } -static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); enum blk_zoned_model *zoned_model = data; - return q && blk_queue_zoned_model(q) == *zoned_model; + return !q || blk_queue_zoned_model(q) != *zoned_model; } static bool dm_table_supports_zoned_model(struct dm_table *t, @@ -1394,37 +1394,20 @@ static bool dm_table_supports_zoned_model(struct dm_table *t, return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_zoned_model, &zoned_model)) + ti->type->iterate_devices(ti, device_not_zoned_model, &zoned_model)) return false; } return true; } -static int device_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); unsigned int *zone_sectors = data; - return q && blk_queue_zone_sectors(q) == *zone_sectors; -} - -static bool dm_table_matches_zone_sectors(struct dm_table *t, - unsigned int zone_sectors) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_matches_zone_sectors, &zone_sectors)) - return false; - } - - return true; + return !q || blk_queue_zone_sectors(q) != *zone_sectors; } static int validate_hardware_zoned_model(struct dm_table *table, @@ -1444,7 +1427,7 @@ static int validate_hardware_zoned_model(struct dm_table *table, if (!zone_sectors || !is_power_of_2(zone_sectors)) return -EINVAL; - if (!dm_table_matches_zone_sectors(table, zone_sectors)) { + if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) { DMERR("%s: zone sectors is not consistent across all devices", dm_device_name(table->md)); return -EINVAL; @@ -1830,11 +1813,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_clear(QUEUE_FLAG_DAX, q); - if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled)) + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) dax_write_cache(t->md->dax_dev, true); /* Ensure that all underlying devices are non-rotational. */ - if (dm_table_any_dev_attr(t, device_is_rotational)) + if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); else blk_queue_flag_set(QUEUE_FLAG_NONROT, q); @@ -1853,7 +1836,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * them as well. Only targets that support iterate_devices are considered: * don't want error, zero, etc to require stable pages. */ - if (dm_table_any_dev_attr(t, device_requires_stable_pages)) + if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); else blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); @@ -1864,7 +1847,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not * have it set. */ - if (blk_queue_add_random(q) && dm_table_any_dev_attr(t, device_is_not_random)) + if (blk_queue_add_random(q) && + dm_table_any_dev_attr(t, device_is_not_random, NULL)) blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); /* From cccb493cdae76a51e1258c0738e2b43869655d39 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 8 Feb 2021 11:57:05 -0500 Subject: [PATCH 15/31] dm table: remove needless request_queue NULL pointer checks Since commit ff9ea323816d ("block, bdi: an active gendisk always has a request_queue associated with it") the request_queue pointer returned from bdev_get_queue() shall never be NULL. Signed-off-by: Jeffle Xu Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 77086db8b920..2bc256d61550 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1377,7 +1377,7 @@ static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, struct request_queue *q = bdev_get_queue(dev->bdev); enum blk_zoned_model *zoned_model = data; - return !q || blk_queue_zoned_model(q) != *zoned_model; + return blk_queue_zoned_model(q) != *zoned_model; } static bool dm_table_supports_zoned_model(struct dm_table *t, @@ -1407,7 +1407,7 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev * struct request_queue *q = bdev_get_queue(dev->bdev); unsigned int *zone_sectors = data; - return !q || blk_queue_zone_sectors(q) != *zone_sectors; + return blk_queue_zone_sectors(q) != *zone_sectors; } static int validate_hardware_zoned_model(struct dm_table *table, @@ -1556,7 +1556,7 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, unsigned long flush = (unsigned long) data; struct request_queue *q = bdev_get_queue(dev->bdev); - return q && (q->queue_flags & flush); + return (q->queue_flags & flush); } static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) @@ -1606,7 +1606,7 @@ static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !blk_queue_nonrot(q); + return !blk_queue_nonrot(q); } static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, @@ -1614,7 +1614,7 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !blk_queue_add_random(q); + return !blk_queue_add_random(q); } static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, @@ -1622,7 +1622,7 @@ static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *de { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !q->limits.max_write_same_sectors; + return !q->limits.max_write_same_sectors; } static bool dm_table_supports_write_same(struct dm_table *t) @@ -1649,7 +1649,7 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev * { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !q->limits.max_write_zeroes_sectors; + return !q->limits.max_write_zeroes_sectors; } static bool dm_table_supports_write_zeroes(struct dm_table *t) @@ -1676,7 +1676,7 @@ static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !blk_queue_nowait(q); + return !blk_queue_nowait(q); } static bool dm_table_supports_nowait(struct dm_table *t) @@ -1703,7 +1703,7 @@ static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !blk_queue_discard(q); + return !blk_queue_discard(q); } static bool dm_table_supports_discards(struct dm_table *t) @@ -1737,7 +1737,7 @@ static int device_not_secure_erase_capable(struct dm_target *ti, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !blk_queue_secure_erase(q); + return !blk_queue_secure_erase(q); } static bool dm_table_supports_secure_erase(struct dm_table *t) @@ -1765,7 +1765,7 @@ static int device_requires_stable_pages(struct dm_target *ti, { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && blk_queue_stable_writes(q); + return blk_queue_stable_writes(q); } void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, From 4134455f2aafdfeab50cabb4cccb35e916034b93 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 9 Feb 2021 10:56:20 -0500 Subject: [PATCH 16/31] dm writecache: fix writing beyond end of underlying device when shrinking Do not attempt to write any data beyond the end of the underlying data device while shrinking it. The DM writecache device must be suspended when the underlying data device is shrunk. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index e8382d73c73c..baf5a9ce2b25 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -148,6 +148,7 @@ struct dm_writecache { size_t metadata_sectors; size_t n_blocks; uint64_t seq_count; + sector_t data_device_sectors; void *block_start; struct wc_entry *entries; unsigned block_size; @@ -977,6 +978,8 @@ static void writecache_resume(struct dm_target *ti) wc_lock(wc); + wc->data_device_sectors = i_size_read(wc->dev->bdev->bd_inode) >> SECTOR_SHIFT; + if (WC_MODE_PMEM(wc)) { persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); } else { @@ -1646,6 +1649,10 @@ static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t void *address = memory_data(wc, e); persistent_memory_flush_cache(address, block_size); + + if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors)) + return true; + return bio_add_page(&wb->bio, persistent_memory_page(address), block_size, persistent_memory_page_offset(address)) != 0; } @@ -1717,6 +1724,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba if (writecache_has_error(wc)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); + } else if (unlikely(!bio_sectors(bio))) { + bio->bi_status = BLK_STS_OK; + bio_endio(bio); } else { submit_bio(bio); } @@ -1760,6 +1770,14 @@ static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writebac e = f; } + if (unlikely(to.sector + to.count > wc->data_device_sectors)) { + if (to.sector >= wc->data_device_sectors) { + writecache_copy_endio(0, 0, c); + continue; + } + from.count = to.count = wc->data_device_sectors - to.sector; + } + dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); __writeback_throttle(wc, wbl); From d9928ac5eba5b129299e9d032b79d436336339f6 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 9 Feb 2021 16:53:05 -0500 Subject: [PATCH 17/31] dm writecache: use bdev_nr_sectors() instead of open-coded equivalent Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index baf5a9ce2b25..844c4be11768 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -978,7 +978,7 @@ static void writecache_resume(struct dm_target *ti) wc_lock(wc); - wc->data_device_sectors = i_size_read(wc->dev->bdev->bd_inode) >> SECTOR_SHIFT; + wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev); if (WC_MODE_PMEM(wc)) { persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); From de89afc1e40fdfa5f8b666e5d07c43d21a1d3be0 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Fri, 22 Jan 2021 17:19:30 +0200 Subject: [PATCH 18/31] dm era: Recover committed writeset after crash Following a system crash, dm-era fails to recover the committed writeset for the current era, leading to lost writes. That is, we lose the information about what blocks were written during the affected era. dm-era assumes that the writeset of the current era is archived when the device is suspended. So, when resuming the device, it just moves on to the next era, ignoring the committed writeset. This assumption holds when the device is properly shut down. But, when the system crashes, the code that suspends the target never runs, so the writeset for the current era is not archived. There are three issues that cause the committed writeset to get lost: 1. dm-era doesn't load the committed writeset when opening the metadata 2. The code that resizes the metadata wipes the information about the committed writeset (assuming it was loaded at step 1) 3. era_preresume() starts a new era, without taking into account that the current era might not have been archived, due to a system crash. To fix this: 1. Load the committed writeset when opening the metadata 2. Fix the code that resizes the metadata to make sure it doesn't wipe the loaded writeset 3. Fix era_preresume() to check for a loaded writeset and archive it, before starting a new era. Fixes: eec40579d84873 ("dm: add era target") Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-era-target.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index b24e3839bb3a..854b1be8b452 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -71,8 +71,6 @@ static size_t bitset_size(unsigned nr_bits) */ static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) { - ws->md.nr_bits = nr_blocks; - ws->md.root = INVALID_WRITESET_ROOT; ws->bits = vzalloc(bitset_size(nr_blocks)); if (!ws->bits) { DMERR("%s: couldn't allocate in memory bitset", __func__); @@ -85,12 +83,14 @@ static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) /* * Wipes the in-core bitset, and creates a new on disk bitset. */ -static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws) +static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws, + dm_block_t nr_blocks) { int r; - memset(ws->bits, 0, bitset_size(ws->md.nr_bits)); + memset(ws->bits, 0, bitset_size(nr_blocks)); + ws->md.nr_bits = nr_blocks; r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root); if (r) { DMERR("%s: setup_on_disk_bitset failed", __func__); @@ -579,6 +579,7 @@ static int open_metadata(struct era_metadata *md) md->nr_blocks = le32_to_cpu(disk->nr_blocks); md->current_era = le32_to_cpu(disk->current_era); + ws_unpack(&disk->current_writeset, &md->current_writeset->md); md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root); md->era_array_root = le64_to_cpu(disk->era_array_root); md->metadata_snap = le64_to_cpu(disk->metadata_snap); @@ -870,7 +871,6 @@ static int metadata_era_archive(struct era_metadata *md) } ws_pack(&md->current_writeset->md, &value); - md->current_writeset->md.root = INVALID_WRITESET_ROOT; keys[0] = md->current_era; __dm_bless_for_disk(&value); @@ -882,6 +882,7 @@ static int metadata_era_archive(struct era_metadata *md) return r; } + md->current_writeset->md.root = INVALID_WRITESET_ROOT; md->archived_writesets = true; return 0; @@ -898,7 +899,7 @@ static int metadata_new_era(struct era_metadata *md) int r; struct writeset *new_writeset = next_writeset(md); - r = writeset_init(&md->bitset_info, new_writeset); + r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks); if (r) { DMERR("%s: writeset_init failed", __func__); return r; @@ -951,7 +952,7 @@ static int metadata_commit(struct era_metadata *md) int r; struct dm_block *sblock; - if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) { + if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) { r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, &md->current_writeset->md.root); if (r) { @@ -1565,7 +1566,7 @@ static int era_preresume(struct dm_target *ti) start_worker(era); - r = in_worker0(era, metadata_new_era); + r = in_worker0(era, metadata_era_rollover); if (r) { DMERR("%s: metadata_era_rollover failed", __func__); return r; From 2099b145d77c1d53f5711f029c37cc537897cee6 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Fri, 22 Jan 2021 17:19:31 +0200 Subject: [PATCH 19/31] dm era: Update in-core bitset after committing the metadata In case of a system crash, dm-era might fail to mark blocks as written in its metadata, although the corresponding writes to these blocks were passed down to the origin device and completed successfully. Consider the following sequence of events: 1. We write to a block that has not been yet written in the current era 2. era_map() checks the in-core bitmap for the current era and sees that the block is not marked as written. 3. The write is deferred for submission after the metadata have been updated and committed. 4. The worker thread processes the deferred write (process_deferred_bios()) and marks the block as written in the in-core bitmap, **before** committing the metadata. 5. The worker thread starts committing the metadata. 6. We do more writes that map to the same block as the write of step (1) 7. era_map() checks the in-core bitmap and sees that the block is marked as written, **although the metadata have not been committed yet**. 8. These writes are passed down to the origin device immediately and the device reports them as completed. 9. The system crashes, e.g., power failure, before the commit from step (5) finishes. When the system recovers and we query the dm-era target for the list of written blocks it doesn't report the aforementioned block as written, although the writes of step (6) completed successfully. The issue is that era_map() decides whether to defer or not a write based on non committed information. The root cause of the bug is that we update the in-core bitmap, **before** committing the metadata. Fix this by updating the in-core bitmap **after** successfully committing the metadata. Fixes: eec40579d84873 ("dm: add era target") Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-era-target.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 854b1be8b452..62f679faf9e7 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -134,7 +134,7 @@ static int writeset_test_and_set(struct dm_disk_bitset *info, { int r; - if (!test_and_set_bit(block, ws->bits)) { + if (!test_bit(block, ws->bits)) { r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root); if (r) { /* FIXME: fail mode */ @@ -1226,8 +1226,10 @@ static void process_deferred_bios(struct era *era) int r; struct bio_list deferred_bios, marked_bios; struct bio *bio; + struct blk_plug plug; bool commit_needed = false; bool failed = false; + struct writeset *ws = era->md->current_writeset; bio_list_init(&deferred_bios); bio_list_init(&marked_bios); @@ -1237,9 +1239,11 @@ static void process_deferred_bios(struct era *era) bio_list_init(&era->deferred_bios); spin_unlock(&era->deferred_lock); + if (bio_list_empty(&deferred_bios)) + return; + while ((bio = bio_list_pop(&deferred_bios))) { - r = writeset_test_and_set(&era->md->bitset_info, - era->md->current_writeset, + r = writeset_test_and_set(&era->md->bitset_info, ws, get_block(era, bio)); if (r < 0) { /* @@ -1247,7 +1251,6 @@ static void process_deferred_bios(struct era *era) * FIXME: finish. */ failed = true; - } else if (r == 0) commit_needed = true; @@ -1263,9 +1266,19 @@ static void process_deferred_bios(struct era *era) if (failed) while ((bio = bio_list_pop(&marked_bios))) bio_io_error(bio); - else - while ((bio = bio_list_pop(&marked_bios))) + else { + blk_start_plug(&plug); + while ((bio = bio_list_pop(&marked_bios))) { + /* + * Only update the in-core writeset if the on-disk one + * was updated too. + */ + if (commit_needed) + set_bit(get_block(era, bio), ws->bits); submit_bio_noacct(bio); + } + blk_finish_plug(&plug); + } } static void process_rpc_calls(struct era *era) From 2524933307fd0036d5c32357c693c021ab09a0b0 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Fri, 22 Jan 2021 17:22:04 +0200 Subject: [PATCH 20/31] dm era: Reinitialize bitset cache before digesting a new writeset In case of devices with at most 64 blocks, the digestion of consecutive eras uses the writeset of the first era as the writeset of all eras to digest, leading to lost writes. That is, we lose the information about what blocks were written during the affected eras. The digestion code uses a dm_disk_bitset object to access the archived writesets. This structure includes a one word (64-bit) cache to reduce the number of array lookups. This structure is initialized only once, in metadata_digest_start(), when we kick off digestion. But, when we insert a new writeset into the writeset tree, before the digestion of the previous writeset is done, or equivalently when there are multiple writesets in the writeset tree to digest, then all these writesets are digested using the same cache and the cache is not re-initialized when moving from one writeset to the next. For devices with more than 64 blocks, i.e., the size of the cache, the cache is indirectly invalidated when we move to a next set of blocks, so we avoid the bug. But for devices with at most 64 blocks we end up using the same cached data for digesting all archived writesets, i.e., the cache is loaded when digesting the first writeset and it never gets reloaded, until the digestion is done. As a result, the writeset of the first era to digest is used as the writeset of all the following archived eras, leading to lost writes. Fix this by reinitializing the dm_disk_bitset structure, and thus invalidating the cache, every time the digestion code starts digesting a new writeset. Fixes: eec40579d84873 ("dm: add era target") Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-era-target.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 62f679faf9e7..33c02b1f5678 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -747,6 +747,12 @@ static int metadata_digest_lookup_writeset(struct era_metadata *md, ws_unpack(&disk, &d->writeset); d->value = cpu_to_le32(key); + /* + * We initialise another bitset info to avoid any caching side effects + * with the previous one. + */ + dm_disk_bitset_init(md->tm, &d->info); + d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks); d->current_bit = 0; d->step = metadata_digest_transcribe_writeset; @@ -760,12 +766,6 @@ static int metadata_digest_start(struct era_metadata *md, struct digest *d) return 0; memset(d, 0, sizeof(*d)); - - /* - * We initialise another bitset info to avoid any caching side - * effects with the previous one. - */ - dm_disk_bitset_init(md->tm, &d->info); d->step = metadata_digest_lookup_writeset; return 0; From c8e846ff93d5eaa5384f6f325a1687ac5921aade Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Fri, 22 Jan 2021 17:25:53 +0200 Subject: [PATCH 21/31] dm era: Verify the data block size hasn't changed dm-era doesn't support changing the data block size of existing devices, so check explicitly that the requested block size for a new target matches the one stored in the metadata. Fixes: eec40579d84873 ("dm: add era target") Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Nikos Tsironis Reviewed-by: Ming-Hung Tsai Signed-off-by: Mike Snitzer --- drivers/md/dm-era-target.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 33c02b1f5678..da3d8ace2398 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -564,6 +564,15 @@ static int open_metadata(struct era_metadata *md) } disk = dm_block_data(sblock); + + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk->data_block_size) != md->block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk->data_block_size), md->block_size); + r = -EINVAL; + goto bad; + } + r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION, disk->metadata_space_map_root, sizeof(disk->metadata_space_map_root), @@ -575,7 +584,6 @@ static int open_metadata(struct era_metadata *md) setup_infos(md); - md->block_size = le32_to_cpu(disk->data_block_size); md->nr_blocks = le32_to_cpu(disk->nr_blocks); md->current_era = le32_to_cpu(disk->current_era); From 904e6b266619c2da5c58b5dce14ae30629e39645 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Fri, 22 Jan 2021 17:25:54 +0200 Subject: [PATCH 22/31] dm era: Fix bitset memory leaks Deallocate the memory allocated for the in-core bitsets when destroying the target and in error paths. Fixes: eec40579d84873 ("dm: add era target") Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Nikos Tsironis Reviewed-by: Ming-Hung Tsai Signed-off-by: Mike Snitzer --- drivers/md/dm-era-target.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index da3d8ace2398..1abd280ff8dc 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -47,6 +47,7 @@ struct writeset { static void writeset_free(struct writeset *ws) { vfree(ws->bits); + ws->bits = NULL; } static int setup_on_disk_bitset(struct dm_disk_bitset *info, @@ -811,6 +812,8 @@ static struct era_metadata *metadata_open(struct block_device *bdev, static void metadata_close(struct era_metadata *md) { + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); destroy_persistent_data_objects(md); kfree(md); } @@ -848,6 +851,7 @@ static int metadata_resize(struct era_metadata *md, void *arg) r = writeset_alloc(&md->writesets[1], *new_size); if (r) { DMERR("%s: writeset_alloc failed for writeset 1", __func__); + writeset_free(&md->writesets[0]); return r; } @@ -858,6 +862,8 @@ static int metadata_resize(struct era_metadata *md, void *arg) &value, &md->era_array_root); if (r) { DMERR("%s: dm_array_resize failed", __func__); + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); return r; } From 64f2d15afe7b336aafebdcd14cc835ecf856df4b Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Fri, 22 Jan 2021 17:25:55 +0200 Subject: [PATCH 23/31] dm era: Use correct value size in equality function of writeset tree Fix the writeset tree equality test function to use the right value size when comparing two btree values. Fixes: eec40579d84873 ("dm: add era target") Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Nikos Tsironis Reviewed-by: Ming-Hung Tsai Signed-off-by: Mike Snitzer --- drivers/md/dm-era-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 1abd280ff8dc..d0e75fd31c1e 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -389,7 +389,7 @@ static void ws_dec(void *context, const void *value) static int ws_eq(void *context, const void *value1, const void *value2) { - return !memcmp(value1, value2, sizeof(struct writeset_metadata)); + return !memcmp(value1, value2, sizeof(struct writeset_disk)); } /*----------------------------------------------------------------*/ From cca2c6aebe86f68103a8615074b3578e854b5016 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Thu, 11 Feb 2021 16:22:43 +0200 Subject: [PATCH 24/31] dm era: only resize metadata in preresume Metadata resize shouldn't happen in the ctr. The ctr loads a temporary (inactive) table that will only become active upon resume. That is why resize should always be done in terms of resume. Otherwise a load (ctr) whose inactive table never becomes active will incorrectly resize the metadata. Also, perform the resize directly in preresume, instead of using the worker to do it. The worker might run other metadata operations, e.g., it could start digestion, before resizing the metadata. These operations will end up using the old size. This could lead to errors, like: device-mapper: era: metadata_digest_transcribe_writeset: dm_array_set_value failed device-mapper: era: process_old_eras: digest step failed, stopping digestion The reason of the above error is that the worker started the digestion of the archived writeset using the old, larger size. As a result, metadata_digest_transcribe_writeset tried to write beyond the end of the era array. Fixes: eec40579d84873 ("dm: add era target") Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-era-target.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index d0e75fd31c1e..d9ac7372108c 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1501,15 +1501,6 @@ static int era_ctr(struct dm_target *ti, unsigned argc, char **argv) } era->md = md; - era->nr_blocks = calc_nr_blocks(era); - - r = metadata_resize(era->md, &era->nr_blocks); - if (r) { - ti->error = "couldn't resize metadata"; - era_destroy(era); - return -ENOMEM; - } - era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); if (!era->wq) { ti->error = "could not create workqueue for metadata object"; @@ -1584,9 +1575,17 @@ static int era_preresume(struct dm_target *ti) dm_block_t new_size = calc_nr_blocks(era); if (era->nr_blocks != new_size) { - r = in_worker1(era, metadata_resize, &new_size); - if (r) + r = metadata_resize(era->md, &new_size); + if (r) { + DMERR("%s: metadata_resize failed", __func__); return r; + } + + r = metadata_commit(era->md); + if (r) { + DMERR("%s: metadata_commit failed", __func__); + return r; + } era->nr_blocks = new_size; } From 7bdcc48f4e80b01fd6057dfd382236a5b8123b61 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Mon, 1 Feb 2021 05:10:15 +0000 Subject: [PATCH 25/31] block/keyslot-manager: Introduce passthrough keyslot manager The device mapper may map over devices that have inline encryption capabilities, and to make use of those capabilities, the DM device must itself advertise those inline encryption capabilities. One way to do this would be to have the DM device set up a keyslot manager with a "sufficiently large" number of keyslots, but that would use a lot of memory. Also, the DM device itself has no "keyslots", and it doesn't make much sense to talk about "programming a key into a DM device's keyslot manager", so all that extra memory used to represent those keyslots is just wasted. All a DM device really needs to be able to do is advertise the crypto capabilities of the underlying devices in a coherent manner and expose a way to evict keys from the underlying devices. There are also devices with inline encryption hardware that do not have a limited number of keyslots. One can send a raw encryption key along with a bio to these devices (as opposed to typical inline encryption hardware that require users to first program a raw encryption key into a keyslot, and send the index of that keyslot along with the bio). These devices also only need the same things from the keyslot manager that DM devices need - a way to advertise crypto capabilities and potentially a way to expose a function to evict keys from hardware. So we introduce a "passthrough" keyslot manager that provides a way to represent a keyslot manager that doesn't have just a limited number of keyslots, and for which do not require keys to be programmed into keyslots. DM devices can set up a passthrough keyslot manager in their request queues, and advertise appropriate crypto capabilities based on those of the underlying devices. Blk-crypto does not attempt to program keys into any keyslots in the passthrough keyslot manager. Instead, if/when the bio is resubmitted to the underlying device, blk-crypto will try to program the key into the underlying device's keyslot manager. Signed-off-by: Satya Tangirala Reviewed-by: Eric Biggers Acked-by: Jens Axboe Signed-off-by: Mike Snitzer --- block/keyslot-manager.c | 39 +++++++++++++++++++++++++++++++++ include/linux/keyslot-manager.h | 2 ++ 2 files changed, 41 insertions(+) diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c index 86f8195d8039..ac7ce83a76e8 100644 --- a/block/keyslot-manager.c +++ b/block/keyslot-manager.c @@ -62,6 +62,11 @@ static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) pm_runtime_put_sync(ksm->dev); } +static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm) +{ + return ksm->num_slots == 0; +} + /** * blk_ksm_init() - Initialize a keyslot manager * @ksm: The keyslot_manager to initialize. @@ -205,6 +210,10 @@ blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, int err; *slot_ptr = NULL; + + if (blk_ksm_is_passthrough(ksm)) + return BLK_STS_OK; + down_read(&ksm->lock); slot = blk_ksm_find_and_grab_keyslot(ksm, key); up_read(&ksm->lock); @@ -325,6 +334,16 @@ int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, struct blk_ksm_keyslot *slot; int err = 0; + if (blk_ksm_is_passthrough(ksm)) { + if (ksm->ksm_ll_ops.keyslot_evict) { + blk_ksm_hw_enter(ksm); + err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1); + blk_ksm_hw_exit(ksm); + return err; + } + return 0; + } + blk_ksm_hw_enter(ksm); slot = blk_ksm_find_keyslot(ksm, key); if (!slot) @@ -360,6 +379,9 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) { unsigned int slot; + if (blk_ksm_is_passthrough(ksm)) + return; + /* This is for device initialization, so don't resume the device */ down_write(&ksm->lock); for (slot = 0; slot < ksm->num_slots; slot++) { @@ -401,3 +423,20 @@ void blk_ksm_unregister(struct request_queue *q) { q->ksm = NULL; } + +/** + * blk_ksm_init_passthrough() - Init a passthrough keyslot manager + * @ksm: The keyslot manager to init + * + * Initialize a passthrough keyslot manager. + * Called by e.g. storage drivers to set up a keyslot manager in their + * request_queue, when the storage driver wants to manage its keys by itself. + * This is useful for inline encryption hardware that doesn't have the concept + * of keyslots, and for layered devices. + */ +void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm) +{ + memset(ksm, 0, sizeof(*ksm)); + init_rwsem(&ksm->lock); +} +EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough); diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h index 18f3f5346843..323e15dd6fa7 100644 --- a/include/linux/keyslot-manager.h +++ b/include/linux/keyslot-manager.h @@ -103,4 +103,6 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); void blk_ksm_destroy(struct blk_keyslot_manager *ksm); +void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm); + #endif /* __LINUX_KEYSLOT_MANAGER_H */ From d3b17a243790a34bd63fcef3fde63e29e2744938 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Mon, 1 Feb 2021 05:10:16 +0000 Subject: [PATCH 26/31] block/keyslot-manager: Introduce functions for device mapper support Introduce blk_ksm_update_capabilities() to update the capabilities of a keyslot manager (ksm) in-place. The pointer to a ksm in a device's request queue may not be easily replaced, because upper layers like the filesystem might access it (e.g. for programming keys/checking capabilities) at the same time the device wants to replace that request queue's ksm (and free the old ksm's memory). This function allows the device to update the capabilities of the ksm in its request queue directly. Devices can safely update the ksm this way without any synchronization with upper layers *only* if the updated (new) ksm continues to support all the crypto capabilities that the old ksm did (see description below for blk_ksm_is_superset() for why this is so). Also introduce blk_ksm_is_superset() which checks whether one ksm's capabilities are a (not necessarily strict) superset of another ksm's. The blk-crypto framework requires that crypto capabilities that were advertised when a bio was created continue to be supported by the device until that bio is ended - in practice this probably means that a device's advertised crypto capabilities can *never* "shrink" (since there's no synchronization between bio creation and when a device may want to change its advertised capabilities) - so a previously advertised crypto capability must always continue to be supported. This function can be used to check that a new ksm is a valid replacement for an old ksm. Signed-off-by: Satya Tangirala Reviewed-by: Eric Biggers Acked-by: Jens Axboe Signed-off-by: Mike Snitzer --- block/keyslot-manager.c | 107 ++++++++++++++++++++++++++++++++ include/linux/keyslot-manager.h | 9 +++ 2 files changed, 116 insertions(+) diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c index ac7ce83a76e8..9f9494b80148 100644 --- a/block/keyslot-manager.c +++ b/block/keyslot-manager.c @@ -424,6 +424,113 @@ void blk_ksm_unregister(struct request_queue *q) q->ksm = NULL; } +/** + * blk_ksm_intersect_modes() - restrict supported modes by child device + * @parent: The keyslot manager for parent device + * @child: The keyslot manager for child device, or NULL + * + * Clear any crypto mode support bits in @parent that aren't set in @child. + * If @child is NULL, then all parent bits are cleared. + * + * Only use this when setting up the keyslot manager for a layered device, + * before it's been exposed yet. + */ +void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, + const struct blk_keyslot_manager *child) +{ + if (child) { + unsigned int i; + + parent->max_dun_bytes_supported = + min(parent->max_dun_bytes_supported, + child->max_dun_bytes_supported); + for (i = 0; i < ARRAY_SIZE(child->crypto_modes_supported); + i++) { + parent->crypto_modes_supported[i] &= + child->crypto_modes_supported[i]; + } + } else { + parent->max_dun_bytes_supported = 0; + memset(parent->crypto_modes_supported, 0, + sizeof(parent->crypto_modes_supported)); + } +} +EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes); + +/** + * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes + * and DUN bytes that another KSM supports. Here, + * "superset" refers to the mathematical meaning of the + * word - i.e. if two KSMs have the *same* capabilities, + * they *are* considered supersets of each other. + * @ksm_superset: The KSM that we want to verify is a superset + * @ksm_subset: The KSM that we want to verify is a subset + * + * Return: True if @ksm_superset supports a superset of the crypto modes and DUN + * bytes that @ksm_subset supports. + */ +bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, + struct blk_keyslot_manager *ksm_subset) +{ + int i; + + if (!ksm_subset) + return true; + + if (!ksm_superset) + return false; + + for (i = 0; i < ARRAY_SIZE(ksm_superset->crypto_modes_supported); i++) { + if (ksm_subset->crypto_modes_supported[i] & + (~ksm_superset->crypto_modes_supported[i])) { + return false; + } + } + + if (ksm_subset->max_dun_bytes_supported > + ksm_superset->max_dun_bytes_supported) { + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(blk_ksm_is_superset); + +/** + * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of + * another KSM + * @target_ksm: The KSM whose restrictions to update. + * @reference_ksm: The KSM to whose restrictions this function will update + * @target_ksm's restrictions to. + * + * Blk-crypto requires that crypto capabilities that were + * advertised when a bio was created continue to be supported by the + * device until that bio is ended. This is turn means that a device cannot + * shrink its advertised crypto capabilities without any explicit + * synchronization with upper layers. So if there's no such explicit + * synchronization, @reference_ksm must support all the crypto capabilities that + * @target_ksm does + * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true). + * + * Note also that as long as the crypto capabilities are being expanded, the + * order of updates becoming visible is not important because it's alright + * for blk-crypto to see stale values - they only cause blk-crypto to + * believe that a crypto capability isn't supported when it actually is (which + * might result in blk-crypto-fallback being used if available, or the bio being + * failed). + */ +void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, + struct blk_keyslot_manager *reference_ksm) +{ + memcpy(target_ksm->crypto_modes_supported, + reference_ksm->crypto_modes_supported, + sizeof(target_ksm->crypto_modes_supported)); + + target_ksm->max_dun_bytes_supported = + reference_ksm->max_dun_bytes_supported; +} +EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities); + /** * blk_ksm_init_passthrough() - Init a passthrough keyslot manager * @ksm: The keyslot manager to init diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h index 323e15dd6fa7..164568f52be7 100644 --- a/include/linux/keyslot-manager.h +++ b/include/linux/keyslot-manager.h @@ -103,6 +103,15 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); void blk_ksm_destroy(struct blk_keyslot_manager *ksm); +void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent, + const struct blk_keyslot_manager *child); + void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm); +bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset, + struct blk_keyslot_manager *ksm_subset); + +void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm, + struct blk_keyslot_manager *reference_ksm); + #endif /* __LINUX_KEYSLOT_MANAGER_H */ From aa6ce87a768226802f9a231b3909fe81c503852c Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Mon, 1 Feb 2021 05:10:17 +0000 Subject: [PATCH 27/31] dm: add support for passing through inline crypto support Update the device-mapper core to support exposing the inline crypto support of the underlying device(s) through the device-mapper device. This works by creating a "passthrough keyslot manager" for the dm device, which declares support for encryption settings which all underlying devices support. When a supported setting is used, the bio cloning code handles cloning the crypto context to the bios for all the underlying devices. When an unsupported setting is used, the blk-crypto fallback is used as usual. Crypto support on each underlying device is ignored unless the corresponding dm target opts into exposing it. This is needed because for inline crypto to semantically operate on the original bio, the data must not be transformed by the dm target. Thus, targets like dm-linear can expose crypto support of the underlying device, but targets like dm-crypt can't. (dm-crypt could use inline crypto itself, though.) A DM device's table can only be changed if the "new" inline encryption capabilities are a (*not* necessarily strict) superset of the "old" inline encryption capabilities. Attempts to make changes to the table that result in some inline encryption capability becoming no longer supported will be rejected. For the sake of clarity, key eviction from underlying devices will be handled in a future patch. Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Signed-off-by: Satya Tangirala Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 5 ++ drivers/md/dm-table.c | 162 ++++++++++++++++++++++++++++++++++ drivers/md/dm.c | 18 +++- include/linux/device-mapper.h | 11 +++ include/uapi/linux/dm-ioctl.h | 4 +- 5 files changed, 197 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 086d293c2b03..bf3e66f39a4a 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -162,6 +163,10 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct blk_keyslot_manager *ksm; +#endif }; static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2bc256d61550..8c4ed3ec9409 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -187,6 +187,8 @@ static void free_devices(struct list_head *devices, struct mapped_device *md) } } +static void dm_table_destroy_keyslot_manager(struct dm_table *t); + void dm_table_destroy(struct dm_table *t) { unsigned int i; @@ -215,6 +217,8 @@ void dm_table_destroy(struct dm_table *t) dm_free_md_mempools(t->mempools); + dm_table_destroy_keyslot_manager(t); + kfree(t); } @@ -1203,6 +1207,157 @@ static int dm_table_register_integrity(struct dm_table *t) return 0; } +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +struct dm_keyslot_manager { + struct blk_keyslot_manager ksm; + struct mapped_device *md; +}; + +static int device_intersect_crypto_modes(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct blk_keyslot_manager *parent = data; + struct blk_keyslot_manager *child = bdev_get_queue(dev->bdev)->ksm; + + blk_ksm_intersect_modes(parent, child); + return 0; +} + +void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm) +{ + struct dm_keyslot_manager *dksm = container_of(ksm, + struct dm_keyslot_manager, + ksm); + + if (!ksm) + return; + + blk_ksm_destroy(ksm); + kfree(dksm); +} + +static void dm_table_destroy_keyslot_manager(struct dm_table *t) +{ + dm_destroy_keyslot_manager(t->ksm); + t->ksm = NULL; +} + +/* + * Constructs and initializes t->ksm with a keyslot manager that + * represents the common set of crypto capabilities of the devices + * described by the dm_table. However, if the constructed keyslot + * manager does not support a superset of the crypto capabilities + * supported by the current keyslot manager of the mapped_device, + * it returns an error instead, since we don't support restricting + * crypto capabilities on table changes. Finally, if the constructed + * keyslot manager doesn't actually support any crypto modes at all, + * it just returns NULL. + */ +static int dm_table_construct_keyslot_manager(struct dm_table *t) +{ + struct dm_keyslot_manager *dksm; + struct blk_keyslot_manager *ksm; + struct dm_target *ti; + unsigned int i; + bool ksm_is_empty = true; + + dksm = kmalloc(sizeof(*dksm), GFP_KERNEL); + if (!dksm) + return -ENOMEM; + dksm->md = t->md; + + ksm = &dksm->ksm; + blk_ksm_init_passthrough(ksm); + ksm->max_dun_bytes_supported = UINT_MAX; + memset(ksm->crypto_modes_supported, 0xFF, + sizeof(ksm->crypto_modes_supported)); + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (!dm_target_passes_crypto(ti->type)) { + blk_ksm_intersect_modes(ksm, NULL); + break; + } + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, device_intersect_crypto_modes, + ksm); + } + + if (t->md->queue && !blk_ksm_is_superset(ksm, t->md->queue->ksm)) { + DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!"); + dm_destroy_keyslot_manager(ksm); + return -EINVAL; + } + + /* + * If the new KSM doesn't actually support any crypto modes, we may as + * well represent it with a NULL ksm. + */ + ksm_is_empty = true; + for (i = 0; i < ARRAY_SIZE(ksm->crypto_modes_supported); i++) { + if (ksm->crypto_modes_supported[i]) { + ksm_is_empty = false; + break; + } + } + + if (ksm_is_empty) { + dm_destroy_keyslot_manager(ksm); + ksm = NULL; + } + + /* + * t->ksm is only set temporarily while the table is being set + * up, and it gets set to NULL after the capabilities have + * been transferred to the request_queue. + */ + t->ksm = ksm; + + return 0; +} + +static void dm_update_keyslot_manager(struct request_queue *q, + struct dm_table *t) +{ + if (!t->ksm) + return; + + /* Make the ksm less restrictive */ + if (!q->ksm) { + blk_ksm_register(t->ksm, q); + } else { + blk_ksm_update_capabilities(q->ksm, t->ksm); + dm_destroy_keyslot_manager(t->ksm); + } + t->ksm = NULL; +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static int dm_table_construct_keyslot_manager(struct dm_table *t) +{ + return 0; +} + +void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm) +{ +} + +static void dm_table_destroy_keyslot_manager(struct dm_table *t) +{ +} + +static void dm_update_keyslot_manager(struct request_queue *q, + struct dm_table *t) +{ +} + +#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ + /* * Prepares the table for use by building the indices, * setting the type, and allocating mempools. @@ -1229,6 +1384,12 @@ int dm_table_complete(struct dm_table *t) return r; } + r = dm_table_construct_keyslot_manager(t); + if (r) { + DMERR("could not construct keyslot manager."); + return r; + } + r = dm_table_alloc_md_mempools(t, t->md); if (r) DMERR("unable to allocate mempools"); @@ -1863,6 +2024,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } #endif + dm_update_keyslot_manager(q, t); blk_queue_update_readahead(q); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 64fda1429e39..7021aea82aa4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -28,6 +28,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "core" @@ -1722,6 +1723,19 @@ static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); +#ifdef CONFIG_BLK_INLINE_ENCRYPTION +static void dm_queue_destroy_keyslot_manager(struct request_queue *q) +{ + dm_destroy_keyslot_manager(q->ksm); +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q) +{ +} +#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ + static void cleanup_mapped_device(struct mapped_device *md) { if (md->wq) @@ -1743,8 +1757,10 @@ static void cleanup_mapped_device(struct mapped_device *md) put_disk(md->disk); } - if (md->queue) + if (md->queue) { + dm_queue_destroy_keyslot_manager(md->queue); blk_cleanup_queue(md->queue); + } cleanup_srcu_struct(&md->io_barrier); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 61a66fb8ebb3..47588130ef5e 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -257,6 +257,12 @@ struct target_type { #define DM_TARGET_NOWAIT 0x00000080 #define dm_target_supports_nowait(type) ((type)->features & DM_TARGET_NOWAIT) +/* + * A target supports passing through inline crypto support. + */ +#define DM_TARGET_PASSES_CRYPTO 0x00000100 +#define dm_target_passes_crypto(type) ((type)->features & DM_TARGET_PASSES_CRYPTO) + struct dm_target { struct dm_table *table; struct target_type *type; @@ -533,6 +539,11 @@ void dm_table_run_md_queue_async(struct dm_table *t); struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *t); +/* + * Table keyslot manager functions + */ +void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm); + /* * A wrapper around vmalloc. */ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 4933b6b67b85..fcff6669137b 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -272,9 +272,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 43 +#define DM_VERSION_MINOR 44 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2020-10-01)" +#define DM_VERSION_EXTRA "-ioctl (2021-02-01)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ From 9355a9eb21a5c9b859ec838beb1874eef2e2a6d9 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Mon, 1 Feb 2021 05:10:18 +0000 Subject: [PATCH 28/31] dm: support key eviction from keyslot managers of underlying devices Now that device mapper supports inline encryption, add the ability to evict keys from all underlying devices. When an upper layer requests a key eviction, we simply iterate through all underlying devices and evict that key from each device. Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Signed-off-by: Satya Tangirala Signed-off-by: Mike Snitzer --- block/blk-crypto.c | 1 + drivers/md/dm-table.c | 53 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 09fcb18fa778..c5bdaafffa29 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -409,3 +409,4 @@ int blk_crypto_evict_key(struct request_queue *q, */ return blk_crypto_fallback_evict_key(key); } +EXPORT_SYMBOL_GPL(blk_crypto_evict_key); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 8c4ed3ec9409..95391f78b8d5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1214,6 +1214,58 @@ struct dm_keyslot_manager { struct mapped_device *md; }; +struct dm_keyslot_evict_args { + const struct blk_crypto_key *key; + int err; +}; + +static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_keyslot_evict_args *args = data; + int err; + + err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key); + if (!args->err) + args->err = err; + /* Always try to evict the key from all devices. */ + return 0; +} + +/* + * When an inline encryption key is evicted from a device-mapper device, evict + * it from all the underlying devices. + */ +static int dm_keyslot_evict(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, unsigned int slot) +{ + struct dm_keyslot_manager *dksm = container_of(ksm, + struct dm_keyslot_manager, + ksm); + struct mapped_device *md = dksm->md; + struct dm_keyslot_evict_args args = { key }; + struct dm_table *t; + int srcu_idx; + int i; + struct dm_target *ti; + + t = dm_get_live_table(md, &srcu_idx); + if (!t) + return 0; + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args); + } + dm_put_live_table(md, srcu_idx); + return args.err; +} + +static struct blk_ksm_ll_ops dm_ksm_ll_ops = { + .keyslot_evict = dm_keyslot_evict, +}; + static int device_intersect_crypto_modes(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -1270,6 +1322,7 @@ static int dm_table_construct_keyslot_manager(struct dm_table *t) ksm = &dksm->ksm; blk_ksm_init_passthrough(ksm); + ksm->ksm_ll_ops = dm_ksm_ll_ops; ksm->max_dun_bytes_supported = UINT_MAX; memset(ksm->crypto_modes_supported, 0xFF, sizeof(ksm->crypto_modes_supported)); From 3db564b4f5925f126c36cc033dfdbec0b6a785a9 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Mon, 1 Feb 2021 05:10:19 +0000 Subject: [PATCH 29/31] dm: set DM_TARGET_PASSES_CRYPTO feature for some targets dm-linear and dm-flakey obviously can pass through inline crypto support. Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Signed-off-by: Satya Tangirala Signed-off-by: Mike Snitzer --- drivers/md/dm-flakey.c | 4 +++- drivers/md/dm-linear.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index a2cc9e45cbba..30c6bc151213 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -482,8 +482,10 @@ static struct target_type flakey_target = { .name = "flakey", .version = {1, 5, 0}, #ifdef CONFIG_BLK_DEV_ZONED - .features = DM_TARGET_ZONED_HM, + .features = DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, .report_zones = flakey_report_zones, +#else + .features = DM_TARGET_PASSES_CRYPTO, #endif .module = THIS_MODULE, .ctr = flakey_ctr, diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 00774b5d7668..fc9c4272c10d 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -229,10 +229,11 @@ static struct target_type linear_target = { .version = {1, 4, 0}, #ifdef CONFIG_BLK_DEV_ZONED .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | - DM_TARGET_ZONED_HM, + DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, .report_zones = linear_report_zones, #else - .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | + DM_TARGET_PASSES_CRYPTO, #endif .module = THIS_MODULE, .ctr = linear_ctr, From e3290b9491ff5b7ee40f9e0a4c06821988a2a2bf Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 10 Feb 2021 17:38:30 -0500 Subject: [PATCH 30/31] dm: simplify target code conditional on CONFIG_BLK_DEV_ZONED Allow removal of CONFIG_BLK_DEV_ZONED conditionals in target_type definition of various targets. Suggested-by: Eric Biggers Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 6 ++---- drivers/md/dm-flakey.c | 6 ++---- drivers/md/dm-linear.c | 7 ++----- include/linux/device-mapper.h | 16 ++++++++++++++-- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ee20b586f526..ae0f0a4e3689 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3134,7 +3134,6 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar } #ifdef CONFIG_BLK_DEV_ZONED - static int crypt_report_zones(struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones) { @@ -3145,7 +3144,8 @@ static int crypt_report_zones(struct dm_target *ti, return blkdev_report_zones(cc->dev->bdev, sector, nr_zones, dm_report_zones_cb, args); } - +#else +#define crypt_report_zones NULL #endif /* @@ -3580,10 +3580,8 @@ static struct target_type crypt_target = { .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, -#ifdef CONFIG_BLK_DEV_ZONED .features = DM_TARGET_ZONED_HM, .report_zones = crypt_report_zones, -#endif .map = crypt_map, .status = crypt_status, .postsuspend = crypt_postsuspend, diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 30c6bc151213..b7fee9936f05 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -469,6 +469,8 @@ static int flakey_report_zones(struct dm_target *ti, return blkdev_report_zones(fc->dev->bdev, sector, nr_zones, dm_report_zones_cb, args); } +#else +#define flakey_report_zones NULL #endif static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) @@ -481,12 +483,8 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_ static struct target_type flakey_target = { .name = "flakey", .version = {1, 5, 0}, -#ifdef CONFIG_BLK_DEV_ZONED .features = DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, .report_zones = flakey_report_zones, -#else - .features = DM_TARGET_PASSES_CRYPTO, -#endif .module = THIS_MODULE, .ctr = flakey_ctr, .dtr = flakey_dtr, diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index fc9c4272c10d..92db0f5e7f28 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -146,6 +146,8 @@ static int linear_report_zones(struct dm_target *ti, return blkdev_report_zones(lc->dev->bdev, sector, nr_zones, dm_report_zones_cb, args); } +#else +#define linear_report_zones NULL #endif static int linear_iterate_devices(struct dm_target *ti, @@ -227,14 +229,9 @@ static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, static struct target_type linear_target = { .name = "linear", .version = {1, 4, 0}, -#ifdef CONFIG_BLK_DEV_ZONED .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, .report_zones = linear_report_zones, -#else - .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | - DM_TARGET_PASSES_CRYPTO, -#endif .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 47588130ef5e..c98d847b0f0b 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -93,9 +93,18 @@ typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv, typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev); +#ifdef CONFIG_BLK_DEV_ZONED typedef int (*dm_report_zones_fn) (struct dm_target *ti, struct dm_report_zones_args *args, unsigned int nr_zones); +#else +/* + * Define dm_report_zones_fn so that targets can assign to NULL if + * CONFIG_BLK_DEV_ZONED disabled. Otherwise each target needs to do + * awkward #ifdefs in their target_type, etc. + */ +typedef int (*dm_report_zones_fn) (struct dm_target *dummy); +#endif /* * These iteration functions are typically used to check (and combine) @@ -187,9 +196,7 @@ struct target_type { dm_status_fn status; dm_message_fn message; dm_prepare_ioctl_fn prepare_ioctl; -#ifdef CONFIG_BLK_DEV_ZONED dm_report_zones_fn report_zones; -#endif dm_busy_fn busy; dm_iterate_devices_fn iterate_devices; dm_io_hints_fn io_hints; @@ -248,8 +255,13 @@ struct target_type { /* * Indicates that a target supports host-managed zoned block devices. */ +#ifdef CONFIG_BLK_DEV_ZONED #define DM_TARGET_ZONED_HM 0x00000040 #define dm_target_supports_zoned_hm(type) ((type)->features & DM_TARGET_ZONED_HM) +#else +#define DM_TARGET_ZONED_HM 0x00000000 +#define dm_target_supports_zoned_hm(type) (false) +#endif /* * A target handles REQ_NOWAIT From a666e5c05e7c4aaabb2c5d58117b0946803d03d2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 10 Feb 2021 15:26:23 -0500 Subject: [PATCH 31/31] dm: fix deadlock when swapping to encrypted device The system would deadlock when swapping to a dm-crypt device. The reason is that for each incoming write bio, dm-crypt allocates memory that holds encrypted data. These excessive allocations exhaust all the memory and the result is either deadlock or OOM trigger. This patch limits the number of in-flight swap bios, so that the memory consumed by dm-crypt is limited. The limit is enforced if the target set the "limit_swap_bios" variable and if the bio has REQ_SWAP set. Non-swap bios are not affected becuase taking the semaphore would cause performance degradation. This is similar to request-based drivers - they will also block when the number of requests is over the limit. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 4 +++ drivers/md/dm-crypt.c | 1 + drivers/md/dm.c | 60 +++++++++++++++++++++++++++++++++++ include/linux/device-mapper.h | 5 +++ 4 files changed, 70 insertions(+) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index bf3e66f39a4a..5953ff2bd260 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -103,6 +103,10 @@ struct mapped_device { /* kobject and completion */ struct dm_kobject_holder kobj_holder; + int swap_bios; + struct semaphore swap_bios_semaphore; + struct mutex swap_bios_lock; + struct dm_stats stats; /* for blk-mq request-based DM support */ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index ae0f0a4e3689..11c105ecd165 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3342,6 +3342,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) wake_up_process(cc->write_thread); ti->num_flush_bios = 1; + ti->limit_swap_bios = true; return 0; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7021aea82aa4..50b693d776d6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -153,6 +153,16 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); #define DM_NUMA_NODE NUMA_NO_NODE static int dm_numa_node = DM_NUMA_NODE; +#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE) +static int swap_bios = DEFAULT_SWAP_BIOS; +static int get_swap_bios(void) +{ + int latch = READ_ONCE(swap_bios); + if (unlikely(latch <= 0)) + latch = DEFAULT_SWAP_BIOS; + return latch; +} + /* * For mempools pre-allocation at the table loading time. */ @@ -974,6 +984,11 @@ void disable_write_zeroes(struct mapped_device *md) limits->max_write_zeroes_sectors = 0; } +static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) +{ + return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); +} + static void clone_endio(struct bio *bio) { blk_status_t error = bio->bi_status; @@ -1025,6 +1040,11 @@ static void clone_endio(struct bio *bio) } } + if (unlikely(swap_bios_limit(tio->ti, bio))) { + struct mapped_device *md = io->md; + up(&md->swap_bios_semaphore); + } + free_tio(tio); dec_pending(io, error); } @@ -1258,6 +1278,22 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); +static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) +{ + mutex_lock(&md->swap_bios_lock); + while (latch < md->swap_bios) { + cond_resched(); + down(&md->swap_bios_semaphore); + md->swap_bios--; + } + while (latch > md->swap_bios) { + cond_resched(); + up(&md->swap_bios_semaphore); + md->swap_bios++; + } + mutex_unlock(&md->swap_bios_lock); +} + static blk_qc_t __map_bio(struct dm_target_io *tio) { int r; @@ -1277,6 +1313,14 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) atomic_inc(&io->io_count); sector = clone->bi_iter.bi_sector; + if (unlikely(swap_bios_limit(ti, clone))) { + struct mapped_device *md = io->md; + int latch = get_swap_bios(); + if (unlikely(latch != md->swap_bios)) + __set_swap_bios_limit(md, latch); + down(&md->swap_bios_semaphore); + } + r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: @@ -1287,10 +1331,18 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) ret = submit_bio_noacct(clone); break; case DM_MAPIO_KILL: + if (unlikely(swap_bios_limit(ti, clone))) { + struct mapped_device *md = io->md; + up(&md->swap_bios_semaphore); + } free_tio(tio); dec_pending(io, BLK_STS_IOERR); break; case DM_MAPIO_REQUEUE: + if (unlikely(swap_bios_limit(ti, clone))) { + struct mapped_device *md = io->md; + up(&md->swap_bios_semaphore); + } free_tio(tio); dec_pending(io, BLK_STS_DM_REQUEUE); break; @@ -1767,6 +1819,7 @@ static void cleanup_mapped_device(struct mapped_device *md) mutex_destroy(&md->suspend_lock); mutex_destroy(&md->type_lock); mutex_destroy(&md->table_devices_lock); + mutex_destroy(&md->swap_bios_lock); dm_mq_cleanup_mapped_device(md); } @@ -1834,6 +1887,10 @@ static struct mapped_device *alloc_dev(int minor) init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); + md->swap_bios = get_swap_bios(); + sema_init(&md->swap_bios_semaphore, md->swap_bios); + mutex_init(&md->swap_bios_lock); + md->disk->major = _major; md->disk->first_minor = minor; md->disk->fops = &dm_blk_dops; @@ -3117,6 +3174,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); +module_param(swap_bios, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber "); MODULE_LICENSE("GPL"); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index c98d847b0f0b..7f4ac87c0b32 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -343,6 +343,11 @@ struct dm_target { * whether or not its underlying devices have support. */ bool discards_supported:1; + + /* + * Set if we need to limit the number of in-flight bios when swapping. + */ + bool limit_swap_bios:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size);