
Merge tag 'for-4.18/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Adjust various DM structure members to improve alignment relative to
   4.18 block's mempool_t and bioset changes.

 - Add DM writecache target that offers writeback caching to persistent
   memory or SSD.

 - Small DM core error message change to give context for why a DM table
   type transition wasn't allowed.

* tag 'for-4.18/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm: add writecache target
  dm: adjust structure members to improve alignment
  dm: report which conflicting type caused error during table_load()
Linus Torvalds 2018-06-12 18:12:08 -07:00
commit 4597fcff07
14 changed files with 2472 additions and 80 deletions

Documentation/device-mapper/writecache.txt (new file)

@@ -0,0 +1,68 @@
The writecache target caches writes on persistent memory or on SSD. It
doesn't cache reads because reads are supposed to be cached in page cache
in normal RAM.

When the device is constructed, the first sector should be zeroed or the
first sector should contain a valid superblock from a previous invocation.

Constructor parameters:
1. type of the cache device - "p" or "s"
        p - persistent memory
        s - SSD
2. the underlying device that will be cached
3. the cache device
4. block size (4096 is recommended; the maximum block size is the page
   size)
5. the number of optional parameters (the parameters with an argument
   count as two)
        high_watermark n        (default: 50)
                start writeback when the number of used blocks reaches
                this watermark
        low_watermark x         (default: 45)
                stop writeback when the number of used blocks drops below
                this watermark
        writeback_jobs n        (default: unlimited)
                limit the number of blocks that are in flight during
                writeback. Setting this value reduces writeback
                throughput, but it may improve latency of read requests
        autocommit_blocks n     (default: 64 for pmem, 65536 for ssd)
                when the application writes this number of blocks without
                issuing the FLUSH request, the blocks are automatically
                committed
        autocommit_time ms      (default: 1000)
                autocommit time in milliseconds. The data is automatically
                committed if this time passes and no FLUSH request is
                received
        fua                     (by default on)
                applicable only to persistent memory - use the FUA flag
                when writing data from persistent memory back to the
                underlying device
        nofua
                applicable only to persistent memory - don't use the FUA
                flag when writing back data and send the FLUSH request
                afterwards
                - some underlying devices perform better with fua, some
                  with nofua. The user should test it

Status:
1. error indicator - 0 if there was no error, otherwise error number
2. the number of blocks
3. the number of free blocks
4. the number of blocks under writeback

Messages:
        flush
                flush the cache device. The message returns successfully
                if the cache device was flushed without an error
        flush_on_suspend
                flush the cache device on next suspend. Use this message
                when you are going to remove the cache device. The proper
                sequence for removing the cache device is:
                1. send the "flush_on_suspend" message
                2. load an inactive table with a linear target that maps
                   to the underlying device
                3. suspend the device
                4. ask for status and verify that there are no errors
                5. resume the device, so that it will use the linear
                   target
                6. the cache device is now inactive and it can be deleted
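
As a usage sketch of the constructor parameters above (the device paths, the
table name "wc" and the watermark values are illustrative assumptions, not
part of the target's documentation), an SSD-backed writecache device could be
created with:

        # cache /dev/vg/origin on the SSD /dev/vg/cache, 4096-byte blocks
        dmsetup create wc --table "0 $(blockdev --getsz /dev/vg/origin) \
                writecache s /dev/vg/origin /dev/vg/cache 4096 4 high_watermark 60 low_watermark 50"

The trailing count is 4 because high_watermark and low_watermark each take an
argument and therefore each count as two optional parameters.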
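
The flush_on_suspend removal sequence above maps onto dmsetup as follows
(again a sketch under the same assumed names; the final deletion step depends
on how the cache device was created, e.g. lvremove for an LV):

        dmsetup message wc 0 flush_on_suspend
        dmsetup load wc --table "0 $(blockdev --getsz /dev/vg/origin) linear /dev/vg/origin 0"
        dmsetup suspend wc    # the target flushes the cache during this suspend
        dmsetup status wc     # first status field must be 0, i.e. no error
        dmsetup resume wc     # swaps in the inactive linear table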

drivers/md/Kconfig

@@ -334,6 +334,17 @@ config DM_CACHE_SMQ
          of less memory utilization, improved performance and increased
          adaptability in the face of changing workloads.

+config DM_WRITECACHE
+       tristate "Writecache target"
+       depends on BLK_DEV_DM
+       ---help---
+          The writecache target caches writes on persistent memory or SSD.
+          It is intended for databases or other programs that need extremely
+          low commit latency.
+
+          The writecache target doesn't cache reads because reads are supposed
+          to be cached in standard RAM.
+
config DM_ERA
       tristate "Era target (EXPERIMENTAL)"
       depends on BLK_DEV_DM
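
For reference, a modular build of the new target would be enabled with the
line below in .config (a sketch; the dm-writecache module name follows from
the Makefile hunk further down), after which the running kernel can load it
with "modprobe dm-writecache":

        CONFIG_DM_WRITECACHE=m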

drivers/md/Makefile

@@ -67,6 +67,7 @@ obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES)     += dm-log-writes.o
obj-$(CONFIG_DM_INTEGRITY)      += dm-integrity.o
obj-$(CONFIG_DM_ZONED)          += dm-zoned.o
+obj-$(CONFIG_DM_WRITECACHE)    += dm-writecache.o

ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs                     += dm-uevent.o

drivers/md/dm-bio-prison-v1.c

@@ -19,8 +19,8 @@
struct dm_bio_prison {
        spinlock_t lock;
-       mempool_t cell_pool;
        struct rb_root cells;
+       mempool_t cell_pool;
};

static struct kmem_cache *_cell_cache;
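
The cell_pool move above is representative of the whole alignment commit: now
that mempool_t is embedded in the structure rather than referenced through a
pointer, member order decides where padding holes fall. A self-contained
sketch of the underlying rule (a made-up struct, not kernel code; kernel
struct layouts are typically inspected with pahole):

        #include <stdio.h>

        /* 4-byte members interleaved with 8-byte pointers leave 4-byte holes */
        struct padded {
                void *a;
                int flag1;      /* followed by 4 bytes of padding */
                void *b;
                int flag2;      /* followed by 4 bytes of tail padding */
        };

        /* grouping members by alignment lets the two ints share one 8-byte slot */
        struct reordered {
                void *a;
                void *b;
                int flag1;
                int flag2;
        };

        int main(void)
        {
                printf("padded:    %zu bytes\n", sizeof(struct padded));    /* 32 on LP64 */
                printf("reordered: %zu bytes\n", sizeof(struct reordered)); /* 24 on LP64 */
                return 0;
        }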

drivers/md/dm-bio-prison-v2.c

@@ -21,8 +21,8 @@ struct dm_bio_prison_v2 {
        struct workqueue_struct *wq;

        spinlock_t lock;
-       mempool_t cell_pool;
        struct rb_root cells;
+       mempool_t cell_pool;
};

static struct kmem_cache *_cell_cache;

drivers/md/dm-cache-target.c

@@ -371,7 +371,13 @@ struct cache_stats {
struct cache {
        struct dm_target *ti;
-       struct dm_target_callbacks callbacks;
+       spinlock_t lock;
+
+       /*
+        * Fields for converting from sectors to blocks.
+        */
+       int sectors_per_block_shift;
+       sector_t sectors_per_block;

        struct dm_cache_metadata *cmd;
@@ -402,13 +408,11 @@ struct cache {
        dm_cblock_t cache_size;

        /*
-        * Fields for converting from sectors to blocks.
+        * Invalidation fields.
         */
-       sector_t sectors_per_block;
-       int sectors_per_block_shift;
+       spinlock_t invalidation_lock;
+       struct list_head invalidation_requests;

-       spinlock_t lock;
-       struct bio_list deferred_bios;
        sector_t migration_threshold;
        wait_queue_head_t migration_wait;
        atomic_t nr_allocated_migrations;
@@ -419,13 +423,11 @@ struct cache {
         */
        atomic_t nr_io_migrations;

+       struct bio_list deferred_bios;
+
        struct rw_semaphore quiesce_lock;

-       /*
-        * cache_size entries, dirty if set
-        */
-       atomic_t nr_dirty;
-       unsigned long *dirty_bitset;
+       struct dm_target_callbacks callbacks;

        /*
         * origin_blocks entries, discarded if set.
@@ -442,24 +444,20 @@ struct cache {
        const char **ctr_args;

        struct dm_kcopyd_client *copier;
-       struct workqueue_struct *wq;
        struct work_struct deferred_bio_worker;
        struct work_struct migration_worker;
+       struct workqueue_struct *wq;
        struct delayed_work waker;
        struct dm_bio_prison_v2 *prison;
-       struct bio_set bs;

-       mempool_t migration_pool;
+       /*
+        * cache_size entries, dirty if set
+        */
+       unsigned long *dirty_bitset;
+       atomic_t nr_dirty;

-       struct dm_cache_policy *policy;
        unsigned policy_nr_args;
+       struct dm_cache_policy *policy;

-       bool need_tick_bio:1;
-       bool sized:1;
-       bool invalidate:1;
-       bool commit_requested:1;
-       bool loaded_mappings:1;
-       bool loaded_discards:1;

        /*
         * Cache features such as write-through.
@@ -468,18 +466,23 @@ struct cache {
        struct cache_stats stats;

-       /*
-        * Invalidation fields.
-        */
-       spinlock_t invalidation_lock;
-       struct list_head invalidation_requests;
+       bool need_tick_bio:1;
+       bool sized:1;
+       bool invalidate:1;
+       bool commit_requested:1;
+       bool loaded_mappings:1;
+       bool loaded_discards:1;

-       struct rw_semaphore background_work_lock;
-       struct batcher committer;
-       struct work_struct commit_ws;
        struct io_tracker tracker;

+       struct work_struct commit_ws;
+       struct batcher committer;
+       mempool_t migration_pool;
+
+       struct rw_semaphore background_work_lock;
+
+       struct bio_set bs;
};
struct per_bio_data {

drivers/md/dm-core.h

@@ -31,6 +31,9 @@ struct dm_kobject_holder {
struct mapped_device {
        struct mutex suspend_lock;

+       struct mutex table_devices_lock;
+       struct list_head table_devices;
+
        /*
         * The current mapping (struct dm_table *).
         * Use dm_get_live_table{_fast} or take suspend_lock for
@@ -38,17 +41,14 @@ struct mapped_device {
         */
        void __rcu *map;

-       struct list_head table_devices;
-       struct mutex table_devices_lock;
-
        unsigned long flags;

-       struct request_queue *queue;
-       int numa_node_id;
-
-       enum dm_queue_mode type;
        /* Protect queue and type against concurrent access. */
        struct mutex type_lock;
+       enum dm_queue_mode type;
+
+       int numa_node_id;
+       struct request_queue *queue;

        atomic_t holders;
        atomic_t open_count;
@@ -56,21 +56,21 @@ struct mapped_device {
        struct dm_target *immutable_target;
        struct target_type *immutable_target_type;

-       char name[16];
        struct gendisk *disk;
        struct dax_device *dax_dev;
+       char name[16];

-       void *interface_ptr;
-
        /*
         * A list of ios that arrived while we were suspended.
         */
-       atomic_t pending[2];
-       wait_queue_head_t wait;
        struct work_struct work;
+       wait_queue_head_t wait;
+       atomic_t pending[2];
        spinlock_t deferred_lock;
        struct bio_list deferred;
+
+       void *interface_ptr;

        /*
         * Event handling.
@@ -83,17 +83,17 @@ struct mapped_device {
        /* the number of internal suspends */
        unsigned internal_suspend_count;

-       /*
-        * Processing queue (flush)
-        */
-       struct workqueue_struct *wq;
-
        /*
         * io objects are allocated from here.
         */
        struct bio_set io_bs;
        struct bio_set bs;

+       /*
+        * Processing queue (flush)
+        */
+       struct workqueue_struct *wq;
+
        /*
         * freeze/thaw support require holding onto a super block
         */
@@ -102,11 +102,11 @@ struct mapped_device {
        /* forced geometry settings */
        struct hd_geometry geometry;

-       struct block_device *bdev;
-
        /* kobject and completion */
        struct dm_kobject_holder kobj_holder;

+       struct block_device *bdev;
+
        /* zero-length flush that will be cloned and submitted to targets */
        struct bio flush_bio;

drivers/md/dm-crypt.c

@@ -139,25 +139,13 @@ struct crypt_config {
        struct dm_dev *dev;
        sector_t start;

-       /*
-        * pool for per bio private data, crypto requests,
-        * encryption requeusts/buffer pages and integrity tags
-        */
-       mempool_t req_pool;
-       mempool_t page_pool;
-       mempool_t tag_pool;
-       unsigned tag_pool_max_sectors;
-
        struct percpu_counter n_allocated_pages;

-       struct bio_set bs;
-       struct mutex bio_alloc_lock;
-
        struct workqueue_struct *io_queue;
        struct workqueue_struct *crypt_queue;

-       struct task_struct *write_thread;
        wait_queue_head_t write_thread_wait;
+       struct task_struct *write_thread;
        struct rb_root write_tree;

        char *cipher;
@@ -213,6 +201,18 @@ struct crypt_config {
        unsigned int integrity_iv_size;
        unsigned int on_disk_tag_size;

+       /*
+        * pool for per bio private data, crypto requests,
+        * encryption requeusts/buffer pages and integrity tags
+        */
+       unsigned tag_pool_max_sectors;
+       mempool_t tag_pool;
+       mempool_t req_pool;
+       mempool_t page_pool;
+
+       struct bio_set bs;
+       struct mutex bio_alloc_lock;
+
        u8 *authenc_key; /* space for keys in authenc() format (if used) */
        u8 key[0];
};

drivers/md/dm-ioctl.c

@@ -1344,7 +1344,8 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_size)
                        goto err_unlock_md_type;
                }
        } else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) {
-               DMWARN("can't change device type after initial table load.");
+               DMWARN("can't change device type (old=%u vs new=%u) after initial table load.",
+                      dm_get_md_type(md), dm_table_get_type(t));
                r = -EINVAL;
                goto err_unlock_md_type;
        }
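
For illustration only (the numeric values are an assumption here; they depend
on enum dm_queue_mode in this tree), attempting to replace a bio-based table
with a request-based one would now warn along the lines of:

        device-mapper: ioctl: can't change device type (old=1 vs new=2) after initial table load.

instead of leaving the administrator to guess which two types conflicted.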

drivers/md/dm-kcopyd.c

@@ -45,7 +45,6 @@ struct dm_kcopyd_client {
        struct dm_io_client *io_client;

        wait_queue_head_t destroyq;
-       atomic_t nr_jobs;

        mempool_t job_pool;

@@ -54,6 +53,8 @@ struct dm_kcopyd_client {
        struct dm_kcopyd_throttle *throttle;

+       atomic_t nr_jobs;
+
        /*
         * We maintain three lists of jobs:
         *

drivers/md/dm-region-hash.c

@@ -63,28 +63,29 @@ struct dm_region_hash {
        /* hash table */
        rwlock_t hash_lock;
-       mempool_t region_pool;
        unsigned mask;
        unsigned nr_buckets;
        unsigned prime;
        unsigned shift;
        struct list_head *buckets;

-       unsigned max_recovery; /* Max # of regions to recover in parallel */
-
-       spinlock_t region_lock;
-       atomic_t recovery_in_flight;
-       struct semaphore recovery_count;
-       struct list_head clean_regions;
-       struct list_head quiesced_regions;
-       struct list_head recovered_regions;
-       struct list_head failed_recovered_regions;
-
        /*
         * If there was a flush failure no regions can be marked clean.
         */
        int flush_failure;

+       unsigned max_recovery; /* Max # of regions to recover in parallel */
+
+       spinlock_t region_lock;
+       atomic_t recovery_in_flight;
+       struct list_head clean_regions;
+       struct list_head quiesced_regions;
+       struct list_head recovered_regions;
+       struct list_head failed_recovered_regions;
+       struct semaphore recovery_count;
+
+       mempool_t region_pool;
+
        void *context;
        sector_t target_begin;

drivers/md/dm-thin.c

@@ -240,9 +240,9 @@ struct pool {
        struct dm_bio_prison *prison;
        struct dm_kcopyd_client *copier;

+       struct work_struct worker;
        struct workqueue_struct *wq;
        struct throttle throttle;
-       struct work_struct worker;
        struct delayed_work waker;
        struct delayed_work no_space_timeout;
@@ -260,7 +260,6 @@ struct pool {
        struct dm_deferred_set *all_io_ds;

        struct dm_thin_new_mapping *next_mapping;
-       mempool_t mapping_pool;

        process_bio_fn process_bio;
        process_bio_fn process_discard;
@@ -273,6 +272,8 @@ struct pool {
        process_mapping_fn process_prepared_discard_pt2;

        struct dm_bio_prison_cell **cell_sort_array;
+
+       mempool_t mapping_pool;
};
static enum pool_mode get_pool_mode(struct pool *pool);

drivers/md/dm-writecache.c (new file): diff suppressed because it is too large.

drivers/md/dm-zoned-target.c

@@ -52,9 +52,9 @@ struct dmz_target {
        struct dmz_reclaim *reclaim;

        /* For chunk work */
-       struct mutex chunk_lock;
        struct radix_tree_root chunk_rxtree;
        struct workqueue_struct *chunk_wq;
+       struct mutex chunk_lock;

        /* For cloned BIOs to zones */
        struct bio_set bio_set;