Merge tag 'for-4.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - A major update for DM cache that reduces the latency for deciding
   whether blocks should migrate to/from the cache. The bio-prison-v2
   interface supports this improvement by enabling direct dispatch of
   work to workqueues rather than having to delay the actual work
   dispatch to the DM cache core. So the dm-cache policies are much more
   nimble by being able to drive IO as they see fit. One immediate
   benefit from the improved latency is a cache that should be much more
   adaptive to changing workloads.

 - Add a new DM integrity target that emulates a block device that has
   additional per-sector tags that can be used for storing integrity
   information.

 - Add a new authenticated encryption feature to the DM crypt target
   that builds on the capabilities provided by the DM integrity target.

 - Add MD interface for switching the raid4/5/6 journal mode and update
   the DM raid target to use it to enable raid4/5/6 journal write-back
   support.

 - Switch the DM verity target over to using the asynchronous hash
   crypto API (this helps work better with architectures that have
   access to off-CPU algorithm providers, which should reduce CPU
   utilization).

 - Various request-based DM and DM multipath fixes and improvements from
   Bart and Christoph.

 - A DM thinp target fix for a bio structure leak that occurs for each
   discard IFF discard passdown is enabled.

 - A fix for a possible deadlock in DM bufio and a fix to re-check the
   new buffer allocation watermark in the face of competing admin
   changes to the 'max_cache_size_bytes' tunable.

 - A couple DM core cleanups.

* tag 'for-4.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (50 commits)
  dm bufio: check new buffer allocation watermark every 30 seconds
  dm bufio: avoid a possible ABBA deadlock
  dm mpath: make it easier to detect unintended I/O request flushes
  dm mpath: cleanup QUEUE_IF_NO_PATH bit manipulation by introducing assign_bit()
  dm mpath: micro-optimize the hot path relative to MPATHF_QUEUE_IF_NO_PATH
  dm: introduce enum dm_queue_mode to cleanup related code
  dm mpath: verify __pg_init_all_paths locking assumptions at runtime
  dm: verify suspend_locking assumptions at runtime
  dm block manager: remove an unused argument from dm_block_manager_create()
  dm rq: check blk_mq_register_dev() return value in dm_mq_init_request_queue()
  dm mpath: delay requeuing while path initialization is in progress
  dm mpath: avoid that path removal can trigger an infinite loop
  dm mpath: split and rename activate_path() to prepare for its expanded use
  dm ioctl: prevent stack leak in dm ioctl call
  dm integrity: use previously calculated log2 of sectors_per_block
  dm integrity: use hex2bin instead of open-coded variant
  dm crypt: replace custom implementation of hex2bin()
  dm crypt: remove obsolete references to per-CPU state
  dm verity: switch to using asynchronous hash crypto API
  dm crypt: use WQ_HIGHPRI for the IO and crypt workqueues
  ...
Linus Torvalds 2017-05-03 10:31:20 -07:00
commit d35a878ae1
45 changed files with 7810 additions and 3195 deletions

Documentation/device-mapper/dm-crypt.txt

@@ -11,14 +11,31 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
<offset> [<#opt_params> <opt_params>]
<cipher>
Encryption cipher and an optional IV generation mode.
(In format cipher[:keycount]-chainmode-ivmode[:ivopts]).
Examples:
des
aes-cbc-essiv:sha256
twofish-ecb
Encryption cipher, encryption mode and Initial Vector (IV) generator.
/proc/crypto contains supported crypto modes
The cipher specifications format is:
cipher[:keycount]-chainmode-ivmode[:ivopts]
Examples:
aes-cbc-essiv:sha256
aes-xts-plain64
serpent-xts-plain64
Cipher format also supports direct specification with the kernel
crypto API format (selected by the capi: prefix). The IV specification is the same
as for the first format type.
This format is mainly used for specification of authenticated modes.
The crypto API cipher specifications format is:
capi:cipher_api_spec-ivmode[:ivopts]
Examples:
capi:cbc(aes)-essiv:sha256
capi:xts(aes)-plain64
Examples of authenticated modes:
capi:gcm(aes)-random
capi:authenc(hmac(sha256),xts(aes))-random
capi:rfc7539(chacha20,poly1305)-random
The /proc/crypto file contains a list of currently loaded crypto modes.
<key>
Key used for encryption. It is encoded either as a hexadecimal number
@@ -93,6 +110,32 @@ submit_from_crypt_cpus
thread because it benefits CFQ to have writes submitted using the
same context.
integrity:<bytes>:<type>
The device requires additional <bytes> of metadata per sector, stored
in the per-bio integrity structure. This metadata must be provided
by an underlying dm-integrity target.
The <type> can be "none" if metadata is used only for persistent IV.
For Authenticated Encryption with Additional Data (AEAD)
the <type> is "aead". An AEAD mode additionally calculates and verifies
integrity for the encrypted device. The additional space is then
used for storing authentication tag (and persistent IV if needed).
sector_size:<bytes>
Use <bytes> as the encryption unit instead of 512-byte sectors.
This option can be in the range 512 - 4096 bytes and must be a power of two.
The virtual device will announce this size as its minimal IO size and logical sector size.
iv_large_sectors
IV generators will use the sector number counted in <sector_size> units
instead of the default 512-byte sectors.
For example, if <sector_size> is 4096 bytes, the plain64 IV for the second
sector will be 8 without this flag and 1 if iv_large_sectors is present.
The <iv_offset> must be a multiple of <sector_size> (in 512-byte units)
if this flag is specified.
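Putting the options above together, an illustrative sketch of stacking an
authenticated mode on top of a dm-integrity device (device names, key and
tag size are placeholders, not values from this document):

  dmsetup create crypt0 --table "0 <provided_data_sectors> crypt \
    capi:authenc(hmac(sha256),xts(aes))-random <hex key> 0 \
    /dev/mapper/integrity0 0 1 integrity:<bytes>:aead"
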
Example scripts
===============
LUKS (Linux Unified Key Setup) is now the preferred way to set up disk

Documentation/device-mapper/dm-integrity.txt

@@ -0,0 +1,199 @@
The dm-integrity target emulates a block device that has additional
per-sector tags that can be used for storing integrity information.
A general problem with storing integrity tags with every sector is that
writing the sector and the integrity tag must be atomic - i.e. in case of
crash, either both sector and integrity tag or none of them is written.
To guarantee write atomicity, the dm-integrity target uses a journal: it
writes sector data and integrity tags into the journal, commits the journal
and then copies the data and integrity tags to their respective locations.
The dm-integrity target can be used with the dm-crypt target - in this
situation the dm-crypt target creates the integrity data and passes them
to the dm-integrity target via bio_integrity_payload attached to the bio.
In this mode, the dm-crypt and dm-integrity targets provide authenticated
disk encryption - if the attacker modifies the encrypted device, an I/O
error is returned instead of random data.
The dm-integrity target can also be used as a standalone target; in this
mode it calculates and verifies the integrity tag internally. In this
mode, the dm-integrity target can be used to detect silent data
corruption on the disk or in the I/O path.
When loading the target for the first time, the kernel driver will format
the device. But it will only format the device if the superblock contains
zeroes. If the superblock is neither valid nor zeroed, the dm-integrity
target can't be loaded.
To use the target for the first time:
1. overwrite the superblock with zeroes
2. load the dm-integrity target with a one-sector size; the kernel driver
will format the device
3. unload the dm-integrity target
4. read the "provided_data_sectors" value from the superblock
5. load the dm-integrity target with the target size
"provided_data_sectors"
6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
with the size "provided_data_sectors"
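A minimal sketch of this first-time sequence using dmsetup (the device
name and the internal_hash choice are illustrative assumptions; step 4 is
left as a placeholder):

  dev=/dev/sdb
  # 1. overwrite the superblock with zeroes
  dd if=/dev/zero of=$dev bs=4096 count=1
  # 2.-3. load a one-sector target so the driver formats, then unload
  dmsetup create int0 --table "0 1 integrity $dev 0 - J 1 internal_hash:crc32c"
  dmsetup remove int0
  # 4. read provided_data_sectors from the superblock
  # 5. reload with the provided size
  dmsetup create int0 --table "0 <provided_data_sectors> integrity $dev 0 - J 1 internal_hash:crc32c"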
Target arguments:
1. the underlying block device
2. the number of reserved sectors at the beginning of the device - the
dm-integrity target won't read or write these sectors
3. the size of the integrity tag (if "-" is used, the size is taken from
the internal-hash algorithm)
4. mode:
D - direct writes (without journal) - in this mode, journaling is
not used and data sectors and integrity tags are written
separately. In case of crash, it is possible that the data
and integrity tag don't match.
J - journaled writes - data and integrity tags are written to the
journal and atomicity is guaranteed. In case of crash,
either both data and tag or none of them are written. The
journaled mode halves write throughput because the
data have to be written twice.
R - recovery mode - in this mode, the journal is not replayed,
checksums are not checked and writes to the device are not
allowed. This mode is useful for data recovery if the
device cannot be activated in any of the other standard
modes.
5. the number of additional arguments
Additional arguments:
journal_sectors:number
The size of the journal. This argument is used only when formatting the
device. If the device is already formatted, the value from the
superblock is used.
interleave_sectors:number
The number of interleaved sectors. This value is rounded down to
a power of two. If the device is already formatted, the value from
the superblock is used.
buffer_sectors:number
The number of sectors in one buffer. The value is rounded down to
a power of two.
The tag area is accessed using buffers; the buffer size is
configurable. A larger buffer size means larger I/Os, but
fewer I/Os will be issued.
journal_watermark:number
The journal watermark in percent. When the size of the journal
exceeds this watermark, the thread that flushes the journal will
be started.
commit_time:number
Commit time in milliseconds. When this time passes, the journal is
written. The journal is also written immediately if a FLUSH
request is received.
internal_hash:algorithm(:key) (the key is optional)
Use an internal hash or crc.
When this argument is used, the dm-integrity target won't accept
integrity tags from the upper target, but it will automatically
generate and verify the integrity tags.
You can use a crc algorithm (such as crc32); the integrity target
will then protect the data against accidental corruption.
You can also use an hmac algorithm (for example
"hmac(sha256):0123456789abcdef"); in this mode it will provide
cryptographic authentication of the data without encryption.
When this argument is not used, the integrity tags are accepted
from an upper layer target, such as dm-crypt. The upper layer
target should check the validity of the integrity tags.
journal_crypt:algorithm(:key) (the key is optional)
Encrypt the journal using given algorithm to make sure that the
attacker can't read the journal. You can use a block cipher here
(such as "cbc(aes)") or a stream cipher (for example "chacha20",
"salsa20", "ctr(aes)" or "ecb(arc4)").
The journal contains a history of recent writes to the block device;
an attacker reading the journal could see the last sector numbers
that were written. From the sector numbers, the attacker can infer
the size of files that were written. To protect against this
situation, you can encrypt the journal.
journal_mac:algorithm(:key) (the key is optional)
Protect sector numbers in the journal from accidental or malicious
modification. To protect against accidental modification, use a
crc algorithm; to protect against malicious modification, use an
hmac algorithm with a key.
This option is not needed when using internal-hash because in this
mode, the integrity of journal entries is checked when replaying
the journal. Thus, a modified sector number would be detected at
this stage.
block_size:number
The size of a data block in bytes. The larger the block size the
less overhead there is for per-block integrity metadata.
Supported values are 512, 1024, 2048 and 4096 bytes. If not
specified the default block size is 512 bytes.
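An illustrative table line combining several of the additional arguments
above (all values are examples only, not recommendations):

  0 <provided_data_sectors> integrity /dev/sdb 0 - J 5 \
    journal_sectors:1024 interleave_sectors:32768 buffer_sectors:128 \
    internal_hash:crc32c block_size:4096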
The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
be changed when reloading the target (load an inactive table and swap the
tables with suspend and resume). The other arguments should not be changed
when reloading the target because the layout of disk data depends on them
and the reloaded target would be non-functional.
The layout of the formatted block device:
* reserved sectors (they are not used by this target; they can be used for
storing LUKS metadata or for other purposes), the size of the reserved
area is specified in the target arguments
* superblock (4kiB)
* magic string - identifies that the device was formatted
* version
* log2(interleave sectors)
* integrity tag size
* the number of journal sections
* provided data sectors - the number of sectors that this target
provides (i.e. the size of the device minus the size of all
metadata and padding). The user of this target should not send
bios that access data beyond the "provided data sectors" limit.
* flags - a flag is set if journal_mac is used
* journal
The journal is divided into sections, each section contains:
* metadata area (4kiB), it contains journal entries
every journal entry contains:
* logical sector (specifies where the data and tag should
be written)
* last 8 bytes of data
* integrity tag (the size is specified in the superblock)
every metadata sector ends with
* mac (8 bytes); all the macs in 8 metadata sectors form a
64-byte value. It is used to store the hmac of the sector
numbers in the journal section, to protect against the
possibility that an attacker tampers with the sector
numbers in the journal.
* commit id
* data area (the size is variable; it depends on how many journal
entries fit into the metadata area)
every sector in the data area contains:
* data (504 bytes of data, the last 8 bytes are stored in
the journal entry)
* commit id
To test if the whole journal section was written correctly, every
512-byte sector of the journal ends with 8-byte commit id. If the
commit id matches on all sectors in a journal section, then it is
assumed that the section was written correctly. If the commit id
doesn't match, the section was written partially and it should not
be replayed.
* one or more runs of interleaved tags and data. Each run contains:
* tag area - it contains integrity tags. There is one tag for each
sector in the data area
* data area - it contains data sectors. The number of data sectors
in one run must be a power of two. log2 of this value is stored
in the superblock.

Documentation/device-mapper/dm-raid.txt

@@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters:
Takeover/reshape is not possible with a raid4/5/6 journal device;
it has to be deconfigured before requesting these.
[journal_mode <mode>]
This option sets the caching mode on journaled raid4/5/6 raid sets
(see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'.
If 'writeback' is selected the journal device has to be resilient
and must not suffer from the 'write hole' problem itself (e.g. use
raid1 or raid10) to avoid a single point of failure.
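For example, a sketch of a raid5 table line with a write-back journal
(device names, size and chunk size are placeholders):

  0 <sectors> raid raid5_ls 5 128 journal_dev /dev/sdj journal_mode writeback \
    3 - /dev/sda - /dev/sdb - /dev/sdc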
<#raid_devs>: The number of devices composing the array.
Each device consists of two entries. The first is the device
containing the metadata (if any); the second is the one containing the
@@ -254,7 +261,8 @@ recovery. Here is a fuller description of the individual fields:
<data_offset> The current data offset to the start of the user data on
each component device of a raid set (see the respective
raid parameter to support out-of-place reshaping).
<journal_char> 'A' - active raid4/5/6 journal device.
<journal_char> 'A' - active write-through journal device.
'a' - active write-back journal device.
'D' - dead journal device.
'-' - no journal device.
@@ -331,3 +339,7 @@ Version History
'D' on the status line. If '- -' is passed into the constructor, emit
'- -' on the table line and '-' as the status line health character.
1.10.0 Add support for raid4/5/6 journal device
1.10.1 Fix data corruption on reshape request
1.11.0 Fix table line argument order
(wrong raid10_copies/raid10_format sequence)
1.11.1 Add raid4/5/6 journal write-back support via journal_mode option

drivers/md/Kconfig

@@ -325,14 +325,6 @@ config DM_CACHE_SMQ
of less memory utilization, improved performance and increased
adaptability in the face of changing workloads.
config DM_CACHE_CLEANER
tristate "Cleaner Cache Policy (EXPERIMENTAL)"
depends on DM_CACHE
default y
---help---
A simple cache policy that writes back all data to the
origin. Used when decommissioning a dm-cache.
config DM_ERA
tristate "Era target (EXPERIMENTAL)"
depends on BLK_DEV_DM
@@ -365,6 +357,7 @@ config DM_LOG_USERSPACE
config DM_RAID
tristate "RAID 1/4/5/6/10 target"
depends on BLK_DEV_DM
select MD_RAID0
select MD_RAID1
select MD_RAID10
select MD_RAID456
@@ -508,4 +501,14 @@ config DM_LOG_WRITES
If unsure, say N.
config DM_INTEGRITY
tristate "Integrity target"
depends on BLK_DEV_DM
select BLK_DEV_INTEGRITY
select DM_BUFIO
select CRYPTO
select ASYNC_XOR
---help---
This device-mapper target emulates a block device that has
additional per-sector tags that can be used for storing
integrity information.
endif # MD

drivers/md/Makefile

@@ -11,10 +11,11 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-mirror-y += dm-raid1.o
dm-log-userspace-y \
+= dm-log-userspace-base.o dm-log-userspace-transfer.o
dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o
dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
dm-cache-background-tracker.o
dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
@@ -56,9 +57,9 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o

drivers/md/dm-bio-prison-v1.c

@@ -5,7 +5,8 @@
*/
#include "dm.h"
#include "dm-bio-prison.h"
#include "dm-bio-prison-v1.h"
#include "dm-bio-prison-v2.h"
#include <linux/spinlock.h>
#include <linux/mempool.h>
@@ -398,7 +399,7 @@ EXPORT_SYMBOL_GPL(dm_deferred_set_add_work);
/*----------------------------------------------------------------*/
static int __init dm_bio_prison_init(void)
static int __init dm_bio_prison_init_v1(void)
{
_cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
if (!_cell_cache)
@@ -407,12 +408,51 @@ static int __init dm_bio_prison_init(void)
return 0;
}
static void __exit dm_bio_prison_exit(void)
static void dm_bio_prison_exit_v1(void)
{
kmem_cache_destroy(_cell_cache);
_cell_cache = NULL;
}
static int (*_inits[])(void) __initdata = {
dm_bio_prison_init_v1,
dm_bio_prison_init_v2,
};
static void (*_exits[])(void) = {
dm_bio_prison_exit_v1,
dm_bio_prison_exit_v2,
};
static int __init dm_bio_prison_init(void)
{
const int count = ARRAY_SIZE(_inits);
int r, i;
for (i = 0; i < count; i++) {
r = _inits[i]();
if (r)
goto bad;
}
return 0;
bad:
while (i--)
_exits[i]();
return r;
}
static void __exit dm_bio_prison_exit(void)
{
int i = ARRAY_SIZE(_exits);
while (i--)
_exits[i]();
}
/*
* module hooks
*/

drivers/md/dm-bio-prison-v1.h

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2011-2012 Red Hat, Inc.
* Copyright (C) 2011-2017 Red Hat, Inc.
*
* This file is released under the GPL.
*/

drivers/md/dm-bio-prison-v2.c

@@ -0,0 +1,369 @@
/*
* Copyright (C) 2012-2017 Red Hat, Inc.
*
* This file is released under the GPL.
*/
#include "dm.h"
#include "dm-bio-prison-v2.h"
#include <linux/spinlock.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/rwsem.h>
/*----------------------------------------------------------------*/
#define MIN_CELLS 1024
struct dm_bio_prison_v2 {
struct workqueue_struct *wq;
spinlock_t lock;
mempool_t *cell_pool;
struct rb_root cells;
};
static struct kmem_cache *_cell_cache;
/*----------------------------------------------------------------*/
/*
 * The cell mempool is sized by MIN_CELLS, ie. the number of cells expected
 * to be in use _concurrently_.  Don't confuse it with the number of
 * distinct keys.
 */
struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
{
struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
if (!prison)
return NULL;
prison->wq = wq;
spin_lock_init(&prison->lock);
prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
if (!prison->cell_pool) {
kfree(prison);
return NULL;
}
prison->cells = RB_ROOT;
return prison;
}
EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2);
void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison)
{
mempool_destroy(prison->cell_pool);
kfree(prison);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2);
struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp)
{
return mempool_alloc(prison->cell_pool, gfp);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2);
void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell)
{
mempool_free(cell, prison->cell_pool);
}
EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2);
static void __setup_new_cell(struct dm_cell_key_v2 *key,
struct dm_bio_prison_cell_v2 *cell)
{
memset(cell, 0, sizeof(*cell));
memcpy(&cell->key, key, sizeof(cell->key));
bio_list_init(&cell->bios);
}
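/*
 * Note: keys are compared as ranges - two keys whose [block_begin,
 * block_end) ranges overlap compare equal, so a lookup finds any cell
 * that intersects the requested range.
 */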
static int cmp_keys(struct dm_cell_key_v2 *lhs,
struct dm_cell_key_v2 *rhs)
{
if (lhs->virtual < rhs->virtual)
return -1;
if (lhs->virtual > rhs->virtual)
return 1;
if (lhs->dev < rhs->dev)
return -1;
if (lhs->dev > rhs->dev)
return 1;
if (lhs->block_end <= rhs->block_begin)
return -1;
if (lhs->block_begin >= rhs->block_end)
return 1;
return 0;
}
/*
* Returns true if node found, otherwise it inserts a new one.
*/
static bool __find_or_insert(struct dm_bio_prison_v2 *prison,
struct dm_cell_key_v2 *key,
struct dm_bio_prison_cell_v2 *cell_prealloc,
struct dm_bio_prison_cell_v2 **result)
{
int r;
struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
while (*new) {
struct dm_bio_prison_cell_v2 *cell =
container_of(*new, struct dm_bio_prison_cell_v2, node);
r = cmp_keys(key, &cell->key);
parent = *new;
if (r < 0)
new = &((*new)->rb_left);
else if (r > 0)
new = &((*new)->rb_right);
else {
*result = cell;
return true;
}
}
__setup_new_cell(key, cell_prealloc);
*result = cell_prealloc;
rb_link_node(&cell_prealloc->node, parent, new);
rb_insert_color(&cell_prealloc->node, &prison->cells);
return false;
}
static bool __get(struct dm_bio_prison_v2 *prison,
struct dm_cell_key_v2 *key,
unsigned lock_level,
struct bio *inmate,
struct dm_bio_prison_cell_v2 *cell_prealloc,
struct dm_bio_prison_cell_v2 **cell)
{
if (__find_or_insert(prison, key, cell_prealloc, cell)) {
if ((*cell)->exclusive_lock) {
if (lock_level <= (*cell)->exclusive_level) {
bio_list_add(&(*cell)->bios, inmate);
return false;
}
}
(*cell)->shared_count++;
} else
(*cell)->shared_count = 1;
return true;
}
bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
struct dm_cell_key_v2 *key,
unsigned lock_level,
struct bio *inmate,
struct dm_bio_prison_cell_v2 *cell_prealloc,
struct dm_bio_prison_cell_v2 **cell_result)
{
int r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result);
spin_unlock_irqrestore(&prison->lock, flags);
return r;
}
EXPORT_SYMBOL_GPL(dm_cell_get_v2);
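/*
 * Drops a shared reference.  Once the last shared holder is gone, either a
 * pending quiesce continuation is kicked (if an exclusive lock is held) or
 * the cell is removed from the tree and the caller is told to free it.
 */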
static bool __put(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell)
{
BUG_ON(!cell->shared_count);
cell->shared_count--;
// FIXME: shared locks granted above the lock level could starve this
if (!cell->shared_count) {
if (cell->exclusive_lock) {
if (cell->quiesce_continuation) {
queue_work(prison->wq, cell->quiesce_continuation);
cell->quiesce_continuation = NULL;
}
} else {
rb_erase(&cell->node, &prison->cells);
return true;
}
}
return false;
}
bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell)
{
bool r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
r = __put(prison, cell);
spin_unlock_irqrestore(&prison->lock, flags);
return r;
}
EXPORT_SYMBOL_GPL(dm_cell_put_v2);
static int __lock(struct dm_bio_prison_v2 *prison,
struct dm_cell_key_v2 *key,
unsigned lock_level,
struct dm_bio_prison_cell_v2 *cell_prealloc,
struct dm_bio_prison_cell_v2 **cell_result)
{
struct dm_bio_prison_cell_v2 *cell;
if (__find_or_insert(prison, key, cell_prealloc, &cell)) {
if (cell->exclusive_lock)
return -EBUSY;
cell->exclusive_lock = true;
cell->exclusive_level = lock_level;
*cell_result = cell;
// FIXME: we don't yet know what level these shared locks
// were taken at, so have to quiesce them all.
return cell->shared_count > 0;
} else {
cell = cell_prealloc;
cell->shared_count = 0;
cell->exclusive_lock = true;
cell->exclusive_level = lock_level;
*cell_result = cell;
}
return 0;
}
int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison,
struct dm_cell_key_v2 *key,
unsigned lock_level,
struct dm_bio_prison_cell_v2 *cell_prealloc,
struct dm_bio_prison_cell_v2 **cell_result)
{
int r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
r = __lock(prison, key, lock_level, cell_prealloc, cell_result);
spin_unlock_irqrestore(&prison->lock, flags);
return r;
}
EXPORT_SYMBOL_GPL(dm_cell_lock_v2);
static void __quiesce(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
struct work_struct *continuation)
{
if (!cell->shared_count)
queue_work(prison->wq, continuation);
else
cell->quiesce_continuation = continuation;
}
void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
struct work_struct *continuation)
{
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
__quiesce(prison, cell, continuation);
spin_unlock_irqrestore(&prison->lock, flags);
}
EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2);
static int __promote(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
unsigned new_lock_level)
{
if (!cell->exclusive_lock)
return -EINVAL;
cell->exclusive_level = new_lock_level;
return cell->shared_count > 0;
}
int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
unsigned new_lock_level)
{
int r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
r = __promote(prison, cell, new_lock_level);
spin_unlock_irqrestore(&prison->lock, flags);
return r;
}
EXPORT_SYMBOL_GPL(dm_cell_lock_promote_v2);
static bool __unlock(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
struct bio_list *bios)
{
BUG_ON(!cell->exclusive_lock);
bio_list_merge(bios, &cell->bios);
bio_list_init(&cell->bios);
if (cell->shared_count) {
cell->exclusive_lock = 0;
return false;
}
rb_erase(&cell->node, &prison->cells);
return true;
}
bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
struct bio_list *bios)
{
bool r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
r = __unlock(prison, cell, bios);
spin_unlock_irqrestore(&prison->lock, flags);
return r;
}
EXPORT_SYMBOL_GPL(dm_cell_unlock_v2);
/*----------------------------------------------------------------*/
int __init dm_bio_prison_init_v2(void)
{
_cell_cache = KMEM_CACHE(dm_bio_prison_cell_v2, 0);
if (!_cell_cache)
return -ENOMEM;
return 0;
}
void dm_bio_prison_exit_v2(void)
{
kmem_cache_destroy(_cell_cache);
_cell_cache = NULL;
}

drivers/md/dm-bio-prison-v2.h

@@ -0,0 +1,152 @@
/*
* Copyright (C) 2011-2017 Red Hat, Inc.
*
* This file is released under the GPL.
*/
#ifndef DM_BIO_PRISON_V2_H
#define DM_BIO_PRISON_V2_H
#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
#include <linux/bio.h>
#include <linux/rbtree.h>
#include <linux/workqueue.h>
/*----------------------------------------------------------------*/
int dm_bio_prison_init_v2(void);
void dm_bio_prison_exit_v2(void);
/*
* Sometimes we can't deal with a bio straight away. We put them in prison
* where they can't cause any mischief. Bios are put in a cell identified
* by a key, multiple bios can be in the same cell. When the cell is
* subsequently unlocked the bios become available.
*/
struct dm_bio_prison_v2;
/*
* Keys define a range of blocks within either a virtual or physical
* device.
*/
struct dm_cell_key_v2 {
int virtual;
dm_thin_id dev;
dm_block_t block_begin, block_end;
};
/*
* Treat this as opaque, only in header so callers can manage allocation
* themselves.
*/
struct dm_bio_prison_cell_v2 {
// FIXME: pack these
bool exclusive_lock;
unsigned exclusive_level;
unsigned shared_count;
struct work_struct *quiesce_continuation;
struct rb_node node;
struct dm_cell_key_v2 key;
struct bio_list bios;
};
struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq);
void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison);
/*
* These two functions just wrap a mempool. This is a transitory step:
* Eventually all bio prison clients should manage their own cell memory.
*
* Like mempool_alloc(), dm_bio_prison_alloc_cell_v2() can only fail if called
* in interrupt context or passed GFP_NOWAIT.
*/
struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison,
gfp_t gfp);
void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell);
/*
* Shared locks have a bio associated with them.
*
* If the lock is granted the caller can continue to use the bio, and must
* call dm_cell_put_v2() to drop the reference count when finished using it.
*
* If the lock cannot be granted then the bio will be tracked within the
* cell, and later given to the holder of the exclusive lock.
*
* See dm_cell_lock_v2() for discussion of the lock_level parameter.
*
* Compare *cell_result with cell_prealloc to see if the prealloc was used.
* If cell_prealloc was used then inmate wasn't added to it.
*
* Returns true if the lock is granted.
*/
bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
struct dm_cell_key_v2 *key,
unsigned lock_level,
struct bio *inmate,
struct dm_bio_prison_cell_v2 *cell_prealloc,
struct dm_bio_prison_cell_v2 **cell_result);
/*
* Decrement the shared reference count for the lock. Returns true if
* returning ownership of the cell (ie. you should free it).
*/
bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell);
/*
* Locks a cell. No associated bio. Exclusive locks get priority. These
* locks constrain whether the io locks are granted according to level.
*
* Shared locks will still be granted if the lock_level is > (not = to) the
* exclusive lock level.
*
* If an _exclusive_ lock is already held then -EBUSY is returned.
*
* Return values:
* < 0 - error
* 0 - locked; no quiescing needed
* 1 - locked; quiescing needed
*/
int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison,
struct dm_cell_key_v2 *key,
unsigned lock_level,
struct dm_bio_prison_cell_v2 *cell_prealloc,
struct dm_bio_prison_cell_v2 **cell_result);
void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
struct work_struct *continuation);
/*
* Promotes an _exclusive_ lock to a higher lock level.
*
* Return values:
* < 0 - error
* 0 - promoted; no quiescing needed
* 1 - promoted; quiescing needed
*/
int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
unsigned new_lock_level);
/*
* Adds any held bios to the bio list.
*
* There may be shared locks still held at this point even if you quiesced
* (ie. different lock levels).
*
* Returns true if returning ownership of the cell (ie. you should free
* it).
*/
bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
struct bio_list *bios);
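/*
 * A usage sketch, not part of this interface: taking an exclusive lock,
 * quiescing shared holders, then unlocking.  The continuation wiring and
 * error handling are simplified assumptions, not code from this series.
 */
static void example_exclusive(struct dm_bio_prison_v2 *prison,
			      struct dm_cell_key_v2 *key,
			      struct work_struct *continuation)
{
	struct dm_bio_prison_cell_v2 *prealloc, *cell;
	struct bio_list bios;
	int r;

	prealloc = dm_bio_prison_alloc_cell_v2(prison, GFP_NOIO);

	r = dm_cell_lock_v2(prison, key, 0, prealloc, &cell);
	if (r < 0) {
		/* -EBUSY: another exclusive holder, back off */
		dm_bio_prison_free_cell_v2(prison, prealloc);
		return;
	}

	if (cell != prealloc)
		/* an existing cell was found; the prealloc wasn't consumed */
		dm_bio_prison_free_cell_v2(prison, prealloc);

	if (r == 1) {
		/* shared holders remain; continuation runs once they drain */
		dm_cell_quiesce_v2(prison, cell, continuation);
		return;
	}

	/* r == 0: nothing to quiesce; do the exclusive work here ... */

	bio_list_init(&bios);
	if (dm_cell_unlock_v2(prison, cell, &bios))
		dm_bio_prison_free_cell_v2(prison, cell);
	/* ... then reissue any bios that were caught in the cell */
}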
/*----------------------------------------------------------------*/
#endif

drivers/md/dm-bufio.c

@@ -110,6 +110,8 @@ struct dm_bufio_client {
struct rb_root buffer_tree;
wait_queue_head_t free_buffer_wait;
sector_t start;
int async_write_error;
struct list_head client_list;
@@ -557,8 +559,8 @@ static void dmio_complete(unsigned long error, void *context)
b->bio.bi_end_io(&b->bio);
}
static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
bio_end_io_t *end_io)
static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
unsigned n_sectors, bio_end_io_t *end_io)
{
int r;
struct dm_io_request io_req = {
@@ -570,8 +572,8 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
};
struct dm_io_region region = {
.bdev = b->c->bdev,
.sector = block << b->c->sectors_per_block_bits,
.count = b->c->block_size >> SECTOR_SHIFT,
.sector = sector,
.count = n_sectors,
};
if (b->data_mode != DATA_MODE_VMALLOC) {
@@ -606,14 +608,14 @@ static void inline_endio(struct bio *bio)
end_fn(bio);
}
static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
bio_end_io_t *end_io)
static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector,
unsigned n_sectors, bio_end_io_t *end_io)
{
char *ptr;
int len;
bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS);
b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
b->bio.bi_iter.bi_sector = sector;
b->bio.bi_bdev = b->c->bdev;
b->bio.bi_end_io = inline_endio;
/*
@@ -628,7 +630,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
* If len < PAGE_SIZE the buffer doesn't cross page boundary.
*/
ptr = b->data;
len = b->c->block_size;
len = n_sectors << SECTOR_SHIFT;
if (len >= PAGE_SIZE)
BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
@@ -640,7 +642,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
len < PAGE_SIZE ? len : PAGE_SIZE,
offset_in_page(ptr))) {
BUG_ON(b->c->block_size <= PAGE_SIZE);
use_dmio(b, rw, block, end_io);
use_dmio(b, rw, sector, n_sectors, end_io);
return;
}
@@ -651,17 +653,22 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
submit_bio(&b->bio);
}
static void submit_io(struct dm_buffer *b, int rw, sector_t block,
bio_end_io_t *end_io)
static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io)
{
unsigned n_sectors;
sector_t sector;
if (rw == WRITE && b->c->write_callback)
b->c->write_callback(b);
if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
sector = (b->block << b->c->sectors_per_block_bits) + b->c->start;
n_sectors = 1 << b->c->sectors_per_block_bits;
if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) &&
b->data_mode != DATA_MODE_VMALLOC)
use_inline_bio(b, rw, block, end_io);
use_inline_bio(b, rw, sector, n_sectors, end_io);
else
use_dmio(b, rw, block, end_io);
use_dmio(b, rw, sector, n_sectors, end_io);
}
/*----------------------------------------------------------------
@@ -713,7 +720,7 @@ static void __write_dirty_buffer(struct dm_buffer *b,
wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
if (!write_list)
submit_io(b, WRITE, b->block, write_endio);
submit_io(b, WRITE, write_endio);
else
list_add_tail(&b->write_list, write_list);
}
@@ -726,7 +733,7 @@ static void __flush_write_list(struct list_head *write_list)
struct dm_buffer *b =
list_entry(write_list->next, struct dm_buffer, write_list);
list_del(&b->write_list);
submit_io(b, WRITE, b->block, write_endio);
submit_io(b, WRITE, write_endio);
cond_resched();
}
blk_finish_plug(&plug);
@@ -933,10 +940,11 @@ static void __get_memory_limit(struct dm_bufio_client *c,
{
unsigned long buffers;
if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
mutex_lock(&dm_bufio_clients_lock);
__cache_size_refresh();
mutex_unlock(&dm_bufio_clients_lock);
if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
if (mutex_trylock(&dm_bufio_clients_lock)) {
__cache_size_refresh();
mutex_unlock(&dm_bufio_clients_lock);
}
}
buffers = dm_bufio_cache_size_per_client >>
@@ -1094,7 +1102,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
return NULL;
if (need_submit)
submit_io(b, READ, b->block, read_endio);
submit_io(b, READ, read_endio);
wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
@@ -1164,7 +1172,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
dm_bufio_unlock(c);
if (need_submit)
submit_io(b, READ, b->block, read_endio);
submit_io(b, READ, read_endio);
dm_bufio_release(b);
cond_resched();
@@ -1405,7 +1413,7 @@ retry:
old_block = b->block;
__unlink_buffer(b);
__link_buffer(b, new_block, b->list_mode);
submit_io(b, WRITE, new_block, write_endio);
submit_io(b, WRITE, write_endio);
wait_on_bit_io(&b->state, B_WRITING,
TASK_UNINTERRUPTIBLE);
__unlink_buffer(b);
@@ -1762,6 +1770,12 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
{
c->start = start;
}
EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
static unsigned get_max_age_hz(void)
{
unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
@@ -1782,9 +1796,17 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
struct dm_buffer *b, *tmp;
unsigned retain_target = get_retain_buffers(c);
unsigned count;
LIST_HEAD(write_list);
dm_bufio_lock(c);
__check_watermark(c, &write_list);
if (unlikely(!list_empty(&write_list))) {
dm_bufio_unlock(c);
__flush_write_list(&write_list);
dm_bufio_lock(c);
}
count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
if (count <= retain_target)
@@ -1809,6 +1831,8 @@ static void cleanup_old_buffers(void)
mutex_lock(&dm_bufio_clients_lock);
__cache_size_refresh();
list_for_each_entry(c, &dm_bufio_all_clients, client_list)
__evict_old_buffers(c, max_age_hz);

drivers/md/dm-bufio.h

@@ -31,6 +31,13 @@ dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
*/
void dm_bufio_client_destroy(struct dm_bufio_client *c);
/*
* Set the sector range.
* When this function is called, there must be no I/O in progress on the bufio
* client.
*/
void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start);
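/*
 * A sketch of intended use (values illustrative): create a client over the
 * whole device, then shift its I/O past a metadata area at the front.
 */
static struct dm_bufio_client *example_client(struct block_device *bdev,
					      sector_t data_start)
{
	struct dm_bufio_client *c;

	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
	if (IS_ERR(c))
		return c;

	/* must be called while no I/O is in progress on the client */
	dm_bufio_set_sector_offset(c, data_start);
	return c;
}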
/*
* WARNING: to avoid deadlocks, these conditions are observed:
*

drivers/md/dm-cache-background-tracker.c

@@ -0,0 +1,238 @@
/*
* Copyright (C) 2017 Red Hat. All rights reserved.
*
* This file is released under the GPL.
*/
#include "dm-cache-background-tracker.h"
/*----------------------------------------------------------------*/
#define DM_MSG_PREFIX "dm-background-tracker"
struct bt_work {
struct list_head list;
struct rb_node node;
struct policy_work work;
};
struct background_tracker {
unsigned max_work;
atomic_t pending_promotes;
atomic_t pending_writebacks;
atomic_t pending_demotes;
struct list_head issued;
struct list_head queued;
struct rb_root pending;
struct kmem_cache *work_cache;
};
struct background_tracker *btracker_create(unsigned max_work)
{
struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
if (!b)
return NULL;
b->max_work = max_work;
atomic_set(&b->pending_promotes, 0);
atomic_set(&b->pending_writebacks, 0);
atomic_set(&b->pending_demotes, 0);
INIT_LIST_HEAD(&b->issued);
INIT_LIST_HEAD(&b->queued);
b->pending = RB_ROOT;
b->work_cache = KMEM_CACHE(bt_work, 0);
if (!b->work_cache) {
DMERR("couldn't create mempool for background work items");
kfree(b);
b = NULL;
}
return b;
}
EXPORT_SYMBOL_GPL(btracker_create);
void btracker_destroy(struct background_tracker *b)
{
kmem_cache_destroy(b->work_cache);
kfree(b);
}
EXPORT_SYMBOL_GPL(btracker_destroy);
static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs)
{
if (from_oblock(lhs) < from_oblock(rhs))
return -1;
if (from_oblock(rhs) < from_oblock(lhs))
return 1;
return 0;
}
static bool __insert_pending(struct background_tracker *b,
struct bt_work *nw)
{
int cmp;
struct bt_work *w;
struct rb_node **new = &b->pending.rb_node, *parent = NULL;
while (*new) {
w = container_of(*new, struct bt_work, node);
parent = *new;
cmp = cmp_oblock(w->work.oblock, nw->work.oblock);
if (cmp < 0)
new = &((*new)->rb_left);
else if (cmp > 0)
new = &((*new)->rb_right);
else
/* already present */
return false;
}
rb_link_node(&nw->node, parent, new);
rb_insert_color(&nw->node, &b->pending);
return true;
}
static struct bt_work *__find_pending(struct background_tracker *b,
dm_oblock_t oblock)
{
int cmp;
struct bt_work *w;
struct rb_node **new = &b->pending.rb_node;
while (*new) {
w = container_of(*new, struct bt_work, node);
cmp = cmp_oblock(w->work.oblock, oblock);
if (cmp < 0)
new = &((*new)->rb_left);
else if (cmp > 0)
new = &((*new)->rb_right);
else
break;
}
return *new ? w : NULL;
}
static void update_stats(struct background_tracker *b, struct policy_work *w, int delta)
{
switch (w->op) {
case POLICY_PROMOTE:
atomic_add(delta, &b->pending_promotes);
break;
case POLICY_DEMOTE:
atomic_add(delta, &b->pending_demotes);
break;
case POLICY_WRITEBACK:
atomic_add(delta, &b->pending_writebacks);
break;
}
}
unsigned btracker_nr_writebacks_queued(struct background_tracker *b)
{
return atomic_read(&b->pending_writebacks);
}
EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued);
unsigned btracker_nr_demotions_queued(struct background_tracker *b)
{
return atomic_read(&b->pending_demotes);
}
EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued);
static bool max_work_reached(struct background_tracker *b)
{
// FIXME: finish
return false;
}
int btracker_queue(struct background_tracker *b,
struct policy_work *work,
struct policy_work **pwork)
{
struct bt_work *w;
if (pwork)
*pwork = NULL;
if (max_work_reached(b))
return -ENOMEM;
w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
if (!w)
return -ENOMEM;
memcpy(&w->work, work, sizeof(*work));
if (!__insert_pending(b, w)) {
/*
* There was a race, we'll just ignore this second
* bit of work for the same oblock.
*/
kmem_cache_free(b->work_cache, w);
return -EINVAL;
}
if (pwork) {
*pwork = &w->work;
list_add(&w->list, &b->issued);
} else
list_add(&w->list, &b->queued);
update_stats(b, &w->work, 1);
return 0;
}
EXPORT_SYMBOL_GPL(btracker_queue);
/*
* Returns -ENODATA if there's no work.
*/
int btracker_issue(struct background_tracker *b, struct policy_work **work)
{
struct bt_work *w;
if (list_empty(&b->queued))
return -ENODATA;
w = list_first_entry(&b->queued, struct bt_work, list);
list_move(&w->list, &b->issued);
*work = &w->work;
return 0;
}
EXPORT_SYMBOL_GPL(btracker_issue);
void btracker_complete(struct background_tracker *b,
struct policy_work *op)
{
struct bt_work *w = container_of(op, struct bt_work, work);
update_stats(b, &w->work, -1);
rb_erase(&w->node, &b->pending);
list_del(&w->list);
kmem_cache_free(b->work_cache, w);
}
EXPORT_SYMBOL_GPL(btracker_complete);
bool btracker_promotion_already_present(struct background_tracker *b,
dm_oblock_t oblock)
{
return __find_pending(b, oblock) != NULL;
}
EXPORT_SYMBOL_GPL(btracker_promotion_already_present);
/*----------------------------------------------------------------*/

drivers/md/dm-cache-background-tracker.h

@@ -0,0 +1,46 @@
/*
* Copyright (C) 2017 Red Hat. All rights reserved.
*
* This file is released under the GPL.
*/
#ifndef DM_CACHE_BACKGROUND_WORK_H
#define DM_CACHE_BACKGROUND_WORK_H
#include <linux/vmalloc.h>
#include "dm-cache-policy.h"
/*----------------------------------------------------------------*/
struct background_work;
struct background_tracker;
/*
* FIXME: discuss lack of locking in all methods.
*/
struct background_tracker *btracker_create(unsigned max_work);
void btracker_destroy(struct background_tracker *b);
unsigned btracker_nr_writebacks_queued(struct background_tracker *b);
unsigned btracker_nr_demotions_queued(struct background_tracker *b);
/*
* returns -EINVAL iff the work is already queued. -ENOMEM if the work
* couldn't be queued for another reason.
*/
int btracker_queue(struct background_tracker *b,
struct policy_work *work,
struct policy_work **pwork);
/*
* Returns -ENODATA if there's no work.
*/
int btracker_issue(struct background_tracker *b, struct policy_work **work);
void btracker_complete(struct background_tracker *b,
struct policy_work *op);
bool btracker_promotion_already_present(struct background_tracker *b,
dm_oblock_t oblock);
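/*
 * A lifecycle sketch (the policy_work setup is elided): work is queued,
 * later issued to the target, and completed once performed.
 */
static void example_cycle(struct background_tracker *b, struct policy_work *w)
{
	struct policy_work *issued;

	if (btracker_queue(b, w, NULL))
		return; /* already queued for this oblock, or no memory */

	if (!btracker_issue(b, &issued)) {
		/* ... the target performs issued->op ... */
		btracker_complete(b, issued);
	}
}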
/*----------------------------------------------------------------*/
#endif

drivers/md/dm-cache-metadata.c

@@ -27,8 +27,6 @@
#define MIN_CACHE_VERSION 1
#define MAX_CACHE_VERSION 2
#define CACHE_METADATA_CACHE_SIZE 64
/*
* 3 for btree insert +
* 2 for btree lookup used within space map
@@ -535,7 +533,6 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
{
int r;
cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
CACHE_METADATA_CACHE_SIZE,
CACHE_MAX_CONCURRENT_LOCKS);
if (IS_ERR(cmd->bm)) {
DMERR("could not create block manager");

drivers/md/dm-cache-metadata.h

@@ -50,6 +50,8 @@
#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
struct dm_cache_metadata;
/*
* Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on
* failure. If reopening then features must match.

drivers/md/dm-cache-policy-cleaner.c

@@ -1,469 +0,0 @@
/*
* Copyright (C) 2012 Red Hat. All rights reserved.
*
* writeback cache policy supporting flushing out dirty cache blocks.
*
* This file is released under the GPL.
*/
#include "dm-cache-policy.h"
#include "dm.h"
#include <linux/hash.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
/*----------------------------------------------------------------*/
#define DM_MSG_PREFIX "cache cleaner"
/* Cache entry struct. */
struct wb_cache_entry {
struct list_head list;
struct hlist_node hlist;
dm_oblock_t oblock;
dm_cblock_t cblock;
bool dirty:1;
bool pending:1;
};
struct hash {
struct hlist_head *table;
dm_block_t hash_bits;
unsigned nr_buckets;
};
struct policy {
struct dm_cache_policy policy;
spinlock_t lock;
struct list_head free;
struct list_head clean;
struct list_head clean_pending;
struct list_head dirty;
/*
* We know exactly how many cblocks will be needed,
* so we can allocate them up front.
*/
dm_cblock_t cache_size, nr_cblocks_allocated;
struct wb_cache_entry *cblocks;
struct hash chash;
};
/*----------------------------------------------------------------------------*/
/*
* Low-level functions.
*/
static unsigned next_power(unsigned n, unsigned min)
{
return roundup_pow_of_two(max(n, min));
}
static struct policy *to_policy(struct dm_cache_policy *p)
{
return container_of(p, struct policy, policy);
}
static struct list_head *list_pop(struct list_head *q)
{
struct list_head *r = q->next;
list_del(r);
return r;
}
/*----------------------------------------------------------------------------*/
/* Allocate/free various resources. */
static int alloc_hash(struct hash *hash, unsigned elts)
{
hash->nr_buckets = next_power(elts >> 4, 16);
hash->hash_bits = __ffs(hash->nr_buckets);
hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
return hash->table ? 0 : -ENOMEM;
}
static void free_hash(struct hash *hash)
{
vfree(hash->table);
}
static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
{
int r = -ENOMEM;
p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
if (p->cblocks) {
unsigned u = from_cblock(cache_size);
while (u--)
list_add(&p->cblocks[u].list, &p->free);
p->nr_cblocks_allocated = 0;
/* Cache entries hash. */
r = alloc_hash(&p->chash, from_cblock(cache_size));
if (r)
vfree(p->cblocks);
}
return r;
}
static void free_cache_blocks_and_hash(struct policy *p)
{
free_hash(&p->chash);
vfree(p->cblocks);
}
static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
{
struct wb_cache_entry *e;
BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
return e;
}
/*----------------------------------------------------------------------------*/
/* Hash functions (lookup, insert, remove). */
static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
{
struct hash *hash = &p->chash;
unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
struct wb_cache_entry *cur;
struct hlist_head *bucket = &hash->table[h];
hlist_for_each_entry(cur, bucket, hlist) {
if (cur->oblock == oblock) {
/* Move upfront bucket for faster access. */
hlist_del(&cur->hlist);
hlist_add_head(&cur->hlist, bucket);
return cur;
}
}
return NULL;
}
static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
{
unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
hlist_add_head(&e->hlist, &p->chash.table[h]);
}
static void remove_cache_hash_entry(struct wb_cache_entry *e)
{
hlist_del(&e->hlist);
}
/* Public interface (see dm-cache-policy.h */
static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
bool can_block, bool can_migrate, bool discarded_oblock,
struct bio *bio, struct policy_locker *locker,
struct policy_result *result)
{
struct policy *p = to_policy(pe);
struct wb_cache_entry *e;
unsigned long flags;
result->op = POLICY_MISS;
if (can_block)
spin_lock_irqsave(&p->lock, flags);
else if (!spin_trylock_irqsave(&p->lock, flags))
return -EWOULDBLOCK;
e = lookup_cache_entry(p, oblock);
if (e) {
result->op = POLICY_HIT;
result->cblock = e->cblock;
}
spin_unlock_irqrestore(&p->lock, flags);
return 0;
}
static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
{
int r;
struct policy *p = to_policy(pe);
struct wb_cache_entry *e;
unsigned long flags;
if (!spin_trylock_irqsave(&p->lock, flags))
return -EWOULDBLOCK;
e = lookup_cache_entry(p, oblock);
if (e) {
*cblock = e->cblock;
r = 0;
} else
r = -ENOENT;
spin_unlock_irqrestore(&p->lock, flags);
return r;
}
static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
{
struct policy *p = to_policy(pe);
struct wb_cache_entry *e;
e = lookup_cache_entry(p, oblock);
BUG_ON(!e);
if (set) {
if (!e->dirty) {
e->dirty = true;
list_move(&e->list, &p->dirty);
}
} else {
if (e->dirty) {
e->pending = false;
e->dirty = false;
list_move(&e->list, &p->clean);
}
}
}
static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
{
struct policy *p = to_policy(pe);
unsigned long flags;
spin_lock_irqsave(&p->lock, flags);
__set_clear_dirty(pe, oblock, true);
spin_unlock_irqrestore(&p->lock, flags);
}
static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
{
struct policy *p = to_policy(pe);
unsigned long flags;
spin_lock_irqsave(&p->lock, flags);
__set_clear_dirty(pe, oblock, false);
spin_unlock_irqrestore(&p->lock, flags);
}
static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
{
insert_cache_hash_entry(p, e);
if (e->dirty)
list_add(&e->list, &p->dirty);
else
list_add(&e->list, &p->clean);
}
static int wb_load_mapping(struct dm_cache_policy *pe,
dm_oblock_t oblock, dm_cblock_t cblock,
uint32_t hint, bool hint_valid)
{
int r;
struct policy *p = to_policy(pe);
struct wb_cache_entry *e = alloc_cache_entry(p);
if (e) {
e->cblock = cblock;
e->oblock = oblock;
e->dirty = false; /* blocks default to clean */
add_cache_entry(p, e);
r = 0;
} else
r = -ENOMEM;
return r;
}
static void wb_destroy(struct dm_cache_policy *pe)
{
struct policy *p = to_policy(pe);
free_cache_blocks_and_hash(p);
kfree(p);
}
static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
{
struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
BUG_ON(!r);
remove_cache_hash_entry(r);
list_del(&r->list);
return r;
}
static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
{
struct policy *p = to_policy(pe);
struct wb_cache_entry *e;
unsigned long flags;
spin_lock_irqsave(&p->lock, flags);
e = __wb_force_remove_mapping(p, oblock);
list_add_tail(&e->list, &p->free);
BUG_ON(!from_cblock(p->nr_cblocks_allocated));
p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
spin_unlock_irqrestore(&p->lock, flags);
}
static void wb_force_mapping(struct dm_cache_policy *pe,
dm_oblock_t current_oblock, dm_oblock_t oblock)
{
struct policy *p = to_policy(pe);
struct wb_cache_entry *e;
unsigned long flags;
spin_lock_irqsave(&p->lock, flags);
e = __wb_force_remove_mapping(p, current_oblock);
e->oblock = oblock;
add_cache_entry(p, e);
spin_unlock_irqrestore(&p->lock, flags);
}
static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
{
struct list_head *l;
struct wb_cache_entry *r;
if (list_empty(&p->dirty))
return NULL;
l = list_pop(&p->dirty);
r = container_of(l, struct wb_cache_entry, list);
list_add(l, &p->clean_pending);
return r;
}
static int wb_writeback_work(struct dm_cache_policy *pe,
dm_oblock_t *oblock,
dm_cblock_t *cblock,
bool critical_only)
{
int r = -ENOENT;
struct policy *p = to_policy(pe);
struct wb_cache_entry *e;
unsigned long flags;
spin_lock_irqsave(&p->lock, flags);
e = get_next_dirty_entry(p);
if (e) {
*oblock = e->oblock;
*cblock = e->cblock;
r = 0;
}
spin_unlock_irqrestore(&p->lock, flags);
return r;
}
static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
{
return to_policy(pe)->nr_cblocks_allocated;
}
/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct policy *p)
{
p->policy.destroy = wb_destroy;
p->policy.map = wb_map;
p->policy.lookup = wb_lookup;
p->policy.set_dirty = wb_set_dirty;
p->policy.clear_dirty = wb_clear_dirty;
p->policy.load_mapping = wb_load_mapping;
p->policy.get_hint = NULL;
p->policy.remove_mapping = wb_remove_mapping;
p->policy.writeback_work = wb_writeback_work;
p->policy.force_mapping = wb_force_mapping;
p->policy.residency = wb_residency;
p->policy.tick = NULL;
}
static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size)
{
int r;
struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return NULL;
init_policy_functions(p);
INIT_LIST_HEAD(&p->free);
INIT_LIST_HEAD(&p->clean);
INIT_LIST_HEAD(&p->clean_pending);
INIT_LIST_HEAD(&p->dirty);
p->cache_size = cache_size;
spin_lock_init(&p->lock);
/* Allocate cache entry structs and add them to free list. */
r = alloc_cache_blocks_with_hash(p, cache_size);
if (!r)
return &p->policy;
kfree(p);
return NULL;
}
/*----------------------------------------------------------------------------*/
static struct dm_cache_policy_type wb_policy_type = {
.name = "cleaner",
.version = {1, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = wb_create
};
static int __init wb_init(void)
{
int r = dm_cache_policy_register(&wb_policy_type);
if (r < 0)
DMERR("register failed %d", r);
else
DMINFO("version %u.%u.%u loaded",
wb_policy_type.version[0],
wb_policy_type.version[1],
wb_policy_type.version[2]);
return r;
}
static void __exit wb_exit(void)
{
dm_cache_policy_unregister(&wb_policy_type);
}
module_init(wb_init);
module_exit(wb_exit);
MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("cleaner cache policy");

drivers/md/dm-cache-policy-internal.h

@@ -12,40 +12,59 @@
/*----------------------------------------------------------------*/
/*
* Little inline functions that simplify calling the policy methods.
*/
static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
bool can_block, bool can_migrate, bool discarded_oblock,
struct bio *bio, struct policy_locker *locker,
struct policy_result *result)
static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
int data_dir, bool fast_copy, bool *background_queued)
{
return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result);
return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued);
}
static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
static inline int policy_lookup_with_work(struct dm_cache_policy *p,
dm_oblock_t oblock, dm_cblock_t *cblock,
int data_dir, bool fast_copy,
struct policy_work **work)
{
BUG_ON(!p->lookup);
return p->lookup(p, oblock, cblock);
if (!p->lookup_with_work) {
*work = NULL;
return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL);
}
return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work);
}
static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
static inline int policy_get_background_work(struct dm_cache_policy *p,
bool idle, struct policy_work **result)
{
if (p->set_dirty)
p->set_dirty(p, oblock);
return p->get_background_work(p, idle, result);
}
static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
static inline void policy_complete_background_work(struct dm_cache_policy *p,
struct policy_work *work,
bool success)
{
if (p->clear_dirty)
p->clear_dirty(p, oblock);
return p->complete_background_work(p, work, success);
}
static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
p->set_dirty(p, cblock);
}
static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
p->clear_dirty(p, cblock);
}
static inline int policy_load_mapping(struct dm_cache_policy *p,
dm_oblock_t oblock, dm_cblock_t cblock,
uint32_t hint, bool hint_valid)
bool dirty, uint32_t hint, bool hint_valid)
{
return p->load_mapping(p, oblock, cblock, hint, hint_valid);
return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid);
}
static inline int policy_invalidate_mapping(struct dm_cache_policy *p,
dm_cblock_t cblock)
{
return p->invalidate_mapping(p, cblock);
}
static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
@ -54,30 +73,6 @@ static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
return p->get_hint ? p->get_hint(p, cblock) : 0;
}
static inline int policy_writeback_work(struct dm_cache_policy *p,
dm_oblock_t *oblock,
dm_cblock_t *cblock,
bool critical_only)
{
return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT;
}
static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
p->remove_mapping(p, oblock);
}
static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
{
return p->remove_cblock(p, cblock);
}
static inline void policy_force_mapping(struct dm_cache_policy *p,
dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
return p->force_mapping(p, current_oblock, new_oblock);
}
static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
{
return p->residency(p);
@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
}
static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow)
{
return p->allow_migrations(p, allow);
}
/*----------------------------------------------------------------*/
/*

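The new policy_lookup_with_work() wrapper above turns lookup_with_work into an optional method: a policy that does not implement it degrades to a plain lookup() with no immediate work handed back, the same treatment get_hint() already gets via its NULL check. A minimal userspace model of that fallback, with every type and name invented for illustration:

#include <stdio.h>
#include <stddef.h>

struct work;                            /* opaque stand-in for policy_work */

struct policy_ops {
        int (*lookup)(int oblock, int *cblock);
        int (*lookup_with_work)(int oblock, int *cblock, struct work **w); /* optional */
};

/* Mirrors policy_lookup_with_work(): fall back when the method is absent. */
static int lookup_with_work(struct policy_ops *p, int oblock, int *cblock,
                            struct work **w)
{
        if (!p->lookup_with_work) {
                *w = NULL;              /* no migration handed back */
                return p->lookup(oblock, cblock);
        }
        return p->lookup_with_work(oblock, cblock, w);
}

static int plain_lookup(int oblock, int *cblock)
{
        *cblock = oblock;               /* pretend a 1:1 mapping exists */
        return 0;
}

int main(void)
{
        struct policy_ops p = { .lookup = plain_lookup };
        struct work *w;
        int cblock;

        if (!lookup_with_work(&p, 7, &cblock, &w))
                printf("cblock=%d, work=%p\n", cblock, (void *)w);
        return 0;
}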
File diff suppressed because it is too large


@ -13,147 +13,94 @@
/*----------------------------------------------------------------*/
/* FIXME: make it clear which methods are optional. Get debug policy to
* double check this at start.
*/
/*
* The cache policy makes the important decisions about which blocks get to
* live on the faster cache device.
*
* When the core target has to remap a bio it calls the 'map' method of the
* policy. This returns an instruction telling the core target what to do.
*
* POLICY_HIT:
* That block is in the cache. Remap to the cache and carry on.
*
* POLICY_MISS:
* This block is on the origin device. Remap and carry on.
*
* POLICY_NEW:
* This block is currently on the origin device, but the policy wants to
* move it. The core should:
*
* - hold any further io to this origin block
* - copy the origin to the given cache block
* - release all the held blocks
* - remap the original block to the cache
*
* POLICY_REPLACE:
* This block is currently on the origin device. The policy wants to
* move it to the cache, with the added complication that the destination
* cache block needs a writeback first. The core should:
*
* - hold any further io to this origin block
* - hold any further io to the origin block that's being written back
* - writeback
* - copy new block to cache
* - release held blocks
* - remap bio to cache and reissue.
*
* Should the core run into trouble while processing a POLICY_NEW or
* POLICY_REPLACE instruction it will roll back the policy's mapping using
* remove_mapping() or force_mapping(). These methods must not fail. This
* approach avoids having transactional semantics in the policy (ie, the
* core informing the policy when a migration is complete), and hence makes
* it easier to write new policies.
*
* In general policy methods should never block, except in the case of the
* map function when can_migrate is set. So be careful to implement using
* bounded, preallocated memory.
*/
enum policy_operation {
POLICY_HIT,
POLICY_MISS,
POLICY_NEW,
POLICY_REPLACE
};
/*
* When issuing a POLICY_REPLACE the policy needs to make a callback to
* lock the block being demoted. This doesn't need to occur during a
* writeback operation since the block remains in the cache.
*/
struct policy_locker;
typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
struct policy_locker {
policy_lock_fn fn;
POLICY_PROMOTE,
POLICY_DEMOTE,
POLICY_WRITEBACK
};
/*
* This is the instruction passed back to the core target.
*/
struct policy_result {
struct policy_work {
enum policy_operation op;
dm_oblock_t old_oblock; /* POLICY_REPLACE */
dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
dm_oblock_t oblock;
dm_cblock_t cblock;
};
/*
* The cache policy object. Just a bunch of methods. It is envisaged that
* this structure will be embedded in a bigger, policy specific structure
* (ie. use container_of()).
* The cache policy object. It is envisaged that this structure will be
* embedded in a bigger, policy specific structure (ie. use container_of()).
*/
struct dm_cache_policy {
/*
* FIXME: make it clear which methods are optional, and which may
* block.
*/
/*
* Destroys this object.
*/
void (*destroy)(struct dm_cache_policy *p);
/*
* See large comment above.
*
* oblock - the origin block we're interested in.
*
* can_block - indicates whether the current thread is allowed to
* block. -EWOULDBLOCK returned if it can't and would.
*
* can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
* instructions. If denied and the policy would have
* returned one of these instructions it should
* return -EWOULDBLOCK.
*
* discarded_oblock - indicates whether the whole origin block is
* in a discarded state (FIXME: better to tell the
* policy about this sooner, so it can recycle that
* cache block if it wants.)
* bio - the bio that triggered this call.
* result - gets filled in with the instruction.
*
* May only return 0, or -EWOULDBLOCK (if !can_migrate)
*/
int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
bool can_block, bool can_migrate, bool discarded_oblock,
struct bio *bio, struct policy_locker *locker,
struct policy_result *result);
/*
* Sometimes we want to see if a block is in the cache, without
* triggering any update of stats. (ie. it's not a real hit).
* Find the location of a block.
*
* Must not block.
*
* Returns 0 if in cache, -ENOENT if not, < 0 for other errors
* (-EWOULDBLOCK would be typical).
* Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for
* other errors (-EWOULDBLOCK would be typical). data_dir should be
* READ or WRITE. fast_copy should be set if migrating this block would
* be 'cheap' somehow (eg, discarded data). background_queued will be set
* if a migration has just been queued.
*/
int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
int data_dir, bool fast_copy, bool *background_queued);
void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
/*
* Sometimes the core target can optimise a migration, eg, the
* block may be discarded, or the bio may cover an entire block.
* In order to optimise it needs the migration immediately though
* so it knows to do something different with the bio.
*
* This method is optional (policy-internal will fall back to using
* lookup).
*/
int (*lookup_with_work)(struct dm_cache_policy *p,
dm_oblock_t oblock, dm_cblock_t *cblock,
int data_dir, bool fast_copy,
struct policy_work **work);
/*
* Retrieves background work. Returns -ENODATA when there's no
* background work.
*/
int (*get_background_work)(struct dm_cache_policy *p, bool idle,
struct policy_work **result);
/*
* You must pass in the same work pointer that you were given, not
* a copy.
*/
void (*complete_background_work)(struct dm_cache_policy *p,
struct policy_work *work,
bool success);
void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
/*
* Called when a cache target is first created. Used to load a
* mapping from the metadata device into the policy.
*/
int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
dm_cblock_t cblock, uint32_t hint, bool hint_valid);
dm_cblock_t cblock, bool dirty,
uint32_t hint, bool hint_valid);
/*
* Drops the mapping, irrespective of whether it's clean or dirty.
* Returns -ENODATA if cblock is not mapped.
*/
int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock);
/*
* Gets the hint for a given cblock. Called in a single threaded
@ -161,36 +108,6 @@ struct dm_cache_policy {
*/
uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock);
/*
* Override functions used on the error paths of the core target.
* They must succeed.
*/
void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock);
void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
dm_oblock_t new_oblock);
/*
* This is called via the invalidate_cblocks message. It is
* possible the particular cblock has already been removed due to a
* write io in passthrough mode, in which case this should return
* -ENODATA.
*/
int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
/*
* Provide a dirty block to be written back by the core target. If
* critical_only is set then the policy should only provide work if
* it urgently needs it.
*
* Returns:
*
* 0 and @cblock,@oblock: block to write back provided
*
* -ENODATA: no dirty blocks available
*/
int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock,
bool critical_only);
/*
* How full is the cache?
*/
@ -202,6 +119,8 @@ struct dm_cache_policy {
* queue merging has occurred). To stop the policy being fooled by
* these, the core target sends regular tick() calls to the policy.
* The policy should only count an entry as hit once per tick.
*
* This method is optional.
*/
void (*tick)(struct dm_cache_policy *p, bool can_block);
@ -213,6 +132,8 @@ struct dm_cache_policy {
int (*set_config_value)(struct dm_cache_policy *p,
const char *key, const char *value);
void (*allow_migrations)(struct dm_cache_policy *p, bool allow);
/*
* Book keeping ptr for the policy register, not for general use.
*/
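With map() and its instruction-style results gone, the core target instead drains migrations through the two calls documented above: get_background_work() hands out a policy_work (or -ENODATA when there is none), and the very same pointer must come back through complete_background_work(). A sketch of that consumer contract against a toy two-entry queue; none of this is the kernel's actual core-target code:

#include <stdio.h>
#include <errno.h>

enum policy_operation { POLICY_PROMOTE, POLICY_DEMOTE, POLICY_WRITEBACK };

struct policy_work {
        enum policy_operation op;
        long oblock, cblock;
};

static struct policy_work queue[] = {
        { POLICY_PROMOTE,   10, 1 },
        { POLICY_WRITEBACK, 20, 2 },
};
static unsigned head;

static int get_background_work(int idle, struct policy_work **result)
{
        (void)idle;                     /* a real policy offers more work when idle */
        if (head == sizeof(queue) / sizeof(queue[0]))
                return -ENODATA;        /* nothing queued, as documented above */
        *result = &queue[head++];
        return 0;
}

static void complete_background_work(struct policy_work *work, int success)
{
        printf("op=%d oblock=%ld done=%d\n", work->op, work->oblock, success);
}

int main(void)
{
        struct policy_work *work;

        while (!get_background_work(1, &work)) {
                /* the core target would copy or write back the block here */
                complete_background_work(work, 1);  /* same pointer, not a copy */
        }
        return 0;
}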

File diff suppressed because it is too large


@ -47,7 +47,7 @@ struct mapped_device {
struct request_queue *queue;
int numa_node_id;
unsigned type;
enum dm_queue_mode type;
/* Protect queue and type against concurrent access. */
struct mutex type_lock;
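This one-line hunk belongs to a series-wide conversion from a bare 'unsigned type' to a dedicated enum, which lets the compiler flag unhandled queue modes in switch statements (see the WARN_ON_ONCE default cases added further down). The enum introduced in include/linux/device-mapper.h by this series looks approximately like this, with values kept compatible with the old defines:

enum dm_queue_mode {
        DM_TYPE_NONE             = 0,
        DM_TYPE_BIO_BASED        = 1,
        DM_TYPE_REQUEST_BASED    = 2,
        DM_TYPE_MQ_REQUEST_BASED = 3,
        DM_TYPE_DAX_BIO_BASED    = 4,
};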

File diff suppressed because it is too large


@ -340,6 +340,7 @@ out:
static struct target_type delay_target = {
.name = "delay",
.version = {1, 2, 1},
.features = DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
.ctr = delay_ctr,
.dtr = delay_dtr,


@ -254,7 +254,6 @@ static struct dm_block_validator sb_validator = {
* Low level metadata handling
*--------------------------------------------------------------*/
#define DM_ERA_METADATA_BLOCK_SIZE 4096
#define DM_ERA_METADATA_CACHE_SIZE 64
#define ERA_MAX_CONCURRENT_LOCKS 5
struct era_metadata {
@ -615,7 +614,6 @@ static int create_persistent_data_objects(struct era_metadata *md,
int r;
md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE,
DM_ERA_METADATA_CACHE_SIZE,
ERA_MAX_CONCURRENT_LOCKS);
if (IS_ERR(md->bm)) {
DMERR("could not create block manager");
@ -961,18 +959,18 @@ static int metadata_commit(struct era_metadata *md)
}
}
r = save_sm_root(md);
if (r) {
DMERR("%s: save_sm_root failed", __func__);
return r;
}
r = dm_tm_pre_commit(md->tm);
if (r) {
DMERR("%s: pre commit failed", __func__);
return r;
}
r = save_sm_root(md);
if (r) {
DMERR("%s: save_sm_root failed", __func__);
return r;
}
r = superblock_lock(md, &sblock);
if (r) {
DMERR("%s: superblock lock failed", __func__);

File diff suppressed because it is too large


@ -37,14 +37,6 @@ struct hash_cell {
struct dm_table *new_map;
};
/*
* A dummy definition to make RCU happy.
* struct dm_table should never be dereferenced in this file.
*/
struct dm_table {
int undefined__;
};
struct vers_iter {
size_t param_size;
struct dm_target_versions *vers, *old_vers;
@ -1268,7 +1260,7 @@ static int populate_table(struct dm_table *table,
return dm_table_complete(table);
}
static bool is_valid_type(unsigned cur, unsigned new)
static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new)
{
if (cur == new ||
(cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED))
@ -1778,12 +1770,12 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
cmd == DM_LIST_VERSIONS_CMD)
return 0;
if ((cmd == DM_DEV_CREATE_CMD)) {
if (cmd == DM_DEV_CREATE_CMD) {
if (!*param->name) {
DMWARN("name not supplied when creating device");
return -EINVAL;
}
} else if ((*param->uuid && *param->name)) {
} else if (*param->uuid && *param->name) {
DMWARN("only supply one of name or uuid, cmd(%u)", cmd);
return -EINVAL;
}
@ -1848,7 +1840,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
if (r)
goto out;
param->data_size = sizeof(*param);
param->data_size = offsetof(struct dm_ioctl, data);
r = fn(param, input_param_size);
if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&

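The data_size change above is subtle: struct dm_ioctl ends in a small inline payload area (a data[7] tail in the uapi header), so sizeof(*param) claims those trailing bytes as valid output while offsetof(struct dm_ioctl, data) counts only the fixed fields actually filled in. A standalone illustration of the difference, using a made-up struct:

#include <stdio.h>
#include <stddef.h>

struct msg {
        unsigned int data_size;         /* bytes of valid output */
        char data[7];                   /* inline payload or padding */
};

int main(void)
{
        /* 4 fixed bytes vs. 4 + 7 (plus any padding) */
        printf("offsetof: %zu\n", offsetof(struct msg, data));
        printf("sizeof:   %zu\n", sizeof(struct msg));
        return 0;
}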

@ -163,6 +163,7 @@ static long linear_direct_access(struct dm_target *ti, sector_t sector,
static struct target_type linear_target = {
.name = "linear",
.version = {1, 3, 0},
.features = DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,


@ -90,7 +90,7 @@ struct multipath {
atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */
atomic_t pg_init_count; /* Number of times pg_init called */
unsigned queue_mode;
enum dm_queue_mode queue_mode;
struct mutex work_mutex;
struct work_struct trigger_event;
@ -111,7 +111,8 @@ typedef int (*action_fn) (struct pgpath *pgpath);
static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);
/*-----------------------------------------------
@ -136,7 +137,7 @@ static struct pgpath *alloc_pgpath(void)
if (pgpath) {
pgpath->is_active = true;
INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work);
}
return pgpath;
@ -297,6 +298,8 @@ static int __pg_init_all_paths(struct multipath *m)
struct pgpath *pgpath;
unsigned long pg_init_delay = 0;
lockdep_assert_held(&m->lock);
if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
return 0;
@ -321,13 +324,16 @@ static int __pg_init_all_paths(struct multipath *m)
return atomic_read(&m->pg_init_in_progress);
}
static void pg_init_all_paths(struct multipath *m)
static int pg_init_all_paths(struct multipath *m)
{
int ret;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
__pg_init_all_paths(m);
ret = __pg_init_all_paths(m);
spin_unlock_irqrestore(&m->lock, flags);
return ret;
}
static void __switch_pg(struct multipath *m, struct priority_group *pg)
@ -436,45 +442,21 @@ failed:
}
/*
* Check whether bios must be queued in the device-mapper core rather
* than here in the target.
*
* If m->queue_if_no_path and m->saved_queue_if_no_path hold the
* same value then we are not between multipath_presuspend()
* and multipath_resume() calls and we have no need to check
* for the DMF_NOFLUSH_SUSPENDING flag.
* dm_report_EIO() is a macro instead of a function to make pr_debug()
* report the function name and line number of the function from which
* it has been invoked.
*/
static bool __must_push_back(struct multipath *m)
{
return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
dm_noflush_suspending(m->ti));
}
static bool must_push_back_rq(struct multipath *m)
{
bool r;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
r = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
__must_push_back(m));
spin_unlock_irqrestore(&m->lock, flags);
return r;
}
static bool must_push_back_bio(struct multipath *m)
{
bool r;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
r = __must_push_back(m);
spin_unlock_irqrestore(&m->lock, flags);
return r;
}
#define dm_report_EIO(m) \
({ \
struct mapped_device *md = dm_table_get_md((m)->ti->table); \
\
pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
dm_device_name(md), \
test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \
test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
dm_noflush_suspending((m)->ti)); \
-EIO; \
})
/*
* Map cloned requests (request-based multipath)
@ -484,11 +466,11 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
struct request **__clone)
{
struct multipath *m = ti->private;
int r = DM_MAPIO_REQUEUE;
size_t nr_bytes = blk_rq_bytes(rq);
struct pgpath *pgpath;
struct block_device *bdev;
struct dm_mpath_io *mpio = get_mpio(map_context);
struct request_queue *q;
struct request *clone;
/* Do we need to select a new pgpath? */
@ -497,13 +479,14 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
pgpath = choose_pgpath(m, nr_bytes);
if (!pgpath) {
if (must_push_back_rq(m))
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
return DM_MAPIO_DELAY_REQUEUE;
return -EIO; /* Failed */
return dm_report_EIO(m); /* Failed */
} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
pg_init_all_paths(m);
return r;
if (pg_init_all_paths(m))
return DM_MAPIO_DELAY_REQUEUE;
return DM_MAPIO_REQUEUE;
}
memset(mpio, 0, sizeof(*mpio));
@ -511,13 +494,19 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
mpio->nr_bytes = nr_bytes;
bdev = pgpath->path.dev->bdev;
clone = blk_get_request(bdev_get_queue(bdev),
rq->cmd_flags | REQ_NOMERGE,
GFP_ATOMIC);
q = bdev_get_queue(bdev);
clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC);
if (IS_ERR(clone)) {
/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
return r;
bool queue_dying = blk_queue_dying(q);
DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing",
PTR_ERR(clone), queue_dying ? " (path offline)" : "");
if (queue_dying) {
atomic_inc(&m->pg_init_in_progress);
activate_or_offline_path(pgpath);
return DM_MAPIO_REQUEUE;
}
return DM_MAPIO_DELAY_REQUEUE;
}
clone->bio = clone->biotail = NULL;
clone->rq_disk = bdev->bd_disk;
@ -567,9 +556,9 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
}
if (!pgpath) {
if (!must_push_back_bio(m))
return -EIO;
return DM_MAPIO_REQUEUE;
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
return DM_MAPIO_REQUEUE;
return dm_report_EIO(m);
}
mpio->pgpath = pgpath;
@ -640,6 +629,14 @@ static void process_queued_bios(struct work_struct *work)
blk_finish_plug(&plug);
}
static void assign_bit(bool value, long nr, unsigned long *addr)
{
if (value)
set_bit(nr, addr);
else
clear_bit(nr, addr);
}
/*
* If we run out of usable paths, should we queue I/O or error it?
*/
@ -649,23 +646,11 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
if (save_old_value) {
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
else
clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
} else {
if (queue_if_no_path)
set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
else
clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
}
if (queue_if_no_path)
set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
else
clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
assign_bit((save_old_value && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) ||
(!save_old_value && queue_if_no_path),
MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
assign_bit(queue_if_no_path || dm_noflush_suspending(m->ti),
MPATHF_QUEUE_IF_NO_PATH, &m->flags);
spin_unlock_irqrestore(&m->lock, flags);
if (!queue_if_no_path) {
@ -1438,10 +1423,8 @@ out:
spin_unlock_irqrestore(&m->lock, flags);
}
static void activate_path(struct work_struct *work)
static void activate_or_offline_path(struct pgpath *pgpath)
{
struct pgpath *pgpath =
container_of(work, struct pgpath, activate_path.work);
struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
if (pgpath->is_active && !blk_queue_dying(q))
@ -1450,6 +1433,14 @@ static void activate_path(struct work_struct *work)
pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
}
static void activate_path_work(struct work_struct *work)
{
struct pgpath *pgpath =
container_of(work, struct pgpath, activate_path.work);
activate_or_offline_path(pgpath);
}
static int noretry_error(int error)
{
switch (error) {
@ -1501,12 +1492,9 @@ static int do_end_io(struct multipath *m, struct request *clone,
if (mpio->pgpath)
fail_path(mpio->pgpath);
if (!atomic_read(&m->nr_valid_paths)) {
if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
if (!must_push_back_rq(m))
r = -EIO;
}
}
if (atomic_read(&m->nr_valid_paths) == 0 &&
!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
r = dm_report_EIO(m);
return r;
}
@ -1547,13 +1535,9 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone,
if (mpio->pgpath)
fail_path(mpio->pgpath);
if (!atomic_read(&m->nr_valid_paths)) {
if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
if (!must_push_back_bio(m))
return -EIO;
return DM_ENDIO_REQUEUE;
}
}
if (atomic_read(&m->nr_valid_paths) == 0 &&
!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
return dm_report_EIO(m);
/* Queue for the daemon to resubmit */
dm_bio_restore(get_bio_details_from_bio(clone), clone);
@ -1619,10 +1603,8 @@ static void multipath_resume(struct dm_target *ti)
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags))
set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
else
clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
assign_bit(test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags),
MPATHF_QUEUE_IF_NO_PATH, &m->flags);
spin_unlock_irqrestore(&m->lock, flags);
}
@ -1682,6 +1664,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
case DM_TYPE_MQ_REQUEST_BASED:
DMEMIT("queue_mode mq ");
break;
default:
WARN_ON_ONCE(true);
break;
}
}
}
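As the comment in the hunk above notes, dm_report_EIO() stays a macro so the debug message is generated at the expansion site and can name the function that actually gave up on the request. A small userspace rendition of the same statement-expression idiom (a GCC/Clang extension, used throughout the kernel; everything here is invented for illustration):

#include <stdio.h>

#define report_eio()                                                    \
({                                                                      \
        fprintf(stderr, "%s:%d: returning EIO\n", __func__, __LINE__);  \
        -5;     /* -EIO */                                              \
})

static int do_io(void)
{
        return report_eio();            /* logs "do_io:<line>: ..." */
}

int main(void)
{
        return do_io() < 0 ? 1 : 0;
}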


@ -1,6 +1,6 @@
/*
* Copyright (C) 2010-2011 Neil Brown
* Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
* Copyright (C) 2010-2017 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@ -79,7 +79,10 @@ struct raid_dev {
#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
/* New for v1.10.0 */
#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */
#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6 (journal device)! */
/* New for v1.11.1 */
#define __CTR_FLAG_JOURNAL_MODE 16 /* 2 */ /* Only with raid4/5/6 (journal mode)! */
/*
* Flags for rs->ctr_flags field.
@ -100,6 +103,7 @@ struct raid_dev {
#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
#define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE)
#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
@ -175,7 +179,8 @@ struct raid_dev {
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
CTR_FLAG_DATA_OFFSET | \
CTR_FLAG_JOURNAL_DEV)
CTR_FLAG_JOURNAL_DEV | \
CTR_FLAG_JOURNAL_MODE)
#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
CTR_FLAG_REBUILD | \
@ -186,7 +191,8 @@ struct raid_dev {
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
CTR_FLAG_DATA_OFFSET | \
CTR_FLAG_JOURNAL_DEV)
CTR_FLAG_JOURNAL_DEV | \
CTR_FLAG_JOURNAL_MODE)
/* ...valid options definitions per raid level */
/*
@ -239,6 +245,7 @@ struct raid_set {
struct journal_dev {
struct dm_dev *dev;
struct md_rdev rdev;
int mode;
} journal_dev;
struct raid_dev dev[0];
@ -326,6 +333,7 @@ static struct arg_name_flag {
{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
{ CTR_FLAG_JOURNAL_DEV, "journal_dev" },
{ CTR_FLAG_JOURNAL_MODE, "journal_mode" },
};
/* Return argument name string for given @flag */
@ -344,6 +352,39 @@ static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
return NULL;
}
/* Define correlation of raid456 journal cache modes and dm-raid target line parameters */
static struct {
const int mode;
const char *param;
} _raid456_journal_mode[] = {
{ R5C_JOURNAL_MODE_WRITE_THROUGH , "writethrough" },
{ R5C_JOURNAL_MODE_WRITE_BACK , "writeback" }
};
/* Return MD raid4/5/6 journal mode for dm @journal_mode one */
static int dm_raid_journal_mode_to_md(const char *mode)
{
int m = ARRAY_SIZE(_raid456_journal_mode);
while (m--)
if (!strcasecmp(mode, _raid456_journal_mode[m].param))
return _raid456_journal_mode[m].mode;
return -EINVAL;
}
/* Return dm-raid raid4/5/6 journal mode string for @mode */
static const char *md_journal_mode_to_dm_raid(const int mode)
{
int m = ARRAY_SIZE(_raid456_journal_mode);
while (m--)
if (mode == _raid456_journal_mode[m].mode)
return _raid456_journal_mode[m].param;
return "unknown";
}
/*
* Bool helpers to test for various raid levels of a raid set.
* Its level as reported by the superblock rather than
@ -1183,7 +1224,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
continue;
}
/* "journal_dev dev" */
/* "journal_dev <dev>" */
if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
int r;
struct md_rdev *jdev;
@ -1211,10 +1252,32 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
rs->ti->error = "No space for raid4/5/6 journal";
return -ENOSPC;
}
rs->journal_dev.mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
set_bit(Journal, &jdev->flags);
continue;
}
/* "journal_mode <mode>" ("journal_dev" mandatory!) */
if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE))) {
int r;
if (!test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
rs->ti->error = "raid4/5/6 'journal_mode' is invalid without 'journal_dev'";
return -EINVAL;
}
if (test_and_set_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
rs->ti->error = "Only one raid4/5/6 'journal_mode' argument allowed";
return -EINVAL;
}
r = dm_raid_journal_mode_to_md(arg);
if (r < 0) {
rs->ti->error = "Invalid 'journal_mode' argument";
return r;
}
rs->journal_dev.mode = r;
continue;
}
/*
* Parameters with number values from here on.
*/
@ -3076,6 +3139,16 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs->callbacks.congested_fn = raid_is_congested;
dm_table_add_target_callbacks(ti->table, &rs->callbacks);
/* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */
if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) {
r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode);
if (r) {
ti->error = "Failed to set raid4/5/6 journal mode";
mddev_unlock(&rs->md);
goto bad_journal_mode_set;
}
}
mddev_suspend(&rs->md);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
@ -3109,6 +3182,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
mddev_unlock(&rs->md);
return 0;
bad_journal_mode_set:
bad_stripe_cache:
bad_check_reshape:
md_stop(&rs->md);
@ -3180,18 +3254,18 @@ static const char *decipher_sync_action(struct mddev *mddev)
* Status characters:
*
* 'D' = Dead/Failed raid set component or raid4/5/6 journal device
* 'a' = Alive but not in-sync
* 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
* 'a' = Alive but not in-sync raid set component _or_ alive raid4/5/6 'write_back' journal device
* 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
* '-' = Non-existing device (i.e. userspace passed '- -' into the ctr)
*/
static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
{
if (!rdev->bdev)
return "-";
else if (test_bit(Faulty, &rdev->flags))
return "D";
else if (test_bit(Journal, &rdev->flags))
return "A";
return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
return "a";
else
@ -3315,7 +3389,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
for (i = 0; i < rs->raid_disks; i++)
DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
/*
* In-sync/Reshape ratio:
@ -3366,7 +3440,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* v1.10.0+:
*/
DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
__raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
__raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
break;
case STATUSTYPE_TABLE:
@ -3381,39 +3455,30 @@ static void raid_status(struct dm_target *ti, status_type_t type,
write_mostly_params +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
(test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
(test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
(test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
/* Emit table line */
/* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
raid10_md_layout_to_format(mddev->layout));
if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
raid10_md_layout_to_copies(mddev->layout));
if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
(unsigned long long) to_sector(mddev->bitmap_info.chunksize));
if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
(unsigned long long) rs->data_offset);
if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
mddev->bitmap_info.daemon_sleep);
if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
max(rs->delta_disks, mddev->delta_disks));
if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
max_nr_stripes);
if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (rebuild_disks)
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks))
DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
rs->dev[i].rdev.raid_disk);
if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
mddev->bitmap_info.daemon_sleep);
if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
mddev->sync_speed_min);
if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
mddev->sync_speed_max);
if (write_mostly_params)
for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
@ -3422,15 +3487,30 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
mddev->bitmap_info.max_write_behind);
if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
mddev->sync_speed_max);
if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
mddev->sync_speed_min);
if (test_bit(__CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
max_nr_stripes);
if (test_bit(__CTR_FLAG_REGION_SIZE, &rs->ctr_flags))
DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
(unsigned long long) to_sector(mddev->bitmap_info.chunksize));
if (test_bit(__CTR_FLAG_RAID10_COPIES, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
raid10_md_layout_to_copies(mddev->layout));
if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
raid10_md_layout_to_format(mddev->layout));
if (test_bit(__CTR_FLAG_DELTA_DISKS, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
max(rs->delta_disks, mddev->delta_disks));
if (test_bit(__CTR_FLAG_DATA_OFFSET, &rs->ctr_flags))
DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
(unsigned long long) rs->data_offset);
if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
__get_dev_name(rs->journal_dev.dev));
if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags))
DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_MODE),
md_journal_mode_to_dm_raid(rs->journal_dev.mode));
DMEMIT(" %d", rs->raid_disks);
for (i = 0; i < rs->raid_disks; i++)
DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@ -3791,7 +3871,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
.version = {1, 10, 1},
.version = {1, 11, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,

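The journal_mode support added above reduces to a table lookup in both directions: the dm-raid parameter string maps to an MD r5c journal mode at parse time and maps back when the table line is emitted for status. A self-contained model of that round trip, with two local constants standing in for the real R5C_JOURNAL_MODE_* values:

#include <stdio.h>
#include <strings.h>
#include <errno.h>

enum { MODE_WRITE_THROUGH, MODE_WRITE_BACK };

static const struct {
        int mode;
        const char *param;
} journal_mode[] = {
        { MODE_WRITE_THROUGH, "writethrough" },
        { MODE_WRITE_BACK,    "writeback"    },
};

static int to_md_mode(const char *arg)          /* parse direction */
{
        size_t m = sizeof(journal_mode) / sizeof(journal_mode[0]);

        while (m--)
                if (!strcasecmp(arg, journal_mode[m].param))
                        return journal_mode[m].mode;
        return -EINVAL;
}

static const char *to_dm_param(int mode)        /* status direction */
{
        size_t m = sizeof(journal_mode) / sizeof(journal_mode[0]);

        while (m--)
                if (mode == journal_mode[m].mode)
                        return journal_mode[m].param;
        return "unknown";
}

int main(void)
{
        int mode = to_md_mode("WriteBack");     /* match is case-insensitive */

        printf("%d -> %s\n", mode, to_dm_param(mode)); /* 1 -> writeback */
        return 0;
}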

@ -280,7 +280,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
if (!rq->q->mq_ops)
dm_old_requeue_request(rq);
else
dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0);
dm_mq_delay_requeue_request(rq, delay_requeue ? 100/*ms*/ : 0);
rq_completed(md, rw, false);
}
@ -815,10 +815,14 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
dm_init_md_queue(md);
/* backfill 'mq' sysfs registration normally done in blk_register_queue */
blk_mq_register_dev(disk_to_dev(md->disk), q);
err = blk_mq_register_dev(disk_to_dev(md->disk), q);
if (err)
goto out_cleanup_queue;
return 0;
out_cleanup_queue:
blk_cleanup_queue(q);
out_tag_set:
blk_mq_free_tag_set(md->tag_set);
out_kfree_tag_set:

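The dm_mq_init_request_queue() hunk stops ignoring the return value of blk_mq_register_dev() and extends the function's unwind chain accordingly: a late failure now tears down the queue before the earlier labels release the tag set. A minimal model of that labeled-cleanup pattern, with malloc/free standing in for the block-layer resources:

#include <stdlib.h>

static int setup(void)
{
        void *tag_set, *queue;

        tag_set = malloc(16);
        if (!tag_set)
                goto out;
        queue = malloc(64);
        if (!queue)
                goto out_free_tag_set;
        if (0)  /* a late step, such as device registration, failing */
                goto out_cleanup_queue;

        free(queue);                    /* demo only: release and succeed */
        free(tag_set);
        return 0;

out_cleanup_queue:                      /* undo in reverse order of setup */
        free(queue);
out_free_tag_set:
        free(tag_set);
out:
        return -1;
}

int main(void)
{
        return setup() ? 1 : 0;
}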

@ -442,6 +442,7 @@ static void stripe_io_hints(struct dm_target *ti,
static struct target_type stripe_target = {
.name = "striped",
.version = {1, 6, 0},
.features = DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
.ctr = stripe_ctr,
.dtr = stripe_dtr,


@ -30,7 +30,7 @@
struct dm_table {
struct mapped_device *md;
unsigned type;
enum dm_queue_mode type;
/* btree table */
unsigned int depth;
@ -47,6 +47,7 @@ struct dm_table {
bool integrity_supported:1;
bool singleton:1;
bool all_blk_mq:1;
unsigned integrity_added:1;
/*
* Indicates the rw permissions for the new logical
@ -372,7 +373,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
*/
dev_t dm_get_dev_t(const char *path)
{
dev_t uninitialized_var(dev);
dev_t dev;
struct block_device *bdev;
bdev = lookup_bdev(path);
@ -626,13 +627,13 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
struct dm_target *uninitialized_var(ti);
struct queue_limits ti_limits;
unsigned i = 0;
unsigned i;
/*
* Check each entry in the table in turn.
*/
while (i < dm_table_get_num_targets(table)) {
ti = dm_table_get_target(table, i++);
for (i = 0; i < dm_table_get_num_targets(table); i++) {
ti = dm_table_get_target(table, i);
blk_set_stacking_limits(&ti_limits);
@ -725,6 +726,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
t->immutable_target_type = tgt->type;
}
if (dm_target_has_integrity(tgt->type))
t->integrity_added = 1;
tgt->table = t;
tgt->begin = start;
tgt->len = len;
@ -821,19 +825,19 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
}
EXPORT_SYMBOL(dm_consume_args);
static bool __table_type_bio_based(unsigned table_type)
static bool __table_type_bio_based(enum dm_queue_mode table_type)
{
return (table_type == DM_TYPE_BIO_BASED ||
table_type == DM_TYPE_DAX_BIO_BASED);
}
static bool __table_type_request_based(unsigned table_type)
static bool __table_type_request_based(enum dm_queue_mode table_type)
{
return (table_type == DM_TYPE_REQUEST_BASED ||
table_type == DM_TYPE_MQ_REQUEST_BASED);
}
void dm_table_set_type(struct dm_table *t, unsigned type)
void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
{
t->type = type;
}
@ -850,11 +854,11 @@ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
static bool dm_table_supports_dax(struct dm_table *t)
{
struct dm_target *ti;
unsigned i = 0;
unsigned i;
/* Ensure that all targets support DAX. */
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
if (!ti->type->direct_access)
return false;
@ -875,7 +879,7 @@ static int dm_table_determine_type(struct dm_table *t)
struct dm_target *tgt;
struct dm_dev_internal *dd;
struct list_head *devices = dm_table_get_devices(t);
unsigned live_md_type = dm_get_md_type(t->md);
enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
if (t->type != DM_TYPE_NONE) {
/* target already set the table's type */
@ -984,7 +988,7 @@ verify_rq_based:
return 0;
}
unsigned dm_table_get_type(struct dm_table *t)
enum dm_queue_mode dm_table_get_type(struct dm_table *t)
{
return t->type;
}
@ -1006,11 +1010,11 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
{
struct dm_target *uninitialized_var(ti);
unsigned i = 0;
struct dm_target *ti;
unsigned i;
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
if (dm_target_is_wildcard(ti->type))
return ti;
}
@ -1035,7 +1039,7 @@ bool dm_table_all_blk_mq_devices(struct dm_table *t)
static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
unsigned type = dm_table_get_type(t);
enum dm_queue_mode type = dm_table_get_type(t);
unsigned per_io_data_size = 0;
struct dm_target *tgt;
unsigned i;
@ -1131,6 +1135,13 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t)
struct list_head *devices = dm_table_get_devices(t);
struct dm_dev_internal *dd = NULL;
struct gendisk *prev_disk = NULL, *template_disk = NULL;
unsigned i;
for (i = 0; i < dm_table_get_num_targets(t); i++) {
struct dm_target *ti = dm_table_get_target(t, i);
if (!dm_target_passes_integrity(ti->type))
goto no_integrity;
}
list_for_each_entry(dd, devices, list) {
template_disk = dd->dm_dev->bdev->bd_disk;
@ -1168,6 +1179,10 @@ static int dm_table_register_integrity(struct dm_table *t)
struct mapped_device *md = t->md;
struct gendisk *template_disk = NULL;
/* If target handles integrity itself do not register it here. */
if (t->integrity_added)
return 0;
template_disk = dm_table_get_integrity_disk(t);
if (!template_disk)
return 0;
@ -1313,15 +1328,16 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev,
*/
bool dm_table_has_no_data_devices(struct dm_table *table)
{
struct dm_target *uninitialized_var(ti);
unsigned i = 0, num_devices = 0;
struct dm_target *ti;
unsigned i, num_devices;
while (i < dm_table_get_num_targets(table)) {
ti = dm_table_get_target(table, i++);
for (i = 0; i < dm_table_get_num_targets(table); i++) {
ti = dm_table_get_target(table, i);
if (!ti->type->iterate_devices)
return false;
num_devices = 0;
ti->type->iterate_devices(ti, count_device, &num_devices);
if (num_devices)
return false;
@ -1336,16 +1352,16 @@ bool dm_table_has_no_data_devices(struct dm_table *table)
int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits *limits)
{
struct dm_target *uninitialized_var(ti);
struct dm_target *ti;
struct queue_limits ti_limits;
unsigned i = 0;
unsigned i;
blk_set_stacking_limits(limits);
while (i < dm_table_get_num_targets(table)) {
for (i = 0; i < dm_table_get_num_targets(table); i++) {
blk_set_stacking_limits(&ti_limits);
ti = dm_table_get_target(table, i++);
ti = dm_table_get_target(table, i);
if (!ti->type->iterate_devices)
goto combine_limits;
@ -1394,6 +1410,9 @@ static void dm_table_verify_integrity(struct dm_table *t)
{
struct gendisk *template_disk = NULL;
if (t->integrity_added)
return;
if (t->integrity_supported) {
/*
* Verify that the original integrity profile
@ -1424,7 +1443,7 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
{
struct dm_target *ti;
unsigned i = 0;
unsigned i;
/*
* Require at least one underlying device to support flushes.
@ -1432,8 +1451,8 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
* so we need to use iterate_devices here, which targets
* supporting flushes must provide.
*/
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
if (!ti->num_flush_bios)
continue;
@ -1477,10 +1496,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
iterate_devices_callout_fn func)
{
struct dm_target *ti;
unsigned i = 0;
unsigned i;
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
if (!ti->type->iterate_devices ||
!ti->type->iterate_devices(ti, func, NULL))
@ -1501,10 +1520,10 @@ static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *de
static bool dm_table_supports_write_same(struct dm_table *t)
{
struct dm_target *ti;
unsigned i = 0;
unsigned i;
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
if (!ti->num_write_same_bios)
return false;
@ -1556,7 +1575,7 @@ static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
static bool dm_table_supports_discards(struct dm_table *t)
{
struct dm_target *ti;
unsigned i = 0;
unsigned i;
/*
* Unless any target used by the table set discards_supported,
@ -1565,8 +1584,8 @@ static bool dm_table_supports_discards(struct dm_table *t)
* so we need to use iterate_devices here, which targets
* supporting discard selectively must provide.
*/
while (i < dm_table_get_num_targets(t)) {
ti = dm_table_get_target(t, i++);
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
if (!ti->num_discard_bios)
continue;
@ -1672,6 +1691,8 @@ static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
int i = t->num_targets;
struct dm_target *ti = t->targets;
lockdep_assert_held(&t->md->suspend_lock);
while (i--) {
switch (mode) {
case PRESUSPEND:
@ -1719,6 +1740,8 @@ int dm_table_resume_targets(struct dm_table *t)
{
int i, r = 0;
lockdep_assert_held(&t->md->suspend_lock);
for (i = 0; i < t->num_targets; i++) {
struct dm_target *ti = t->targets + i;

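Most of the dm-table churn above converts open-coded 'while (i < num_targets) { ti = get(i++); }' walks into plain for-loops, which also removes the need for uninitialized_var() annotations on the target pointer. The common shape of these all-targets-satisfy-a-predicate helpers, modeled in userspace with invented types:

#include <stdbool.h>
#include <stdio.h>

struct target {
        bool supports_flush;
};

static bool table_supports(const struct target *targets, unsigned int n,
                           bool (*pred)(const struct target *))
{
        unsigned int i;

        for (i = 0; i < n; i++)         /* i and the target always initialized */
                if (!pred(&targets[i]))
                        return false;
        return true;
}

static bool can_flush(const struct target *t)
{
        return t->supports_flush;
}

int main(void)
{
        struct target tgts[] = { { true }, { true } };

        printf("flush supported: %d\n", table_supports(tgts, 2, can_flush));
        return 0;
}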

@ -77,7 +77,6 @@
#define THIN_SUPERBLOCK_MAGIC 27022010
#define THIN_SUPERBLOCK_LOCATION 0
#define THIN_VERSION 2
#define THIN_METADATA_CACHE_SIZE 64
#define SECTOR_TO_BLOCK_SHIFT 3
/*
@ -686,7 +685,6 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
int r;
pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
THIN_METADATA_CACHE_SIZE,
THIN_MAX_CONCURRENT_LOCKS);
if (IS_ERR(pmd->bm)) {
DMERR("could not create block manager");


@ -5,7 +5,7 @@
*/
#include "dm-thin-metadata.h"
#include "dm-bio-prison.h"
#include "dm-bio-prison-v1.h"
#include "dm.h"
#include <linux/device-mapper.h>
@ -1069,6 +1069,7 @@ static void passdown_endio(struct bio *bio)
* to unmap (we ignore err).
*/
queue_passdown_pt2(bio->bi_private);
bio_put(bio);
}
static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)

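The one-line bio_put() above is the whole fix: the bio that passdown_endio() completes was allocated for the discard passdown, and nothing ever dropped its last reference, so one bio leaked per passed-down discard. A toy refcount model of that ownership rule (names invented):

#include <stdlib.h>

struct bio {
        int refcnt;
};

static struct bio *bio_alloc_demo(void)
{
        struct bio *b = malloc(sizeof(*b));

        if (!b)
                abort();
        b->refcnt = 1;                  /* reference owned by the submitter */
        return b;
}

static void bio_put_demo(struct bio *b)
{
        if (--b->refcnt == 0)
                free(b);
}

static void passdown_endio_demo(struct bio *b)
{
        /* ... queue part two of the passdown ... */
        bio_put_demo(b);                /* the fix: drop our reference here */
}

int main(void)
{
        passdown_endio_demo(bio_alloc_demo());
        return 0;
}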

@ -188,7 +188,7 @@ error:
static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
u8 *want_digest, u8 *data)
{
if (unlikely(verity_hash(v, verity_io_hash_desc(v, io),
if (unlikely(verity_hash(v, verity_io_hash_req(v, io),
data, 1 << v->data_dev_block_bits,
verity_io_real_digest(v, io))))
return 0;
@ -397,7 +397,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
}
/* Always re-validate the corrected block against the expected hash */
r = verity_hash(v, verity_io_hash_desc(v, io), fio->output,
r = verity_hash(v, verity_io_hash_req(v, io), fio->output,
1 << v->data_dev_block_bits,
verity_io_real_digest(v, io));
if (unlikely(r < 0))


@ -93,81 +93,123 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
}
/*
* Wrapper for crypto_shash_init, which handles verity salting.
* Callback function for asynchronous crypto API completion notification
*/
static int verity_hash_init(struct dm_verity *v, struct shash_desc *desc)
static void verity_op_done(struct crypto_async_request *base, int err)
{
struct verity_result *res = (struct verity_result *)base->data;
if (err == -EINPROGRESS)
return;
res->err = err;
complete(&res->completion);
}
/*
* Wait for async crypto API callback
*/
static inline int verity_complete_op(struct verity_result *res, int ret)
{
switch (ret) {
case 0:
break;
case -EINPROGRESS:
case -EBUSY:
ret = wait_for_completion_interruptible(&res->completion);
if (!ret)
ret = res->err;
reinit_completion(&res->completion);
break;
default:
DMERR("verity_wait_hash: crypto op submission failed: %d", ret);
}
if (unlikely(ret < 0))
DMERR("verity_wait_hash: crypto op failed: %d", ret);
return ret;
}
static int verity_hash_update(struct dm_verity *v, struct ahash_request *req,
const u8 *data, size_t len,
struct verity_result *res)
{
struct scatterlist sg;
sg_init_one(&sg, data, len);
ahash_request_set_crypt(req, &sg, NULL, len);
return verity_complete_op(res, crypto_ahash_update(req));
}
/*
* Wrapper for crypto_ahash_init, which handles verity salting.
*/
static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
struct verity_result *res)
{
int r;
desc->tfm = v->tfm;
desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
ahash_request_set_tfm(req, v->tfm);
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
CRYPTO_TFM_REQ_MAY_BACKLOG,
verity_op_done, (void *)res);
init_completion(&res->completion);
r = crypto_shash_init(desc);
r = verity_complete_op(res, crypto_ahash_init(req));
if (unlikely(r < 0)) {
DMERR("crypto_shash_init failed: %d", r);
DMERR("crypto_ahash_init failed: %d", r);
return r;
}
if (likely(v->version >= 1)) {
r = crypto_shash_update(desc, v->salt, v->salt_size);
if (unlikely(r < 0)) {
DMERR("crypto_shash_update failed: %d", r);
return r;
}
}
return 0;
}
static int verity_hash_update(struct dm_verity *v, struct shash_desc *desc,
const u8 *data, size_t len)
{
int r = crypto_shash_update(desc, data, len);
if (unlikely(r < 0))
DMERR("crypto_shash_update failed: %d", r);
if (likely(v->version >= 1))
r = verity_hash_update(v, req, v->salt, v->salt_size, res);
return r;
}
static int verity_hash_final(struct dm_verity *v, struct shash_desc *desc,
u8 *digest)
static int verity_hash_final(struct dm_verity *v, struct ahash_request *req,
u8 *digest, struct verity_result *res)
{
int r;
if (unlikely(!v->version)) {
r = crypto_shash_update(desc, v->salt, v->salt_size);
r = verity_hash_update(v, req, v->salt, v->salt_size, res);
if (r < 0) {
DMERR("crypto_shash_update failed: %d", r);
return r;
DMERR("verity_hash_final failed updating salt: %d", r);
goto out;
}
}
r = crypto_shash_final(desc, digest);
if (unlikely(r < 0))
DMERR("crypto_shash_final failed: %d", r);
ahash_request_set_crypt(req, NULL, digest, 0);
r = verity_complete_op(res, crypto_ahash_final(req));
out:
return r;
}
int verity_hash(struct dm_verity *v, struct shash_desc *desc,
int verity_hash(struct dm_verity *v, struct ahash_request *req,
const u8 *data, size_t len, u8 *digest)
{
int r;
struct verity_result res;
r = verity_hash_init(v, desc);
r = verity_hash_init(v, req, &res);
if (unlikely(r < 0))
return r;
goto out;
r = verity_hash_update(v, desc, data, len);
r = verity_hash_update(v, req, data, len, &res);
if (unlikely(r < 0))
return r;
goto out;
return verity_hash_final(v, desc, digest);
r = verity_hash_final(v, req, digest, &res);
out:
return r;
}
static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
@ -275,7 +317,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io,
goto release_ret_r;
}
r = verity_hash(v, verity_io_hash_desc(v, io),
r = verity_hash(v, verity_io_hash_req(v, io),
data, 1 << v->hash_dev_block_bits,
verity_io_real_digest(v, io));
if (unlikely(r < 0))
@ -343,6 +385,49 @@ out:
return r;
}
/*
* Calculates the digest for the given bio
*/
int verity_for_io_block(struct dm_verity *v, struct dm_verity_io *io,
struct bvec_iter *iter, struct verity_result *res)
{
unsigned int todo = 1 << v->data_dev_block_bits;
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
struct scatterlist sg;
struct ahash_request *req = verity_io_hash_req(v, io);
do {
int r;
unsigned int len;
struct bio_vec bv = bio_iter_iovec(bio, *iter);
sg_init_table(&sg, 1);
len = bv.bv_len;
if (likely(len >= todo))
len = todo;
/*
* Operating on a single page at a time looks suboptimal
* until you consider the typical block size is 4,096B.
* Going through this loop twice should be very rare.
*/
sg_set_page(&sg, bv.bv_page, len, bv.bv_offset);
ahash_request_set_crypt(req, &sg, NULL, len);
r = verity_complete_op(res, crypto_ahash_update(req));
if (unlikely(r < 0)) {
DMERR("verity_for_io_block crypto op failed: %d", r);
return r;
}
bio_advance_iter(bio, iter, len);
todo -= len;
} while (todo);
return 0;
}
/*
* Calls function process for 1 << v->data_dev_block_bits bytes in the bio_vec
* starting from iter.
@ -381,12 +466,6 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
return 0;
}
static int verity_bv_hash_update(struct dm_verity *v, struct dm_verity_io *io,
u8 *data, size_t len)
{
return verity_hash_update(v, verity_io_hash_desc(v, io), data, len);
}
static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
u8 *data, size_t len)
{
@ -403,10 +482,11 @@ static int verity_verify_io(struct dm_verity_io *io)
struct dm_verity *v = io->v;
struct bvec_iter start;
unsigned b;
struct verity_result res;
for (b = 0; b < io->n_blocks; b++) {
int r;
struct shash_desc *desc = verity_io_hash_desc(v, io);
struct ahash_request *req = verity_io_hash_req(v, io);
r = verity_hash_for_block(v, io, io->block + b,
verity_io_want_digest(v, io),
@ -427,16 +507,17 @@ static int verity_verify_io(struct dm_verity_io *io)
continue;
}
r = verity_hash_init(v, desc);
r = verity_hash_init(v, req, &res);
if (unlikely(r < 0))
return r;
start = io->iter;
r = verity_for_bv_block(v, io, &io->iter, verity_bv_hash_update);
r = verity_for_io_block(v, io, &io->iter, &res);
if (unlikely(r < 0))
return r;
r = verity_hash_final(v, desc, verity_io_real_digest(v, io));
r = verity_hash_final(v, req, verity_io_real_digest(v, io),
&res);
if (unlikely(r < 0))
return r;
@ -705,7 +786,7 @@ static void verity_dtr(struct dm_target *ti)
kfree(v->zero_digest);
if (v->tfm)
crypto_free_shash(v->tfm);
crypto_free_ahash(v->tfm);
kfree(v->alg_name);
@ -723,7 +804,7 @@ static void verity_dtr(struct dm_target *ti)
static int verity_alloc_zero_digest(struct dm_verity *v)
{
int r = -ENOMEM;
struct shash_desc *desc;
struct ahash_request *req;
u8 *zero_data;
v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL);
@ -731,9 +812,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
if (!v->zero_digest)
return r;
desc = kmalloc(v->shash_descsize, GFP_KERNEL);
req = kmalloc(v->ahash_reqsize, GFP_KERNEL);
if (!desc)
if (!req)
return r; /* verity_dtr will free zero_digest */
zero_data = kzalloc(1 << v->data_dev_block_bits, GFP_KERNEL);
@ -741,11 +822,11 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
if (!zero_data)
goto out;
r = verity_hash(v, desc, zero_data, 1 << v->data_dev_block_bits,
r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits,
v->zero_digest);
out:
kfree(desc);
kfree(req);
kfree(zero_data);
return r;
@ -923,21 +1004,21 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
v->tfm = crypto_alloc_ahash(v->alg_name, 0, 0);
if (IS_ERR(v->tfm)) {
ti->error = "Cannot initialize hash function";
r = PTR_ERR(v->tfm);
v->tfm = NULL;
goto bad;
}
v->digest_size = crypto_shash_digestsize(v->tfm);
v->digest_size = crypto_ahash_digestsize(v->tfm);
if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
ti->error = "Digest size too big";
r = -EINVAL;
goto bad;
}
v->shash_descsize =
sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
v->ahash_reqsize = sizeof(struct ahash_request) +
crypto_ahash_reqsize(v->tfm);
v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
if (!v->root_digest) {
@ -1037,7 +1118,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
ti->per_io_data_size = sizeof(struct dm_verity_io) +
v->shash_descsize + v->digest_size * 2;
v->ahash_reqsize + v->digest_size * 2;
r = verity_fec_ctr(v);
if (r)

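Every ahash call in the converted code funnels through verity_complete_op(): a synchronous return is used as-is, while -EINPROGRESS or -EBUSY parks the caller on a completion that verity_op_done() later signals with the final status. A userspace model of that wait-for-callback pattern, with pthreads standing in for the crypto engine (all names invented):

#include <stdio.h>
#include <pthread.h>

#define OP_IN_PROGRESS 1                /* plays the role of -EINPROGRESS */

static struct {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done, err;
} res = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 };

static void *engine(void *arg)          /* plays verity_op_done() */
{
        (void)arg;
        pthread_mutex_lock(&res.lock);
        res.err = 0;                    /* final status of the async op */
        res.done = 1;
        pthread_cond_signal(&res.cond);
        pthread_mutex_unlock(&res.lock);
        return NULL;
}

static int complete_op(int ret)         /* plays verity_complete_op() */
{
        if (ret != OP_IN_PROGRESS)
                return ret;             /* finished synchronously */
        pthread_mutex_lock(&res.lock);
        while (!res.done)
                pthread_cond_wait(&res.cond, &res.lock);
        res.done = 0;                   /* mirrors reinit_completion() */
        ret = res.err;
        pthread_mutex_unlock(&res.lock);
        return ret;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, engine, NULL); /* the op went async */
        printf("status %d\n", complete_op(OP_IN_PROGRESS));
        pthread_join(&t, NULL);
        return 0;
}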

@ -37,7 +37,7 @@ struct dm_verity {
struct dm_target *ti;
struct dm_bufio_client *bufio;
char *alg_name;
struct crypto_shash *tfm;
struct crypto_ahash *tfm;
u8 *root_digest; /* digest of the root block */
u8 *salt; /* salt: its size is salt_size */
u8 *zero_digest; /* digest for a zero block */
@ -52,7 +52,7 @@ struct dm_verity {
unsigned char levels; /* the number of tree levels */
unsigned char version;
unsigned digest_size; /* digest size for the current hash algorithm */
unsigned shash_descsize;/* the size of temporary space for crypto */
unsigned int ahash_reqsize;/* the size of temporary space for crypto */
int hash_failed; /* set to 1 if hash of any block failed */
enum verity_mode mode; /* mode for handling verification errors */
unsigned corrupted_errs;/* Number of errors for corrupted blocks */
@ -81,31 +81,36 @@ struct dm_verity_io {
/*
* Three variably-sized fields follow this struct:
*
* u8 hash_desc[v->shash_descsize];
* u8 hash_req[v->ahash_reqsize];
* u8 real_digest[v->digest_size];
* u8 want_digest[v->digest_size];
*
* To access them use: verity_io_hash_desc(), verity_io_real_digest()
* To access them use: verity_io_hash_req(), verity_io_real_digest()
* and verity_io_want_digest().
*/
};
static inline struct shash_desc *verity_io_hash_desc(struct dm_verity *v,
struct verity_result {
struct completion completion;
int err;
};
static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v,
struct dm_verity_io *io)
{
return (struct shash_desc *)(io + 1);
return (struct ahash_request *)(io + 1);
}
static inline u8 *verity_io_real_digest(struct dm_verity *v,
struct dm_verity_io *io)
{
return (u8 *)(io + 1) + v->shash_descsize;
return (u8 *)(io + 1) + v->ahash_reqsize;
}
static inline u8 *verity_io_want_digest(struct dm_verity *v,
struct dm_verity_io *io)
{
return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size;
}
static inline u8 *verity_io_digest_end(struct dm_verity *v,
@ -120,7 +125,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
struct dm_verity_io *io,
u8 *data, size_t len));
extern int verity_hash(struct dm_verity *v, struct shash_desc *desc,
extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
const u8 *data, size_t len, u8 *digest);
extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
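The three accessors above carve up a single per-bio scratch allocation that DM reserves on verity's behalf (see ti->per_io_data_size in verity_ctr()). As an illustration of the layout they index, with base = (u8 *)(io + 1):

/*
 * Per-bio scratch space, reserved once per table load:
 *
 *   +--------------------------------+  base
 *   | ahash_request + transform ctx  |  v->ahash_reqsize bytes
 *   +--------------------------------+  base + v->ahash_reqsize
 *   | real_digest (computed)         |  v->digest_size bytes
 *   +--------------------------------+  base + v->ahash_reqsize + v->digest_size
 *   | want_digest (expected)         |  v->digest_size bytes
 *   +--------------------------------+
 */

Embedding the request in per-bio data avoids a separate allocation on the verification hot path; the request is re-initialized for each block rather than re-allocated.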

diff --git a/drivers/md/dm.c b/drivers/md/dm.c

@@ -1104,8 +1104,18 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
 
 	__bio_clone_fast(clone, bio);
 
-	if (bio_integrity(bio)) {
-		int r = bio_integrity_clone(clone, bio, GFP_NOIO);
+	if (unlikely(bio_integrity(bio) != NULL)) {
+		int r;
+
+		if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
+			     !dm_target_passes_integrity(tio->ti->type))) {
+			DMWARN("%s: the target %s doesn't support integrity data.",
+				dm_device_name(tio->io->md),
+				tio->ti->type->name);
+			return -EIO;
+		}
+
+		r = bio_integrity_clone(clone, bio, GFP_NOIO);
 		if (r < 0)
 			return r;
 	}
@@ -1113,7 +1123,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
 	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
 	clone->bi_iter.bi_size = to_bytes(len);
 
-	if (bio_integrity(bio))
+	if (unlikely(bio_integrity(bio) != NULL))
 		bio_integrity_trim(clone, 0, len);
 
 	return 0;
@@ -1715,6 +1725,8 @@ static void event_callback(void *context)
  */
 static void __set_size(struct mapped_device *md, sector_t size)
 {
+	lockdep_assert_held(&md->suspend_lock);
+
 	set_capacity(md->disk, size);
 
 	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
@@ -1822,13 +1834,13 @@ void dm_unlock_md_type(struct mapped_device *md)
 	mutex_unlock(&md->type_lock);
 }
 
-void dm_set_md_type(struct mapped_device *md, unsigned type)
+void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
 {
 	BUG_ON(!mutex_is_locked(&md->type_lock));
 	md->type = type;
 }
 
-unsigned dm_get_md_type(struct mapped_device *md)
+enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
 {
 	return md->type;
 }
@@ -1855,7 +1867,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 {
 	int r;
-	unsigned type = dm_get_md_type(md);
+	enum dm_queue_mode type = dm_get_md_type(md);
 
 	switch (type) {
 	case DM_TYPE_REQUEST_BASED:
@@ -1886,6 +1898,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 		if (type == DM_TYPE_DAX_BIO_BASED)
 			queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
 		break;
+	case DM_TYPE_NONE:
+		WARN_ON_ONCE(true);
+		break;
 	}
 
 	return 0;
@@ -2164,8 +2179,6 @@ static void unlock_fs(struct mapped_device *md)
  * If __dm_suspend returns 0, the device is completely quiescent
  * now. There is no request-processing activity. All new requests
  * are being added to md->deferred list.
- *
- * Caller must hold md->suspend_lock
  */
 static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 			unsigned suspend_flags, long task_state,
@@ -2183,6 +2196,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 	 */
 	if (noflush)
 		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+	else
+		pr_debug("%s: suspending with flush\n", dm_device_name(md));
 
 	/*
 	 * This gets reverted if there's an error later and the targets
@@ -2381,6 +2396,8 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
 	struct dm_table *map = NULL;
 
+	lockdep_assert_held(&md->suspend_lock);
+
 	if (md->internal_suspend_count++)
 		return; /* nested internal suspend */
 
@@ -2571,7 +2588,7 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
 					    unsigned integrity, unsigned per_io_data_size)
 {
 	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
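The new guard in clone_bio() refuses integrity-carrying bios for targets that neither generate nor pass through integrity data, instead of silently cloning metadata the target cannot honor. A target that merely forwards integrity data to the underlying device would advertise that with the new feature flag, roughly like this (the target itself is hypothetical):

static struct target_type example_target = {
	.name     = "example",
	.version  = {1, 0, 0},
	.module   = THIS_MODULE,
	/* integrity payloads are passed down, not consumed here */
	.features = DM_TARGET_PASSES_INTEGRITY,
	.ctr      = example_ctr,
	.map      = example_map,
};

The dm-integrity target itself would set DM_TARGET_INTEGRITY instead, since it implements (rather than forwards) the per-sector tags.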

diff --git a/drivers/md/dm.h b/drivers/md/dm.h

@@ -64,7 +64,7 @@ void dm_table_presuspend_undo_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
-unsigned dm_table_get_type(struct dm_table *t);
+enum dm_queue_mode dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
@@ -76,8 +76,8 @@ struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);
-void dm_set_md_type(struct mapped_device *md, unsigned type);
-unsigned dm_get_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type);
+enum dm_queue_mode dm_get_md_type(struct mapped_device *md);
 struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
 
 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
@@ -204,7 +204,7 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
 					    unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
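Turning the DM_TYPE_* constants into enum dm_queue_mode (see include/linux/device-mapper.h below) is what gives these prototypes their value: switches over the type can now be checked for exhaustiveness by the compiler. A small illustration, not taken from the diff:

static const char *queue_mode_name(enum dm_queue_mode type)
{
	switch (type) {
	case DM_TYPE_NONE:		return "none";
	case DM_TYPE_BIO_BASED:		return "bio-based";
	case DM_TYPE_REQUEST_BASED:	return "request-based";
	case DM_TYPE_MQ_REQUEST_BASED:	return "mq-request-based";
	case DM_TYPE_DAX_BIO_BASED:	return "dax-bio-based";
	/* no default: forgetting a case now draws a -Wswitch warning */
	}
	return "unknown";
}

That warning is also why dm_setup_md_queue() above grew an explicit DM_TYPE_NONE case rather than a default.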

diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c

@@ -378,7 +378,6 @@ struct dm_block_manager {
 struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
 						 unsigned block_size,
-						 unsigned cache_size,
 						 unsigned max_held_per_thread)
 {
 	int r;

diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h

@@ -33,7 +33,7 @@ void *dm_block_data(struct dm_block *b);
 struct dm_block_manager;
 struct dm_block_manager *dm_block_manager_create(
 	struct block_device *bdev, unsigned block_size,
-	unsigned cache_size, unsigned max_held_per_thread);
+	unsigned max_held_per_thread);
 void dm_block_manager_destroy(struct dm_block_manager *bm);
 
 unsigned dm_bm_block_size(struct dm_block_manager *bm);

diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c

@@ -902,8 +902,12 @@ static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
 		else
 			*result_key = le64_to_cpu(ro_node(s)->keys[0]);
 
-		if (next_block || flags & INTERNAL_NODE)
-			block = value64(ro_node(s), i);
+		if (next_block || flags & INTERNAL_NODE) {
+			if (find_highest)
+				block = value64(ro_node(s), i);
+			else
+				block = value64(ro_node(s), 0);
+		}
 
 	} while (flags & INTERNAL_NODE);
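The fix matters because find_key() walks the spine with i pointing at a node's last entry, which is only the right child to follow when chasing the highest key. Illustratively:

/*
 * Internal node visited by find_key():
 *
 *   keys:   [ 10 | 50 | 90 ]      i == nr_entries - 1 == 2
 *            /      |      \
 *       child 0  child 1  child 2
 *
 * The lowest key lives under child 0, the highest under child i.
 * Before this change both searches descended through child i, so
 * dm_btree_find_lowest_key() always followed the right-most path.
 */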

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c

@@ -54,16 +54,6 @@
  */
 #define R5L_POOL_SIZE	4
 
-/*
- * r5c journal modes of the array: write-back or write-through.
- * write-through mode has identical behavior as existing log only
- * implementation.
- */
-enum r5c_journal_mode {
-	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
-	R5C_JOURNAL_MODE_WRITE_BACK = 1,
-};
-
 static char *r5c_journal_mode_str[] = {"write-through",
 				       "write-back"};
 /*
@@ -2526,40 +2516,56 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
 	return ret;
 }
 
-static ssize_t r5c_journal_mode_store(struct mddev *mddev,
-				      const char *page, size_t length)
+/*
+ * Set journal cache mode on @mddev (external API initially needed by dm-raid).
+ *
+ * @mode as defined in 'enum r5c_journal_mode'.
+ *
+ */
+int r5c_journal_mode_set(struct mddev *mddev, int mode)
 {
 	struct r5conf *conf = mddev->private;
 	struct r5l_log *log = conf->log;
-	int val = -1, i;
-	int len = length;
 
 	if (!log)
 		return -ENODEV;
 
-	if (len && page[len - 1] == '\n')
-		len -= 1;
-	for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
-		if (strlen(r5c_journal_mode_str[i]) == len &&
-		    strncmp(page, r5c_journal_mode_str[i], len) == 0) {
-			val = i;
-			break;
-		}
-	if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
-	    val > R5C_JOURNAL_MODE_WRITE_BACK)
+	if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
+	    mode > R5C_JOURNAL_MODE_WRITE_BACK)
 		return -EINVAL;
 
 	if (raid5_calc_degraded(conf) > 0 &&
-	    val == R5C_JOURNAL_MODE_WRITE_BACK)
+	    mode == R5C_JOURNAL_MODE_WRITE_BACK)
 		return -EINVAL;
 
 	mddev_suspend(mddev);
-	conf->log->r5c_journal_mode = val;
+	conf->log->r5c_journal_mode = mode;
 	mddev_resume(mddev);
 
 	pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
-		 mdname(mddev), val, r5c_journal_mode_str[val]);
-	return length;
+		 mdname(mddev), mode, r5c_journal_mode_str[mode]);
+	return 0;
+}
+EXPORT_SYMBOL(r5c_journal_mode_set);
+
+static ssize_t r5c_journal_mode_store(struct mddev *mddev,
+				      const char *page, size_t length)
+{
+	int mode = ARRAY_SIZE(r5c_journal_mode_str);
+	size_t len = length;
+
+	if (len < 2)
+		return -EINVAL;
+
+	if (page[len - 1] == '\n')
+		len--;
+
+	while (mode--)
+		if (strlen(r5c_journal_mode_str[mode]) == len &&
+		    !strncmp(page, r5c_journal_mode_str[mode], len))
+			break;
+
+	return r5c_journal_mode_set(mddev, mode) ?: length;
+}
 
 struct md_sysfs_entry
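Factoring the mode switch out of the sysfs handler gives in-kernel users (dm-raid is the first) a direct call. Sketched usage, with the surrounding function hypothetical:

/* e.g. from dm-raid when a table asks for write-back journaling */
static int example_enable_writeback(struct mddev *mddev)
{
	int r = r5c_journal_mode_set(mddev, R5C_JOURNAL_MODE_WRITE_BACK);

	if (r)	/* -ENODEV without a journal, -EINVAL for a degraded array */
		return r;
	return 0;
}

The sysfs path keeps its old semantics: on success the GCC '?:' extension returns the full length, and a string matching neither mode leaves mode at -1 after the loop, which r5c_journal_mode_set() rejects with -EINVAL.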

diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h

@@ -510,6 +510,16 @@ struct r5worker_group {
 	int stripes_cnt;
 };
 
+/*
+ * r5c journal modes of the array: write-back or write-through.
+ * write-through mode has identical behavior as existing log only
+ * implementation.
+ */
+enum r5c_journal_mode {
+	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
+	R5C_JOURNAL_MODE_WRITE_BACK = 1,
+};
+
 enum r5_cache_state {
 	R5_INACTIVE_BLOCKED,	/* release of inactive stripes blocked,
 				 * waiting for 25% to be free
@@ -741,4 +751,5 @@ extern struct stripe_head *
 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			int previous, int noblock, int noquiesce);
 extern int raid5_calc_degraded(struct r5conf *conf);
+extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
 #endif

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h

@@ -22,11 +22,13 @@ struct bio_vec;
 /*
  * Type of table, mapped_device's mempool and request_queue
  */
-#define DM_TYPE_NONE			0
-#define DM_TYPE_BIO_BASED		1
-#define DM_TYPE_REQUEST_BASED		2
-#define DM_TYPE_MQ_REQUEST_BASED	3
-#define DM_TYPE_DAX_BIO_BASED		4
+enum dm_queue_mode {
+	DM_TYPE_NONE		 = 0,
+	DM_TYPE_BIO_BASED	 = 1,
+	DM_TYPE_REQUEST_BASED	 = 2,
+	DM_TYPE_MQ_REQUEST_BASED = 3,
+	DM_TYPE_DAX_BIO_BASED	 = 4,
+};
 
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 
@@ -221,6 +223,18 @@ struct target_type {
  */
 typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio);
 
+/*
+ * A target implements own bio data integrity.
+ */
+#define DM_TARGET_INTEGRITY		0x00000010
+#define dm_target_has_integrity(type)	((type)->features & DM_TARGET_INTEGRITY)
+
+/*
+ * A target passes integrity data to the lower device.
+ */
+#define DM_TARGET_PASSES_INTEGRITY	0x00000020
+#define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY)
+
 struct dm_target {
 	struct dm_table *table;
 	struct target_type *type;
@@ -465,7 +479,7 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb);
  * Useful for "hybrid" target (supports both bio-based
  * and request-based).
  */
-void dm_table_set_type(struct dm_table *t, unsigned type);
+void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type);
 
 /*
  * Finally call this to make the table ready for use.