From c217649bf2d60ac119afd71d938278cffd55962b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 13 Jan 2011 19:53:46 +0000 Subject: [PATCH 01/32] dm: don't take i_mutex to change device size No longer needlessly hold md->bdev->bd_inode->i_mutex when changing the size of a DM device. This additional locking is unnecessary because i_size_write() is already protected by the existing critical section in dm_swap_table(). DM already has a reference on md->bdev so the associated bd_inode may be changed without lifetime concerns. A negative side-effect of having held md->bdev->bd_inode->i_mutex was that a concurrent DM device resize and flush (via fsync) would deadlock. Dropping md->bdev->bd_inode->i_mutex eliminates this potential for deadlock. The following reproducer no longer deadlocks: https://www.redhat.com/archives/dm-devel/2009-July/msg00284.html Signed-off-by: Mike Snitzer Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon Cc: stable@kernel.org --- drivers/md/dm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f48a2f359ac4..4ef066a3090f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1992,13 +1992,14 @@ static void event_callback(void *context) wake_up(&md->eventq); } +/* + * Protected by md->suspend_lock obtained by dm_swap_table(). + */ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - mutex_lock(&md->bdev->bd_inode->i_mutex); i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - mutex_unlock(&md->bdev->bd_inode->i_mutex); } /*
From 09c9d4c9b6a2b5909ae3c6265e4cd3820b636863 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 13 Jan 2011 19:59:46 +0000 Subject: [PATCH 02/32] dm mpath: disable blk_abort_queue Revert commit 224cb3e981f1b2f9f93dbd49eaef505d17d894c2 dm: Call blk_abort_queue on failed paths Multipath began to use blk_abort_queue() to allow for lower-latency path deactivation. This was found to cause list corruption: a command would get blk_abort_queue()/timeout handling run on it, and the SCSI error handler was somehow able to complete it and call scsi_queue_insert() while scsi_request_fn() was still trying to process the request.
https://www.redhat.com/archives/dm-devel/2010-November/msg00085.html Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon Cc: Mike Anderson Cc: Mike Christie Cc: stable@kernel.org --- drivers/md/dm-mpath.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 487ecda90ad4..406091f9692b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -33,7 +33,6 @@ struct pgpath { unsigned fail_count; /* Cumulative failure count */ struct dm_path path; - struct work_struct deactivate_path; struct work_struct activate_path; }; @@ -116,7 +115,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); -static void deactivate_path(struct work_struct *work); /*----------------------------------------------- @@ -129,7 +127,6 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; - INIT_WORK(&pgpath->deactivate_path, deactivate_path); INIT_WORK(&pgpath->activate_path, activate_path); } @@ -141,14 +138,6 @@ static void free_pgpath(struct pgpath *pgpath) kfree(pgpath); } -static void deactivate_path(struct work_struct *work) -{ - struct pgpath *pgpath = - container_of(work, struct pgpath, deactivate_path); - - blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue); -} - static struct priority_group *alloc_priority_group(void) { struct priority_group *pg; @@ -995,7 +984,6 @@ static int fail_path(struct pgpath *pgpath) pgpath->path.dev->name, m->nr_valid_paths); schedule_work(&m->trigger_event); - queue_work(kmultipathd, &pgpath->deactivate_path); out: spin_unlock_irqrestore(&m->lock, flags); From d9bf0b508ddfe19883b982b29a03c02ccbf53806 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 13 Jan 2011 19:59:47 +0000 Subject: [PATCH 03/32] dm io: remove BIO_RW_SYNCIO flag from kcopyd Remove the REQ_SYNC flag to improve write throughput when writing to the origin with a snapshot on the same device (using the CFQ I/O scheduler). Sequential write throughput (chunksize of 4k, 32k, 512k) unpatched: 8.5, 8.6, 9.3 MB/s patched: 15.2, 18.5, 17.5 MB/s Snapshot exception reallocations are triggered by writes that are usually async, so mark the associated dm_io_request as async as well. This helps when using the CFQ I/O scheduler because it has separate queues for sync and async I/O. Async is optimized for throughput; sync for latency. With this change we're consciously favoring throughput over latency. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index d8587bac5682..5ad9231c8700 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -345,7 +345,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG, + .bi_rw = job->rw | REQ_UNPLUG, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, From 84c89557a302e18414a011cc52b1abd034860743 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Thu, 13 Jan 2011 19:59:47 +0000 Subject: [PATCH 04/32] dm ioctl: allow rename to fill empty uuid Allow the uuid of a mapped device to be set after device creation. Previously the uuid (which is optional) could only be set by DM_DEV_CREATE. 
If no uuid was supplied it could not be set later. Sometimes it's necessary to create the device before the uuid is known, and in such cases the uuid must be filled in after the creation. This patch extends DM_DEV_RENAME to accept a uuid accompanied by a new flag DM_UUID_FLAG. This can only be done once and if no uuid was previously supplied. It cannot be used to change an existing uuid. DM_VERSION_MINOR is also bumped to 19 to indicate this interface extension is available. Signed-off-by: Peter Jones Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 103 +++++++++++++++++++++++++++++---------- include/linux/dm-ioctl.h | 12 +++-- 2 files changed, 87 insertions(+), 28 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 4b54618b4159..f3481d922206 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -295,19 +295,55 @@ retry: DMWARN("remove_all left %d open device(s)", dev_skipped); } +/* + * Set the uuid of a hash_cell that isn't already set. + */ +static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) +{ + mutex_lock(&dm_hash_cells_mutex); + hc->uuid = new_uuid; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); +} + +/* + * Changes the name of a hash_cell and returns the old name for + * the caller to free. + */ +static char *__change_cell_name(struct hash_cell *hc, char *new_name) +{ + char *old_name; + + /* + * Rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + + mutex_lock(&dm_hash_cells_mutex); + hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + return old_name; +} + static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, const char *new) { - char *new_name, *old_name; + char *new_data, *old_name = NULL; struct hash_cell *hc; struct dm_table *table; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; /* * duplicate new. */ - new_name = kstrdup(new, GFP_KERNEL); - if (!new_name) + new_data = kstrdup(new, GFP_KERNEL); + if (!new_data) return ERR_PTR(-ENOMEM); down_write(&_hash_lock); @@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, /* * Is new free ? */ - hc = __get_name_cell(new); + if (change_uuid) + hc = __get_uuid_cell(new); + else + hc = __get_name_cell(new); + if (hc) { - DMWARN("asked to rename to an already-existing name %s -> %s", + DMWARN("Unable to change %s on mapped device %s to one that " + "already exists: %s", + change_uuid ? "uuid" : "name", param->name, new); dm_put(hc->md); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-EBUSY); } @@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, */ hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non-existent device %s -> %s", - param->name, new); + DMWARN("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? "uuid " : "", new); up_write(&_hash_lock); - kfree(new_name); + kfree(new_data); return ERR_PTR(-ENXIO); } /* - * rename and move the name cell. + * Does this device already have a uuid? 
*/ - list_del(&hc->name_list); - old_name = hc->name; - mutex_lock(&dm_hash_cells_mutex); - hc->name = new_name; - mutex_unlock(&dm_hash_cells_mutex); - list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + if (change_uuid && hc->uuid) { + DMWARN("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EINVAL); + } + + if (change_uuid) + __set_cell_uuid(hc, new_data); + else + old_name = __change_cell_name(hc, new_data); /* * Wake up any dm event waiters. @@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end) static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; - char *new_name = (char *) param + param->data_start; + char *new_data = (char *) param + param->data_start; struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; - if (new_name < param->data || - invalid_str(new_name, (void *) param + param_size) || - strlen(new_name) > DM_NAME_LEN - 1) { - DMWARN("Invalid new logical volume name supplied."); + if (new_data < param->data || + invalid_str(new_data, (void *) param + param_size) || + strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { + DMWARN("Invalid new mapped device name or uuid string supplied."); return -EINVAL; } - r = check_name(new_name); - if (r) - return r; + if (!change_uuid) { + r = check_name(new_data); + if (r) + return r; + } - md = dm_hash_rename(param, new_name); + md = dm_hash_rename(param, new_data); if (IS_ERR(md)) return PTR_ERR(md); diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 49eab360d5d4..e453e1174ae1 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -44,7 +44,7 @@ * Remove a device, destroy any tables. * * DM_DEV_RENAME: - * Rename a device. + * Rename a device or set its uuid if none was previously supplied. * * DM_SUSPEND: * This performs both suspend and resume, depending which flag is @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 18 +#define DM_VERSION_MINOR 19 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-06-29)" +#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -322,4 +322,10 @@ enum { */ #define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */ +/* + * If set, rename changes the uuid not the name. Only permitted + * if no uuid was previously supplied: an existing uuid cannot be changed. + */ +#define DM_UUID_FLAG (1 << 14) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ From 5fc2ffeabb9ee0fc0e71ff16b49f34f0ed3d05b4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 13 Jan 2011 19:59:48 +0000 Subject: [PATCH 05/32] dm raid1: support discard Enable discard support in the DM mirror target. Also change an existing use of 'bvec' to 'addr' in the union. 
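For testing, the new discard path can be exercised from userspace with the BLKDISCARD ioctl; a minimal sketch, not part of this patch (the device path is an example):

	/* Send a 1 MiB discard to a dm device. */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/fs.h>		/* BLKDISCARD */

	int main(void)
	{
		uint64_t range[2] = { 0, 1024 * 1024 };	/* start, length in bytes */
		int fd = open("/dev/mapper/mirrorvol", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, BLKDISCARD, &range) < 0) {
			perror("BLKDISCARD");
			return 1;
		}
		close(fd);
		return 0;
	}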
Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 19a59b041c27..c87756eb7a21 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti) struct dm_io_request io_req = { .bi_rw = WRITE_FLUSH, .mem.type = DM_IO_KMEM, - .mem.ptr.bvec = NULL, + .mem.ptr.addr = NULL, .client = ms->io_client, }; @@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio) .client = ms->io_client, }; + if (bio->bi_rw & REQ_DISCARD) { + io_req.bi_rw |= REQ_DISCARD; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = NULL; + } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) map_region(dest++, m, bio); @@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - if (bio->bi_rw & REQ_FLUSH) { + if ((bio->bi_rw & REQ_FLUSH) || + (bio->bi_rw & REQ_DISCARD)) { bio_list_add(&sync, bio); continue; } @@ -1076,6 +1083,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = ms; ti->split_io = dm_rh_get_region_size(ms->rh); ti->num_flush_requests = 1; + ti->num_discard_requests = 1; ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); if (!ms->kmirrord_wq) { From 4a1aeb98297e17f4e0a8cdda919e63bf528b2e5d Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:48 +0000 Subject: [PATCH 06/32] dm: remove dm_mutex after bkl conversion This patch replaces dm_mutex with _minor_lock in dm_blk_close() and then removes it. During the BKL conversion, commit 6e9624b8caec290d28b4c6d9ec75749df6372b87 (block: push down BKL into .open and .release) pushed lock_kernel() down into dm_blk_open/close calls. Commit 2a48fc0ab24241755dc93bfd4f01d68efab47f5a (block: autoconvert trivial BKL users to private mutex) converted it to a local mutex, but _minor_lock is sufficient. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4ef066a3090f..0de692176ad4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -32,7 +32,6 @@ #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" #define DM_COOKIE_LENGTH 24 -static DEFINE_MUTEX(dm_mutex); static const char *_name = DM_NAME; static unsigned int major = 0; @@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; - mutex_lock(&dm_mutex); spin_lock(&_minor_lock); md = bdev->bd_disk->private_data; @@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode) out: spin_unlock(&_minor_lock); - mutex_unlock(&dm_mutex); return md ? 0 : -ENXIO; } @@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; - mutex_lock(&dm_mutex); + spin_lock(&_minor_lock); + atomic_dec(&md->open_count); dm_put(md); - mutex_unlock(&dm_mutex); + + spin_unlock(&_minor_lock); return 0; } From 69a8cfcda21017364df1c21b720daf304b5598a6 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:49 +0000 Subject: [PATCH 07/32] dm crypt: set key size early Simplify key size verification (hexadecimal string) and set key size early in constructor. (Patch required by later changes.) 
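For reference, the rules the constructor now enforces can be expressed as a small userspace sketch (illustrative only): the key argument is a hex string, so its size in bytes is strlen()/2, and a lone "-" denotes an empty key.

	#include <ctype.h>
	#include <string.h>

	/* Returns the key size in bytes, 0 for "-" (no key), or -1 if invalid. */
	static int key_size_from_hex_arg(const char *key)
	{
		size_t i, len = strlen(key);

		if (!strcmp(key, "-"))
			return 0;
		if (!len || (len & 1))	/* need an even number of hex digits */
			return -1;
		for (i = 0; i < len; i++)
			if (!isxdigit((unsigned char)key[i]))
				return -1;
		return (int)(len / 2);
	}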
Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d5b0e4c0e702..4c4408a2602f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -973,15 +973,15 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) static int crypt_set_key(struct crypt_config *cc, char *key) { - unsigned key_size = strlen(key) >> 1; - - if (cc->key_size && cc->key_size != key_size) + /* The key size may not be changed. */ + if (cc->key_size != (strlen(key) >> 1)) return -EINVAL; - cc->key_size = key_size; /* initial settings */ + /* Hyphen (which gives a key_size of zero) means there is no key. */ + if (!cc->key_size && strcmp(key, "-")) + return -EINVAL; - if ((!key_size && strcmp(key, "-")) || - (key_size && crypt_decode_key(cc->key, key, key_size) < 0)) + if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) return -EINVAL; set_bit(DM_CRYPT_KEY_VALID, &cc->flags); @@ -1194,6 +1194,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate encryption context"; return -ENOMEM; } + cc->key_size = key_size; ti->private = cc; ret = crypt_ctr_cipher(ti, argv[0], argv[1]); From 4a038677df4da84e42fd68b5ab2dfa4d82baa444 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:49 +0000 Subject: [PATCH 08/32] dm log userspace: trap all failed log construction errors When constructing a mirror log, it is possible for the initial request to fail for other reasons besides -ESRCH. These must be handled too. Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 1ed0094f064b..1c25ad3d02a2 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -181,8 +181,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, ctr_str, str_size, NULL, NULL); - if (r == -ESRCH) { - DMERR("Userspace log server not found"); + if (r < 0) { + if (r == -ESRCH) + DMERR("Userspace log server not found"); + else + DMERR("Userspace log server failed to create log"); goto out; } @@ -214,10 +217,9 @@ out: static void userspace_dtr(struct dm_dirty_log *log) { - int r; struct log_c *lc = log->context; - r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, + (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, NULL, 0, NULL, NULL); From 8d35d3e37eed884ba15229a146df846f399909b4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 13 Jan 2011 19:59:50 +0000 Subject: [PATCH 09/32] dm kcopyd: delay unplugging Make kcopyd merge more I/O requests by using device unplugging. Without this patch, each I/O request is dispatched separately to the device. If the device supports tagged queuing, there are many small requests sent to the device. To improve performance, this patch will batch as many requests as possible, allowing the queue to merge consecutive requests, and send them to the device at once. 
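The mechanism reduces to the following kernel-style sketch (names are illustrative; the real code is in the diff below): remember the last device each direction of I/O was sent to, and unplug it only when the device changes or the batch ends.

	/* One pending device per direction, indexed by READ/WRITE. */
	struct unplug_state {
		struct block_device *pending[2];
	};

	static void kick(struct unplug_state *s, int rw)
	{
		if (s->pending[rw]) {
			blk_unplug(bdev_get_queue(s->pending[rw]));
			s->pending[rw] = NULL;
		}
	}

	static void note_io(struct unplug_state *s, int rw,
			    struct block_device *bdev)
	{
		if (s->pending[rw] == bdev)
			return;		/* same device: keep batching */
		kick(s, rw);		/* device changed: flush the old one */
		s->pending[rw] = bdev;
	}

	/* ...and at the end of each work-queue pass:
	 *	kick(s, READ); kick(s, WRITE);
	 */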
In my tests (15k SCSI disk), this patch improves sequential write throughput: Sequential write throughput (chunksize of 4k, 32k, 512k) unpatched: 15.2, 18.5, 17.5 MB/s patched: 14.4, 22.6, 23.0 MB/s In most common uses (snapshot or two-way mirror), kcopyd is only used for two devices, one for reading and the other for writing, thus this optimization is implemented only for two devices. The optimization may be extended to n-way mirrors with some code complexity increase. We keep track of two block devices to unplug (one for read and the other for write) and unplug them when exiting "do_work" thread. If there are more devices used (in theory it could happen, in practice it is rare), we unplug immediately. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 54 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 5ad9231c8700..dad32f8bce7d 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -37,6 +37,13 @@ struct dm_kcopyd_client { unsigned int nr_pages; unsigned int nr_free_pages; + /* + * Block devices to unplug. + * Non-NULL pointer means that a block device has some pending requests + * and needs to be unplugged. + */ + struct block_device *unplug[2]; + struct dm_io_client *io_client; wait_queue_head_t destroyq; @@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job) return 0; } +/* + * Unplug the block device at the specified index. + */ +static void unplug(struct dm_kcopyd_client *kc, int rw) +{ + if (kc->unplug[rw] != NULL) { + blk_unplug(bdev_get_queue(kc->unplug[rw])); + kc->unplug[rw] = NULL; + } +} + +/* + * Prepare block device unplug. If there's another device + * to be unplugged at the same array index, we unplug that + * device first. + */ +static void prepare_unplug(struct dm_kcopyd_client *kc, int rw, + struct block_device *bdev) +{ + if (likely(kc->unplug[rw] == bdev)) + return; + unplug(kc, rw); + kc->unplug[rw] = bdev; +} + static void complete_io(unsigned long error, void *context) { struct kcopyd_job *job = (struct kcopyd_job *) context; @@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | REQ_UNPLUG, + .bi_rw = job->rw, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, .mem.offset = job->offset, @@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job) .client = job->kc->io_client, }; - if (job->rw == READ) + if (job->rw == READ) { r = dm_io(&io_req, 1, &job->source, NULL); - else + prepare_unplug(job->kc, READ, job->source.bdev); + } else { + if (job->num_dests > 1) + io_req.bi_rw |= REQ_UNPLUG; r = dm_io(&io_req, job->num_dests, job->dests, NULL); + if (!(io_req.bi_rw & REQ_UNPLUG)) + prepare_unplug(job->kc, WRITE, job->dests[0].bdev); + } return r; } @@ -435,10 +473,18 @@ static void do_work(struct work_struct *work) * Pages jobs when successful will jump onto the io jobs * list. io jobs call wake when they complete and it all * starts again. + * + * Note that io_jobs add block devices to the unplug array, + * this array is cleared with "unplug" calls. It is thus + * forbidden to run complete_jobs after io_jobs and before + * unplug because the block device could be destroyed in + * job completion callback. 
*/ process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->pages_jobs, kc, run_pages_job); process_jobs(&kc->io_jobs, kc, run_io_job); + unplug(kc, READ); + unplug(kc, WRITE); } /* @@ -619,6 +665,8 @@ int dm_kcopyd_client_create(unsigned int nr_pages, INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->pages_jobs); + memset(kc->unplug, 0, sizeof(kc->unplug)); + kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); if (!kc->job_pool) goto bad_slab; From 909cc4fb48dd9870f6ebe4bd32cfbe37c102df62 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:50 +0000 Subject: [PATCH 10/32] dm log userspace: split flush queue Split the 'flush_list', which contained a mix of both 'mark' and 'clear' requests, into two distinct lists ('mark_list' and 'clear_list'). The device mapper log implementations (used by various DM targets) are allowed to cache 'mark' and 'clear' requests until a 'flush' is received. Until now, these cached requests were kept in the same list. They will now be put into distinct lists to facilitate group processing of these requests (in the next patch). Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 41 +++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 1c25ad3d02a2..767adf300fa5 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -37,8 +37,15 @@ struct log_c { */ uint64_t in_sync_hint; + /* + * Mark and clear requests are held until a flush is issued + * so that we can group, and thereby limit, the amount of + * network traffic between kernel and userspace. The 'flush_lock' + * is used to protect these lists. + */ spinlock_t flush_lock; - struct list_head flush_list; /* only for clear and mark requests */ + struct list_head mark_list; + struct list_head clear_list; }; static mempool_t *flush_entry_pool; @@ -169,7 +176,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, strncpy(lc->uuid, argv[0], DM_UUID_LEN); spin_lock_init(&lc->flush_lock); - INIT_LIST_HEAD(&lc->flush_list); + INIT_LIST_HEAD(&lc->mark_list); + INIT_LIST_HEAD(&lc->clear_list); str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); if (str_size < 0) { @@ -362,14 +370,16 @@ static int userspace_flush(struct dm_dirty_log *log) int r = 0; unsigned long flags; struct log_c *lc = log->context; - LIST_HEAD(flush_list); + LIST_HEAD(mark_list); + LIST_HEAD(clear_list); struct flush_entry *fe, *tmp_fe; spin_lock_irqsave(&lc->flush_lock, flags); - list_splice_init(&lc->flush_list, &flush_list); + list_splice_init(&lc->mark_list, &mark_list); + list_splice_init(&lc->clear_list, &clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); - if (list_empty(&flush_list)) + if (list_empty(&mark_list) && list_empty(&clear_list)) return 0; /* @@ -379,7 +389,16 @@ static int userspace_flush(struct dm_dirty_log *log) * do it one by one. 
*/ - list_for_each_entry(fe, &flush_list, list) { + list_for_each_entry(fe, &mark_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + goto fail; + } + + list_for_each_entry(fe, &clear_list, list) { r = userspace_do_request(lc, lc->uuid, fe->type, (char *)&fe->region, sizeof(fe->region), @@ -397,7 +416,11 @@ fail: * Calling code will receive an error and will know that * the log facility has failed. */ - list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { + list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { list_del(&fe->list); mempool_free(fe, flush_entry_pool); } @@ -427,7 +450,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_MARK_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->mark_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; @@ -464,7 +487,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) spin_lock_irqsave(&lc->flush_lock, flags); fe->type = DM_ULOG_CLEAR_REGION; fe->region = region; - list_add(&fe->list, &lc->flush_list); + list_add(&fe->list, &lc->clear_list); spin_unlock_irqrestore(&lc->flush_lock, flags); return; From 085ae0651b2791f3a430ddb76da92925b9952e13 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:51 +0000 Subject: [PATCH 11/32] dm log userspace: group clear and mark requests Allow the device-mapper log's 'mark' and 'clear' requests to be grouped and processed in a batch. This can significantly reduce the amount of traffic going between the kernel and userspace (where the processing daemon resides). Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 102 ++++++++++++++++++++++------- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 767adf300fa5..31e1687e7bf6 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -18,6 +18,14 @@ struct flush_entry { struct list_head list; }; +/* + * This limit on the number of mark and clear request is, to a degree, + * arbitrary. However, there is some basis for the choice in the limits + * imposed on the size of data payload by dm-log-userspace-transfer.c: + * dm_consult_userspace(). + */ +#define MAX_FLUSH_GROUP_COUNT 32 + struct log_c { struct dm_target *ti; uint32_t region_size; @@ -348,6 +356,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region, return (r) ? 
0 : (int)in_sync; } +static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + struct flush_entry *fe; + + list_for_each_entry(fe, flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + break; + } + + return r; +} + +static int flush_by_group(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + int count; + uint32_t type = 0; + struct flush_entry *fe, *tmp_fe; + LIST_HEAD(tmp_list); + uint64_t group[MAX_FLUSH_GROUP_COUNT]; + + /* + * Group process the requests + */ + while (!list_empty(flush_list)) { + count = 0; + + list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { + group[count] = fe->region; + count++; + + list_del(&fe->list); + list_add(&fe->list, &tmp_list); + + type = fe->type; + if (count >= MAX_FLUSH_GROUP_COUNT) + break; + } + + r = userspace_do_request(lc, lc->uuid, type, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + if (r) { + /* Group send failed. Attempt one-by-one. */ + list_splice_init(&tmp_list, flush_list); + r = flush_one_by_one(lc, flush_list); + break; + } + } + + /* + * Must collect flush_entrys that were successfully processed + * as a group so that they will be free'd by the caller. + */ + list_splice_init(&tmp_list, flush_list); + + return r; +} + /* * userspace_flush * @@ -382,30 +455,13 @@ static int userspace_flush(struct dm_dirty_log *log) if (list_empty(&mark_list) && list_empty(&clear_list)) return 0; - /* - * FIXME: Count up requests, group request types, - * allocate memory to stick all requests in and - * send to server in one go. Failing the allocation, - * do it one by one. - */ + r = flush_by_group(lc, &mark_list); + if (r) + goto fail; - list_for_each_entry(fe, &mark_list, list) { - r = userspace_do_request(lc, lc->uuid, fe->type, - (char *)&fe->region, - sizeof(fe->region), - NULL, NULL); - if (r) - goto fail; - } - - list_for_each_entry(fe, &clear_list, list) { - r = userspace_do_request(lc, lc->uuid, fe->type, - (char *)&fe->region, - sizeof(fe->region), - NULL, NULL); - if (r) - goto fail; - } + r = flush_by_group(lc, &clear_list); + if (r) + goto fail; r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); From 86a54a4802df10d23ccd655e2083e812fe990243 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 13 Jan 2011 19:59:52 +0000 Subject: [PATCH 12/32] dm log userspace: add version number to comms This patch adds a 'version' field to the 'dm_ulog_request' structure. The 'version' field is taken from a portion of the unused 'padding' field in the 'dm_ulog_request' structure. This was done to avoid changing the size of the structure and possibly disrupting backwards compatibility. The version number will help notify user-space daemons when a change has been made to the kernel/userspace log API. 
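On the daemon side this allows an explicit compatibility check; a minimal userspace sketch (the helper name is illustrative):

	#include <errno.h>
	#include <stdint.h>

	#define DM_ULOG_REQUEST_VERSION 1	/* from linux/dm-log-userspace.h */

	/* Reject requests from a kernel speaking a newer protocol. */
	static int check_request_version(uint32_t version)
	{
		return (version > DM_ULOG_REQUEST_VERSION) ? -EPROTO : 0;
	}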
Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 6 ++++-- drivers/md/dm-log-userspace-transfer.c | 1 + include/linux/dm-log-userspace.h | 13 ++++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 31e1687e7bf6..aa2e0c374ab3 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -12,6 +12,8 @@ #include "dm-log-userspace-transfer.h" +#define DM_LOG_USERSPACE_VSN "1.1.0" + struct flush_entry { int type; region_t region; @@ -765,7 +767,7 @@ static int __init userspace_dirty_log_init(void) return r; } - DMINFO("version 1.0.0 loaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); return 0; } @@ -775,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void) dm_ulog_tfr_exit(); mempool_destroy(flush_entry_pool); - DMINFO("version 1.0.0 unloaded"); + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); return; } diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 075cbcf8a9f5..049eaf12aaab 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -198,6 +198,7 @@ resend: memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; tfr->luid = luid; tfr->seq = dm_ulog_seq++; diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h index 0c3c3a2110c4..eeace7d3ff15 100644 --- a/include/linux/dm-log-userspace.h +++ b/include/linux/dm-log-userspace.h @@ -370,6 +370,16 @@ #define DM_ULOG_REQUEST_TYPE(request_type) \ (DM_ULOG_REQUEST_MASK & (request_type)) +/* + * DM_ULOG_REQUEST_VERSION is incremented when there is a + * change to the way information is passed between kernel + * and userspace. This could be a structure change of + * dm_ulog_request or a change in the way requests are + * issued/handled. Changes are outlined here: + * version 1: Initial implementation + */ +#define DM_ULOG_REQUEST_VERSION 1 + struct dm_ulog_request { /* * The local unique identifier (luid) and the universally unique @@ -383,8 +393,9 @@ struct dm_ulog_request { */ uint64_t luid; char uuid[DM_UUID_LEN]; - char padding[7]; /* Padding because DM_UUID_LEN = 129 */ + char padding[3]; /* Padding because DM_UUID_LEN = 129 */ + uint32_t version; /* See DM_ULOG_REQUEST_VERSION */ int32_t error; /* Used to report back processing errors */ uint32_t seq; /* Sequence number for request */ From 7dbcd137414f3877737802438926d6dba7906a9a Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:52 +0000 Subject: [PATCH 13/32] dm crypt: simplify compatible table output Rename cc->cipher_mode to cc->cipher_string and store the whole of the cipher information so it can easily be printed when processing the DM_DEV_STATUS ioctl. 
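For example (size, key and device are illustrative), a device created with the table line

	0 409600 crypt aes-cbc-essiv:sha256 <key> 0 /dev/sdb 0

now reports the cipher field back verbatim as "aes-cbc-essiv:sha256", rather than reassembling it from the parsed cc->cipher and cc->cipher_mode pieces.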
Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4c4408a2602f..9a896e1cb2ea 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -108,7 +108,7 @@ struct crypt_config { struct workqueue_struct *crypt_queue; char *cipher; - char *cipher_mode; + char *cipher_string; struct crypt_iv_operations *iv_gen_ops; union { @@ -1030,7 +1030,7 @@ static void crypt_dtr(struct dm_target *ti) dm_put_device(ti, cc->dev); kzfree(cc->cipher); - kzfree(cc->cipher_mode); + kzfree(cc->cipher_string); /* Must zero key material before freeing */ kzfree(cc); @@ -1050,6 +1050,10 @@ static int crypt_ctr_cipher(struct dm_target *ti, return -EINVAL; } + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) + goto bad_mem; + /* * Legacy dm-crypt cipher specification * cipher-mode-iv:ivopts @@ -1061,12 +1065,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (!cc->cipher) goto bad_mem; - if (tmp) { - cc->cipher_mode = kstrdup(tmp, GFP_KERNEL); - if (!cc->cipher_mode) - goto bad_mem; - } - chainmode = strsep(&tmp, "-"); ivopts = strsep(&tmp, "-"); ivmode = strsep(&ivopts, ":"); @@ -1074,10 +1072,11 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); - /* Compatibility mode for old dm-crypt mappings */ + /* + * For compatibility with the original dm-crypt mapping format, if + * only the cipher name is supplied, use cbc-plain. + */ if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { - kfree(cc->cipher_mode); - cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL); chainmode = "cbc"; ivmode = "plain"; } @@ -1307,10 +1306,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - if (cc->cipher_mode) - DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode); - else - DMEMIT("%s ", cc->cipher); + DMEMIT("%s ", cc->cipher_string); if (cc->key_size > 0) { if ((maxlen - sz) < ((cc->key_size << 1) + 1)) @@ -1422,7 +1418,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From c029772125594e31eb1a5ad9e0913724ed9891f2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 13 Jan 2011 19:59:53 +0000 Subject: [PATCH 14/32] dm crypt: scale to multiple cpus Currently dm-crypt does all the encryption work for a single dm-crypt mapping in a single workqueue. This does not scale well when multiple CPUs are submitting IO at a high rate. The single CPU running the single thread cannot keep up with the encryption and encrypted IO performance tanks. This patch changes the crypto workqueue to be per CPU. This means that as long as the IO submitter (or the interrupt target CPUs for reads) runs on different CPUs the encryption work will be also parallel. To avoid a bottleneck on the IO worker I also changed those to be per-CPU threads. There is still some shared data, so I suspect some bouncing cache lines. But I haven't done a detailed study on that yet. 
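Reduced to a sketch, the per-CPU pattern is (names illustrative, error paths omitted; needs <linux/percpu.h>):

	struct crypt_cpu_state {
		struct ablkcipher_request *req;
		struct crypto_ablkcipher *tfm;
	};

	static struct crypt_cpu_state __percpu *cpu_state;

	static int init_state(void)
	{
		cpu_state = alloc_percpu(struct crypt_cpu_state);
		return cpu_state ? 0 : -ENOMEM;
	}

	/*
	 * On the I/O path each CPU dereferences only its own instance,
	 * so the hot path needs no shared lock.
	 */
	static struct crypt_cpu_state *local_state(void)
	{
		return this_cpu_ptr(cpu_state);
	}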
Signed-off-by: Andi Kleen Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 254 ++++++++++++++++++++++++++++++++---------- 1 file changed, 196 insertions(+), 58 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9a896e1cb2ea..50ae6ef83738 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -77,7 +78,6 @@ struct crypt_iv_operations { }; struct iv_essiv_private { - struct crypto_cipher *tfm; struct crypto_hash *hash_tfm; u8 *salt; }; @@ -91,6 +91,22 @@ struct iv_benbi_private { * and encrypts / decrypts at the same time. */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; + +/* + * Duplicated per-CPU state for cipher. + */ +struct crypt_cpu { + struct ablkcipher_request *req; + struct crypto_ablkcipher *tfm; + + /* ESSIV: struct crypto_cipher *essiv_tfm */ + void *iv_private; +}; + +/* + * The fields in here must be read only after initialization, + * changing state should be in crypt_cpu. + */ struct crypt_config { struct dm_dev *dev; sector_t start; @@ -118,6 +134,12 @@ struct crypt_config { sector_t iv_offset; unsigned int iv_size; + /* + * Duplicated per cpu state. Access through + * per_cpu_ptr() only. + */ + struct crypt_cpu __percpu *cpu; + /* * Layout of each crypto request: * @@ -132,9 +154,7 @@ struct crypt_config { * correctly aligned. */ unsigned int dmreq_start; - struct ablkcipher_request *req; - struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; u8 key[0]; @@ -149,6 +169,19 @@ static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) +{ + return this_cpu_ptr(cc->cpu); +} + +/* + * Use this to access cipher attributes that are the same for each CPU. 
+ */ +static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) +{ + return __this_cpu_ptr(cc->cpu)->tfm; +} + /* * Different IV generation algorithms: * @@ -195,7 +228,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; struct hash_desc desc; struct scatterlist sg; - int err; + struct crypto_cipher *essiv_tfm; + int err, cpu; sg_init_one(&sg, cc->key, cc->key_size); desc.tfm = essiv->hash_tfm; @@ -205,8 +239,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) if (err) return err; - return crypto_cipher_setkey(essiv->tfm, essiv->salt, + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private, + + err = crypto_cipher_setkey(essiv_tfm, essiv->salt, crypto_hash_digestsize(essiv->hash_tfm)); + if (err) + return err; + } + + return 0; } /* Wipe salt and reset key derived from volume key */ @@ -214,24 +256,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc) { struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + struct crypto_cipher *essiv_tfm; + int cpu, r, err = 0; memset(essiv->salt, 0, salt_size); - return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size); + for_each_possible_cpu(cpu) { + essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private; + r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); + if (r) + err = r; + } + + return err; +} + +/* Set up per cpu cipher state */ +static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, + struct dm_target *ti, + u8 *salt, unsigned saltsize) +{ + struct crypto_cipher *essiv_tfm; + int err; + + /* Setup the essiv_tfm with the given salt */ + essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(essiv_tfm)) { + ti->error = "Error allocating crypto tfm for ESSIV"; + return essiv_tfm; + } + + if (crypto_cipher_blocksize(essiv_tfm) != + crypto_ablkcipher_ivsize(any_tfm(cc))) { + ti->error = "Block size of ESSIV cipher does " + "not match IV size of block cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(-EINVAL); + } + + err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); + if (err) { + ti->error = "Failed to set key for ESSIV cipher"; + crypto_free_cipher(essiv_tfm); + return ERR_PTR(err); + } + + return essiv_tfm; } static void crypt_iv_essiv_dtr(struct crypt_config *cc) { + int cpu; + struct crypt_cpu *cpu_cc; + struct crypto_cipher *essiv_tfm; struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; - crypto_free_cipher(essiv->tfm); - essiv->tfm = NULL; - crypto_free_hash(essiv->hash_tfm); essiv->hash_tfm = NULL; kzfree(essiv->salt); essiv->salt = NULL; + + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + essiv_tfm = cpu_cc->iv_private; + + if (essiv_tfm) + crypto_free_cipher(essiv_tfm); + + cpu_cc->iv_private = NULL; + } } static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -240,7 +334,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, struct crypto_cipher *essiv_tfm = NULL; struct crypto_hash *hash_tfm = NULL; u8 *salt = NULL; - int err; + int err, cpu; if (!opts) { ti->error = "Digest algorithm missing for ESSIV mode"; @@ -262,30 +356,22 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, goto bad; } - /* Allocate essiv_tfm */ - essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(essiv_tfm)) { - ti->error = "Error allocating crypto tfm for ESSIV"; - err = 
PTR_ERR(essiv_tfm); - goto bad; - } - if (crypto_cipher_blocksize(essiv_tfm) != - crypto_ablkcipher_ivsize(cc->tfm)) { - ti->error = "Block size of ESSIV cipher does " - "not match IV size of block cipher"; - err = -EINVAL; - goto bad; - } - cc->iv_gen_private.essiv.salt = salt; - cc->iv_gen_private.essiv.tfm = essiv_tfm; cc->iv_gen_private.essiv.hash_tfm = hash_tfm; + for_each_possible_cpu(cpu) { + essiv_tfm = setup_essiv_cpu(cc, ti, salt, + crypto_hash_digestsize(hash_tfm)); + if (IS_ERR(essiv_tfm)) { + crypt_iv_essiv_dtr(cc); + return PTR_ERR(essiv_tfm); + } + per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm; + } + return 0; bad: - if (essiv_tfm && !IS_ERR(essiv_tfm)) - crypto_free_cipher(essiv_tfm); if (hash_tfm && !IS_ERR(hash_tfm)) crypto_free_hash(hash_tfm); kfree(salt); @@ -294,16 +380,19 @@ bad: static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) { + struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; + memset(iv, 0, cc->iv_size); *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv); + crypto_cipher_encrypt_one(essiv_tfm, iv, iv); + return 0; } static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); + unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -412,7 +501,7 @@ static int crypt_convert_block(struct crypt_config *cc, dmreq = dmreq_of_req(cc, req); iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(cc->tfm) + 1); + crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); dmreq->ctx = ctx; sg_init_table(&dmreq->sg_in, 1); @@ -454,16 +543,19 @@ static int crypt_convert_block(struct crypt_config *cc, static void kcryptd_async_done(struct crypto_async_request *async_req, int error); + static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - if (!cc->req) - cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(cc->req, cc->tfm); - ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, - dmreq_of_req(cc, cc->req)); + struct crypt_cpu *this_cc = this_crypt_config(cc); + + if (!this_cc->req) + this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); + + ablkcipher_request_set_tfm(this_cc->req, this_cc->tfm); + ablkcipher_request_set_callback(this_cc->req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); } /* @@ -472,6 +564,7 @@ static void crypt_alloc_req(struct crypt_config *cc, static int crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { + struct crypt_cpu *this_cc = this_crypt_config(cc); int r; atomic_set(&ctx->pending, 1); @@ -483,7 +576,7 @@ static int crypt_convert(struct crypt_config *cc, atomic_inc(&ctx->pending); - r = crypt_convert_block(cc, ctx, cc->req); + r = crypt_convert_block(cc, ctx, this_cc->req); switch (r) { /* async */ @@ -492,7 +585,7 @@ static int crypt_convert(struct crypt_config *cc, INIT_COMPLETION(ctx->restart); /* fall through*/ case -EINPROGRESS: - cc->req = NULL; + this_cc->req = NULL; ctx->sector++; continue; @@ -651,6 +744,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io) * They must be separated as otherwise the final stages could be * starved by new requests which can block in the first stages due * to memory allocation. 
+ * + * The work is done per CPU global for all dm-crypt instances. + * They should not depend on each other and do not block. */ static void crypt_endio(struct bio *clone, int error) { @@ -971,6 +1067,20 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) } } +static int crypt_setkey_allcpus(struct crypt_config *cc) +{ + int cpu, err = 0, r; + + for_each_possible_cpu(cpu) { + r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfm, + cc->key, cc->key_size); + if (r) + err = r; + } + + return err; +} + static int crypt_set_key(struct crypt_config *cc, char *key) { /* The key size may not be changed. */ @@ -986,19 +1096,22 @@ static int crypt_set_key(struct crypt_config *cc, char *key) set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + return crypt_setkey_allcpus(cc); } static int crypt_wipe_key(struct crypt_config *cc) { clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size); + + return crypt_setkey_allcpus(cc); } static void crypt_dtr(struct dm_target *ti) { struct crypt_config *cc = ti->private; + struct crypt_cpu *cpu_cc; + int cpu; ti->private = NULL; @@ -1010,6 +1123,15 @@ static void crypt_dtr(struct dm_target *ti) if (cc->crypt_queue) destroy_workqueue(cc->crypt_queue); + if (cc->cpu) + for_each_possible_cpu(cpu) { + cpu_cc = per_cpu_ptr(cc->cpu, cpu); + if (cpu_cc->req) + mempool_free(cpu_cc->req, cc->req_pool); + if (cpu_cc->tfm) + crypto_free_ablkcipher(cpu_cc->tfm); + } + if (cc->bs) bioset_free(cc->bs); @@ -1023,12 +1145,12 @@ static void crypt_dtr(struct dm_target *ti) if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); - if (cc->tfm && !IS_ERR(cc->tfm)) - crypto_free_ablkcipher(cc->tfm); - if (cc->dev) dm_put_device(ti, cc->dev); + if (cc->cpu) + free_percpu(cc->cpu); + kzfree(cc->cipher); kzfree(cc->cipher_string); @@ -1040,9 +1162,10 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) { struct crypt_config *cc = ti->private; + struct crypto_ablkcipher *tfm; char *tmp, *cipher, *chainmode, *ivmode, *ivopts; char *cipher_api = NULL; - int ret = -EINVAL; + int cpu, ret = -EINVAL; /* Convert to crypto api definition? */ if (strchr(cipher_in, '(')) { @@ -1072,6 +1195,12 @@ static int crypt_ctr_cipher(struct dm_target *ti, if (tmp) DMWARN("Ignoring unexpected additional cipher options"); + cc->cpu = alloc_percpu(struct crypt_cpu); + if (!cc->cpu) { + ti->error = "Cannot allocate per cpu state"; + goto bad_mem; + } + /* * For compatibility with the original dm-crypt mapping format, if * only the cipher name is supplied, use cbc-plain. 
@@ -1098,11 +1227,14 @@ static int crypt_ctr_cipher, } /* Allocate cipher */ - cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); - if (IS_ERR(cc->tfm)) { - ret = PTR_ERR(cc->tfm); - ti->error = "Error allocating crypto tfm"; - goto bad; + for_each_possible_cpu(cpu) { + tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + ti->error = "Error allocating crypto tfm"; + goto bad; + } + per_cpu_ptr(cc->cpu, cpu)->tfm = tfm; } /* Initialize and set key */ @@ -1113,7 +1245,7 @@ static int crypt_ctr_cipher, } /* Initialize IV */ - cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm); + cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); if (cc->iv_size) /* at least a 64 bit sector number should fit in our buffer */ cc->iv_size = max(cc->iv_size, @@ -1208,9 +1340,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm); + cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) & + cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & ~(crypto_tfm_ctx_alignment() - 1); cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + @@ -1219,7 +1351,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot allocate crypt request mempool"; goto bad; } - cc->req = NULL; cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { @@ -1252,13 +1383,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->start = tmpll; ret = -ENOMEM; - cc->io_queue = create_singlethread_workqueue("kcryptd_io"); + cc->io_queue = alloc_workqueue("kcryptd_io", + WQ_NON_REENTRANT| + WQ_MEM_RECLAIM, + 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } - cc->crypt_queue = create_singlethread_workqueue("kcryptd"); + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_NON_REENTRANT| + WQ_CPU_INTENSIVE| + WQ_MEM_RECLAIM, + 1); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; @@ -1418,7 +1556,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 8, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr,
From 20c82538e4f5ede51bc2b4795bc6e5cae772796d Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:53 +0000 Subject: [PATCH 15/32] dm crypt: use io thread for reads only if mempool exhausted If there is enough memory, the code can submit the bio directly instead of queueing the operation to a separate thread. Try to allocate the bio clone with GFP_NOWAIT and only if that fails use the separate queue (the map function cannot block here).
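Condensed, the new read path follows this pattern (a sketch of the crypt_map() logic in the diff below):

	/* Runs in the map path, which must not sleep. */
	static void read_submit_or_defer(struct dm_crypt_io *io)
	{
		/* Non-blocking attempt: fails instead of sleeping. */
		if (!kcryptd_io_read(io, GFP_NOWAIT))
			return;			/* clone submitted directly */

		/* Mempool exhausted: let the worker retry with GFP_NOIO. */
		kcryptd_queue_io(io);
	}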
Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 50ae6ef83738..dc4403bcc6a0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -787,26 +787,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_destructor = dm_crypt_bio_destructor; } -static void kcryptd_io_read(struct dm_crypt_io *io) +static void kcryptd_unplug(struct crypt_config *cc) +{ + blk_unplug(bdev_get_queue(cc->dev->bdev)); +} + +static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) { struct crypt_config *cc = io->target->private; struct bio *base_bio = io->base_bio; struct bio *clone; - crypt_inc_pending(io); - /* * The block layer might modify the bvec array, so always * copy the required bvecs because we need the original * one in order to decrypt the whole bio data *afterwards*. */ - clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); - if (unlikely(!clone)) { - io->error = -ENOMEM; - crypt_dec_pending(io); - return; + clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs); + if (!clone) { + kcryptd_unplug(cc); + return 1; } + crypt_inc_pending(io); + clone_init(io, clone); clone->bi_idx = 0; clone->bi_vcnt = bio_segments(base_bio); @@ -816,6 +820,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io) sizeof(struct bio_vec) * clone->bi_vcnt); generic_make_request(clone); + return 0; } static void kcryptd_io_write(struct dm_crypt_io *io) @@ -828,9 +833,12 @@ static void kcryptd_io(struct work_struct *work) { struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_io_read(io); - else + if (bio_data_dir(io->base_bio) == READ) { + crypt_inc_pending(io); + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = -ENOMEM; + crypt_dec_pending(io); + } else kcryptd_io_write(io); } @@ -1424,9 +1432,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector)); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_queue_io(io); - else + if (bio_data_dir(io->base_bio) == READ) { + if (kcryptd_io_read(io, GFP_NOWAIT)) + kcryptd_queue_io(io); + } else kcryptd_queue_crypt(io); return DM_MAPIO_SUBMITTED; From 2dc5327d3acb3340ab6fa3981401b076b78a51f4 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:54 +0000 Subject: [PATCH 16/32] dm crypt: add post iv call to iv generator IV (initialisation vector) can in principle depend not only on sector but also on plaintext data (or other attributes). Change IV generator interface to work directly with dmreq structure to allow such dependence in generator. Also add post() function which is called after the crypto operation. This allows tricky modification of decrypted data or IV internals. In asynchronous mode the post() can be called after ctx->sector count was increased so it is needed to add iv_sector copy directly to dmreq structure. (N.B. 
dmreq always include only one sector in scatterlists) Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 48 +++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index dc4403bcc6a0..e0ebe685be6a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -64,6 +64,7 @@ struct dm_crypt_request { struct convert_context *ctx; struct scatterlist sg_in; struct scatterlist sg_out; + sector_t iv_sector; }; struct crypt_config; @@ -74,7 +75,10 @@ struct crypt_iv_operations { void (*dtr)(struct crypt_config *cc); int (*init)(struct crypt_config *cc); int (*wipe)(struct crypt_config *cc); - int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); + int (*generator)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); + int (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); }; struct iv_essiv_private { @@ -168,6 +172,7 @@ static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) { @@ -205,19 +210,20 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ -static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); + *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); return 0; } static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, - sector_t sector) + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); return 0; } @@ -378,12 +384,13 @@ bad: return err; } -static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private; memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); + *(u64 *)iv = cpu_to_le64(dmreq->iv_sector); crypto_cipher_encrypt_one(essiv_tfm, iv, iv); return 0; @@ -417,19 +424,21 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc) { } -static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { __be64 val; memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ - val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1); + val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); return 0; } -static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); @@ -489,6 +498,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); } +static u8 *iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) 
+{
+	return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+		crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
+}
+
 static int crypt_convert_block(struct crypt_config *cc,
 			       struct convert_context *ctx,
 			       struct ablkcipher_request *req)
@@ -500,9 +516,9 @@ static int crypt_convert_block(struct crypt_config *cc,
 	int r = 0;

 	dmreq = dmreq_of_req(cc, req);
-	iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
-			 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
+	iv = iv_of_dmreq(cc, dmreq);

+	dmreq->iv_sector = ctx->sector;
 	dmreq->ctx = ctx;
 	sg_init_table(&dmreq->sg_in, 1);
 	sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -525,7 +541,7 @@ static int crypt_convert_block(struct crypt_config *cc,
 	}

 	if (cc->iv_gen_ops) {
-		r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
+		r = cc->iv_gen_ops->generator(cc, iv, dmreq);
 		if (r < 0)
 			return r;
 	}
@@ -538,6 +554,9 @@ static int crypt_convert_block(struct crypt_config *cc,
 	else
 		r = crypto_ablkcipher_decrypt(req);

+	if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
+		r = cc->iv_gen_ops->post(cc, iv, dmreq);
+
 	return r;
 }

@@ -1005,6 +1024,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 		return;
 	}

+	if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
+		error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+
 	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);

 	if (!atomic_dec_and_test(&ctx->pending))

From d1f9642381847e2b94caa34c3533211cf36ffcf4 Mon Sep 17 00:00:00 2001
From: Milan Broz
Date: Thu, 13 Jan 2011 19:59:54 +0000
Subject: [PATCH 17/32] dm crypt: add multi key capability

This patch adds generic multikey handling, to be used in the following
patch for Loop-AES mode compatibility.

It extends the mapping table with an optional key count and implements
the generic multi-key capability. With more keys defined, the key
string is divided into several sections and these are used for the
tfms. The tfm is selected according to the sector offset (sector 0 ->
tfm[0], sector 1 -> tfm[1], ..., sector N -> tfm[N modulo keycount]);
only power-of-two values are supported for keycount here.

Because the tfms are allocated per-cpu, this mode can take a lot of
memory on large SMP systems.

Signed-off-by: Milan Broz
Signed-off-by: Alasdair G Kergon
Cc: Max Vozeler
---
 Documentation/device-mapper/dm-crypt.txt |  7 +-
 drivers/md/dm-crypt.c                    | 85 ++++++++++++++++++------
 2 files changed, 70 insertions(+), 22 deletions(-)

diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 524de926290d..59293ac4a5d0 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -8,7 +8,7 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>

 <cipher>
     Encryption cipher and an optional IV generation mode.
-    (In format cipher-chainmode-ivopts:ivmode).
+    (In format cipher[:keycount]-chainmode-ivopts:ivmode).
     Examples:
        des
        aes-cbc-essiv:sha256
@@ -20,6 +20,11 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
     Key used for encryption. It is encoded as a hexadecimal number.
     You can only use key sizes that are valid for the selected cipher.

+<keycount>
+    Multi-key compatibility mode. You can define <keycount> keys and
+    then sectors are encrypted according to their offsets (sector 0 uses key0;
+    sector 1 uses key1 etc.). <keycount> must be a power of two.
+
 <iv_offset>
     The IV offset is a sector count that is added to the sector number
     before creating the IV.
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e0ebe685be6a..b8b9267c4dbb 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -101,10 +101,9 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; */ struct crypt_cpu { struct ablkcipher_request *req; - struct crypto_ablkcipher *tfm; - /* ESSIV: struct crypto_cipher *essiv_tfm */ void *iv_private; + struct crypto_ablkcipher *tfms[0]; }; /* @@ -143,6 +142,7 @@ struct crypt_config { * per_cpu_ptr() only. */ struct crypt_cpu __percpu *cpu; + unsigned tfms_count; /* * Layout of each crypto request: @@ -161,6 +161,7 @@ struct crypt_config { unsigned long flags; unsigned int key_size; + unsigned int key_parts; u8 key[0]; }; @@ -184,7 +185,7 @@ static struct crypt_cpu *this_crypt_config(struct crypt_config *cc) */ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) { - return __this_cpu_ptr(cc->cpu)->tfm; + return __this_cpu_ptr(cc->cpu)->tfms[0]; } /* @@ -567,11 +568,12 @@ static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { struct crypt_cpu *this_cc = this_crypt_config(cc); + unsigned key_index = ctx->sector & (cc->tfms_count - 1); if (!this_cc->req) this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(this_cc->req, this_cc->tfm); + ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]); ablkcipher_request_set_callback(this_cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, kcryptd_async_done, dmreq_of_req(cc, this_cc->req)); @@ -1097,15 +1099,48 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size) } } +static void crypt_free_tfms(struct crypt_config *cc, int cpu) +{ + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; + + for (i = 0; i < cc->tfms_count; i++) + if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) { + crypto_free_ablkcipher(cpu_cc->tfms[i]); + cpu_cc->tfms[i] = NULL; + } +} + +static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode) +{ + struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu); + unsigned i; + int err; + + for (i = 0; i < cc->tfms_count; i++) { + cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); + if (IS_ERR(cpu_cc->tfms[i])) { + err = PTR_ERR(cpu_cc->tfms[i]); + crypt_free_tfms(cc, cpu); + return err; + } + } + + return 0; +} + static int crypt_setkey_allcpus(struct crypt_config *cc) { - int cpu, err = 0, r; + unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); + int cpu, err = 0, i, r; for_each_possible_cpu(cpu) { - r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfm, - cc->key, cc->key_size); - if (r) - err = r; + for (i = 0; i < cc->tfms_count; i++) { + r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfms[i], + cc->key + (i * subkey_size), subkey_size); + if (r) + err = r; + } } return err; @@ -1158,8 +1193,7 @@ static void crypt_dtr(struct dm_target *ti) cpu_cc = per_cpu_ptr(cc->cpu, cpu); if (cpu_cc->req) mempool_free(cpu_cc->req, cc->req_pool); - if (cpu_cc->tfm) - crypto_free_ablkcipher(cpu_cc->tfm); + crypt_free_tfms(cc, cpu); } if (cc->bs) @@ -1192,8 +1226,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) { struct crypt_config *cc = ti->private; - struct crypto_ablkcipher *tfm; - char *tmp, *cipher, *chainmode, *ivmode, *ivopts; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; char *cipher_api = NULL; int cpu, ret = -EINVAL; @@ -1209,10 +1242,20 @@ static int crypt_ctr_cipher(struct dm_target *ti, /* * Legacy dm-crypt cipher specification - 
	 *	cipher-mode-iv:ivopts
+	 *	cipher[:keycount]-mode-iv:ivopts
 	 */
 	tmp = cipher_in;
-	cipher = strsep(&tmp, "-");
+	keycount = strsep(&tmp, "-");
+	cipher = strsep(&keycount, ":");
+
+	if (!keycount)
+		cc->tfms_count = 1;
+	else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
+		 !is_power_of_2(cc->tfms_count)) {
+		ti->error = "Bad cipher key count specification";
+		return -EINVAL;
+	}
+	cc->key_parts = cc->tfms_count;

 	cc->cipher = kstrdup(cipher, GFP_KERNEL);
 	if (!cc->cipher)
@@ -1225,7 +1268,9 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");

-	cc->cpu = alloc_percpu(struct crypt_cpu);
+	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)) +
+				 cc->tfms_count * sizeof(*(cc->cpu->tfms)),
+				 __alignof__(struct crypt_cpu));
 	if (!cc->cpu) {
 		ti->error = "Cannot allocate per cpu state";
 		goto bad_mem;
@@ -1258,13 +1303,11 @@ static int crypt_ctr_cipher(struct dm_target *ti,

 	/* Allocate cipher */
 	for_each_possible_cpu(cpu) {
-		tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
-		if (IS_ERR(tfm)) {
-			ret = PTR_ERR(tfm);
+		ret = crypt_alloc_tfms(cc, cpu, cipher_api);
+		if (ret < 0) {
 			ti->error = "Error allocating crypto tfm";
 			goto bad;
 		}
-		per_cpu_ptr(cc->cpu, cpu)->tfm = tfm;
 	}

 	/* Initialize and set key */
@@ -1587,7 +1630,7 @@ static int crypt_iterate_devices(struct dm_target *ti,

 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 9, 0},
+	.version = {1, 10, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,

From 34745785937a2003c144c0d4802fa637470d87af Mon Sep 17 00:00:00 2001
From: Milan Broz
Date: Thu, 13 Jan 2011 19:59:55 +0000
Subject: [PATCH 18/32] dm crypt: add loop aes iv generator

This patch adds a compatible implementation of the block chaining mode
used by the Loop-AES block device encryption system
(http://loop-aes.sourceforge.net/) designed by Jari Ruusu.

It operates on full 512 byte sectors and uses CBC with an IV derived
from the sector number, the data and, optionally, an extra IV seed.
This means that after CBC decryption the first block of a sector must
be tweaked according to the decrypted data.

Loop-AES can use three encryption schemes:
 version 1: plain aes-cbc mode (already compatible)
 version 2: uses a 64-key multikey scheme with its own IV generator
 version 3: the same as version 2 with an additional IV seed
            (it uses 65 keys; the last key is used as the IV seed)

The IV generator is named lmk (Loop-AES multikey) here, and the cipher
specification looks like: aes:64-cbc-lmk

Versions 2 and 3 are recognised according to the length of the provided
multi-key string (which is just the hex-encoded "raw key" used in the
original Loop-AES ioctl).

Configuration of the device and decoding of the key string will be done
in userspace (cryptsetup). (Loop-AES stores keys in a gpg-encrypted
file; the raw keys are the output of simple hashing of the lines in
this file.)
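To make the key-length detection concrete, here is a worked sketch
(illustrative only; it restates the check added to crypt_ctr_cipher()
below). With AES-128 and the spec aes:64-cbc-lmk, a version 3 key
string decodes to 65 * 16 = 1040 bytes, and the leftover after dividing
by the 64 tfm keys marks the seed:

	/* Sketch: detect the extra Loop-AES v3 IV seed by key length. */
	unsigned key_parts = cc->tfms_count;	/* 64 from "aes:64-cbc-lmk" */

	if (cc->key_size % key_parts)		/* 1040 % 64 == 16 -> v3 */
		key_parts++;			/* 65th subkey is the IV seed */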
Based on an implementation by Max Vozeler: http://article.gmane.org/gmane.linux.kernel.cryptoapi/3752/ Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon CC: Max Vozeler --- drivers/md/dm-crypt.c | 193 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 192 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b8b9267c4dbb..4e054bd91664 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #include @@ -90,6 +93,12 @@ struct iv_benbi_private { int shift; }; +#define LMK_SEED_SIZE 64 /* hash + 0 */ +struct iv_lmk_private { + struct crypto_shash *hash_tfm; + u8 *seed; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. @@ -133,6 +142,7 @@ struct crypt_config { union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; + struct iv_lmk_private lmk; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; @@ -207,6 +217,20 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) * null: the initial vector is always zero. Provides compatibility with * obsolete loop_fish2 devices. Do not use for new devices. * + * lmk: Compatible implementation of the block chaining mode used + * by the Loop-AES block device encryption system + * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from the sector number, the data and + * optionally extra IV seed. + * This means that after decryption the first block + * of sector must be tweaked according to decrypted data. + * Loop-AES can use three encryption schemes: + * version 1: is plain aes-cbc mode + * version 2: uses 64 multikey scheme with lmk IV generator + * version 3: the same as version 2 with additional IV seed + * (it uses 65 keys, last key is used as IV seed) + * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ @@ -446,6 +470,156 @@ static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, return 0; } +static void crypt_iv_lmk_dtr(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) + crypto_free_shash(lmk->hash_tfm); + lmk->hash_tfm = NULL; + + kzfree(lmk->seed); + lmk->seed = NULL; +} + +static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(lmk->hash_tfm)) { + ti->error = "Error initializing LMK hash"; + return PTR_ERR(lmk->hash_tfm); + } + + /* No seed in LMK version 2 */ + if (cc->key_parts == cc->tfms_count) { + lmk->seed = NULL; + return 0; + } + + lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); + if (!lmk->seed) { + crypt_iv_lmk_dtr(cc); + ti->error = "Error kmallocing seed storage in LMK"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_lmk_init(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + int subkey_size = cc->key_size / cc->key_parts; + + /* LMK seed is on the position of LMK_KEYS + 1 key */ + if (lmk->seed) + memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), + crypto_shash_digestsize(lmk->hash_tfm)); + + return 0; +} + +static int crypt_iv_lmk_wipe(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->seed) + memset(lmk->seed, 0, 
LMK_SEED_SIZE); + + return 0; +} + +static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(lmk->hash_tfm)]; + } sdesc; + struct md5_state md5state; + u32 buf[4]; + int i, r; + + sdesc.desc.tfm = lmk->hash_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(&sdesc.desc); + if (r) + return r; + + if (lmk->seed) { + r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); + if (r) + return r; + } + + /* Sector is always 512B, block size 16, add data of blocks 1-31 */ + r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); + if (r) + return r; + + /* Sector is cropped to 56 bits here */ + buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); + buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); + buf[2] = cpu_to_le32(4024); + buf[3] = 0; + r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); + if (r) + return r; + + /* No MD5 padding here */ + r = crypto_shash_export(&sdesc.desc, &md5state); + if (r) + return r; + + for (i = 0; i < MD5_HASH_WORDS; i++) + __cpu_to_le32s(&md5state.hash[i]); + memcpy(iv, &md5state.hash, cc->iv_size); + + return 0; +} + +static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *src; + int r = 0; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src, KM_USER0); + } else + memset(iv, 0, cc->iv_size); + + return r; +} + +static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + return 0; + + dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); + + /* Tweak the first block of plaintext sector */ + if (!r) + crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); + + kunmap_atomic(dst, KM_USER0); + return r; +} + static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -472,6 +646,15 @@ static struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; +static struct crypt_iv_operations crypt_iv_lmk_ops = { + .ctr = crypt_iv_lmk_ctr, + .dtr = crypt_iv_lmk_dtr, + .init = crypt_iv_lmk_init, + .wipe = crypt_iv_lmk_wipe, + .generator = crypt_iv_lmk_gen, + .post = crypt_iv_lmk_post +}; + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -1341,7 +1524,15 @@ static int crypt_ctr_cipher(struct dm_target *ti, cc->iv_gen_ops = &crypt_iv_benbi_ops; else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; - else { + else if (strcmp(ivmode, "lmk") == 0) { + cc->iv_gen_ops = &crypt_iv_lmk_ops; + /* Version 2 and 3 is recognised according + * to length of provided multi-key string. + * If present (version 3), last key is used as IV seed. 
+ */ + if (cc->key_size % cc->key_parts) + cc->key_parts++; + } else { ret = -EINVAL; ti->error = "Invalid IV mode"; goto bad; From 810b492375f4aed5ce222982054adc0394a4bd33 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 13 Jan 2011 19:59:55 +0000 Subject: [PATCH 19/32] dm ioctl: suppress needless warning messages The device-mapper should not send warning messages to syslog if a device is not found. This can be done by userspace according to the returned dm-ioctl error code. So move these messages to debug level and use rate limiting to not flood syslog. Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index f3481d922206..6d12775a1061 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -779,7 +779,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -791,7 +791,7 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) */ r = dm_lock_for_deletion(md); if (r) { - DMWARN("unable to remove open device %s", hc->name); + DMDEBUG_LIMIT("unable to remove open device %s", hc->name); up_write(&_hash_lock); dm_put(md); return r; @@ -938,7 +938,7 @@ static int do_resume(struct dm_ioctl *param) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -1265,7 +1265,7 @@ static int table_clear(struct dm_ioctl *param, size_t param_size) hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } From fecec20e55ec117a09857ac1a455e2e6e2f17df4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:56 +0000 Subject: [PATCH 20/32] dm snapshot: remove unused dm_snapshot queued_bios_work dm_snapshot->queued_bios_work isn't used. Remove ->queued_bios[_work] from dm_snapshot structure, the flush_queued_bios work function and ksnapd workqueue. The DM snapshot changes that were going to use the ksnapd workqueue were either superseded (fix for origin write races) or never completed (deallocation of invalid snapshot's memory via workqueue). 
Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 53cf79d8bcbc..0f47698beafa 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -19,7 +19,6 @@ #include #include #include -#include #include "dm-exception-store.h" @@ -106,10 +105,6 @@ struct dm_snapshot { struct dm_kcopyd_client *kcopyd_client; - /* Queue of snapshot writes for ksnapd to flush */ - struct bio_list queued_bios; - struct work_struct queued_bios_work; - /* Wait for events based on state_bits */ unsigned long state_bits; @@ -160,9 +155,6 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *s) } EXPORT_SYMBOL(dm_snap_cow); -static struct workqueue_struct *ksnapd; -static void flush_queued_bios(struct work_struct *work); - static sector_t chunk_to_sector(struct dm_exception_store *store, chunk_t chunk) { @@ -1153,9 +1145,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); - bio_list_init(&s->queued_bios); - INIT_WORK(&s->queued_bios_work, flush_queued_bios); - ti->private = s; ti->num_flush_requests = num_flush_requests; @@ -1279,8 +1268,6 @@ static void snapshot_dtr(struct dm_target *ti) struct dm_snapshot *s = ti->private; struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; - flush_workqueue(ksnapd); - down_read(&_origins_lock); /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); @@ -1342,20 +1329,6 @@ static void flush_bios(struct bio *bio) } } -static void flush_queued_bios(struct work_struct *work) -{ - struct dm_snapshot *s = - container_of(work, struct dm_snapshot, queued_bios_work); - struct bio *queued_bios; - unsigned long flags; - - spin_lock_irqsave(&s->pe_lock, flags); - queued_bios = bio_list_get(&s->queued_bios); - spin_unlock_irqrestore(&s->pe_lock, flags); - - flush_bios(queued_bios); -} - static int do_origin(struct dm_dev *origin, struct bio *bio); /* @@ -2291,17 +2264,8 @@ static int __init dm_snapshot_init(void) goto bad_tracked_chunk_cache; } - ksnapd = create_singlethread_workqueue("ksnapd"); - if (!ksnapd) { - DMERR("Failed to create ksnapd workqueue."); - r = -ENOMEM; - goto bad_pending_pool; - } - return 0; -bad_pending_pool: - kmem_cache_destroy(tracked_chunk_cache); bad_tracked_chunk_cache: kmem_cache_destroy(pending_cache); bad_pending_cache: @@ -2322,8 +2286,6 @@ bad_register_snapshot_target: static void __exit dm_snapshot_exit(void) { - destroy_workqueue(ksnapd); - dm_unregister_target(&snapshot_target); dm_unregister_target(&origin_target); dm_unregister_target(&merge_target); From d5ffa387e24646cb1cb55d80fd0f182a00e0edb7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:56 +0000 Subject: [PATCH 21/32] dm: dont use flush_scheduled_work flush_scheduled_work() is being deprecated. Flush the used work directly instead. In all dm targets, the only work which uses system_wq is ->trigger_event. 
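In sketch form (illustrative, not taken verbatim from the diff), the
pattern is:

	/* The target queues its own item on the shared system workqueue... */
	schedule_work(&m->trigger_event);

	/* ...so teardown waits on exactly that item instead of draining
	 * everything with flush_scheduled_work(): */
	flush_work_sync(&m->trigger_event);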
Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-raid1.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 406091f9692b..fcc59c3f756e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -920,7 +920,7 @@ static void flush_multipath_work(struct multipath *m) flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); - flush_scheduled_work(); + flush_work_sync(&m->trigger_event); } static void multipath_dtr(struct dm_target *ti) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index c87756eb7a21..0d58b6f875cc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1138,7 +1138,7 @@ static void mirror_dtr(struct dm_target *ti) del_timer_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); - flush_scheduled_work(); + flush_work_sync(&ms->trigger_event); dm_kcopyd_client_destroy(ms->kcopyd_client); destroy_workqueue(ms->kmirrord_wq); free_context(ms, ti, ms->nr_mirrors); From f521f074abe7b3990f5df65482cdc3d851b80665 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:57 +0000 Subject: [PATCH 22/32] dm stripe: switch from local workqueue to system_wq kstriped only serves sc->kstriped_ws which runs dm_table_event(). This doesn't need to be executed from an ordered workqueue w/ rescuer. Drop kstriped and use the system_wq instead. While at it, rename kstriped_ws to trigger_event so that it's consistent with other dm modules. Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index f0371b4c4fbf..dddfa14f2982 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -39,23 +39,20 @@ struct stripe_c { struct dm_target *ti; /* Work struct used for triggering events*/ - struct work_struct kstriped_ws; + struct work_struct trigger_event; struct stripe stripe[0]; }; -static struct workqueue_struct *kstriped; - /* * An event is triggered whenever a drive * drops out of a stripe volume. 
*/ static void trigger_event(struct work_struct *work) { - struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); - + struct stripe_c *sc = container_of(work, struct stripe_c, + trigger_event); dm_table_event(sc->ti->table); - } static inline struct stripe_c *alloc_context(unsigned int stripes) @@ -160,7 +157,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - INIT_WORK(&sc->kstriped_ws, trigger_event); + INIT_WORK(&sc->trigger_event, trigger_event); /* Set pointer to dm target; used in trigger_event */ sc->ti = ti; @@ -211,7 +208,7 @@ static void stripe_dtr(struct dm_target *ti) for (i = 0; i < sc->stripes; i++) dm_put_device(ti, sc->stripe[i].dev); - flush_workqueue(kstriped); + flush_work_sync(&sc->trigger_event); kfree(sc); } @@ -367,7 +364,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, atomic_inc(&(sc->stripe[i].error_count)); if (atomic_read(&(sc->stripe[i].error_count)) < DM_IO_ERROR_THRESHOLD) - queue_work(kstriped, &sc->kstriped_ws); + schedule_work(&sc->trigger_event); } return error; @@ -401,7 +398,7 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", - .version = {1, 3, 0}, + .version = {1, 3, 1}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, @@ -422,20 +419,10 @@ int __init dm_stripe_init(void) return r; } - kstriped = create_singlethread_workqueue("kstriped"); - if (!kstriped) { - DMERR("failed to create workqueue kstriped"); - dm_unregister_target(&stripe_target); - return -ENOMEM; - } - return r; } void dm_stripe_exit(void) { dm_unregister_target(&stripe_target); - destroy_workqueue(kstriped); - - return; } From 4d4d66ab5322fa9b0f51842a76139387a40e1ce9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:57 +0000 Subject: [PATCH 23/32] dm: convert workqueues to alloc_ordered Convert all create[_singlethread]_work() users to the new alloc[_ordered]_workqueue(). This conversion is mechanical and doesn't introduce any behavior change. 
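In sketch form, the conversions below all follow one of two shapes:

	/*
	 * create_workqueue(name);
	 *     becomes alloc_workqueue(name, WQ_MEM_RECLAIM, 0);
	 *
	 * create_singlethread_workqueue(name);
	 *     becomes alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
	 *
	 * WQ_MEM_RECLAIM keeps a rescuer thread, which these dm
	 * workqueues need because they sit in the I/O path.
	 */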
Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-delay.c | 2 +- drivers/md/dm-kcopyd.c | 2 +- drivers/md/dm-mpath.c | 5 +++-- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-snap-persistent.c | 2 +- drivers/md/dm.c | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index baa11912cc94..f18375dcedd9 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -352,7 +352,7 @@ static int __init dm_delay_init(void) { int r = -ENOMEM; - kdelayd_wq = create_workqueue("kdelayd"); + kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!kdelayd_wq) { DMERR("Couldn't start kdelayd"); goto bad_queue; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index dad32f8bce7d..63d67169c7f4 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -672,7 +672,7 @@ int dm_kcopyd_client_create(unsigned int nr_pages, goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); + kc->kcopyd_wq = alloc_ordered_workqueue("kcopyd", WQ_MEM_RECLAIM); if (!kc->kcopyd_wq) goto bad_workqueue; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index fcc59c3f756e..35ab5781f88f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1675,7 +1675,7 @@ static int __init dm_multipath_init(void) return -EINVAL; } - kmultipathd = create_workqueue("kmpathd"); + kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); dm_unregister_target(&multipath_target); @@ -1689,7 +1689,8 @@ static int __init dm_multipath_init(void) * old workqueue would also create a bottleneck in the * path of the storage hardware device activation. 
*/ - kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); + kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", + WQ_MEM_RECLAIM); if (!kmpath_handlerd) { DMERR("failed to create workqueue kmpath_handlerd"); destroy_workqueue(kmultipathd); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 0d58b6f875cc..39917430f9f8 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1085,7 +1085,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_requests = 1; ti->num_discard_requests = 1; - ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); + ms->kmirrord_wq = alloc_ordered_workqueue("kmirrord", WQ_MEM_RECLAIM); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 2129cdb115dc..d3021a69c857 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store, atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; - ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); + ps->metadata_wq = alloc_ordered_workqueue("ksnaphd", WQ_MEM_RECLAIM); if (!ps->metadata_wq) { kfree(ps); DMERR("couldn't start header metadata update thread"); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0de692176ad4..39aaa9290128 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1883,7 +1883,7 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = create_singlethread_workqueue("kdmflush"); + md->wq = alloc_ordered_workqueue("kdmflush", WQ_MEM_RECLAIM); if (!md->wq) goto bad_thread; From 9c4376de98719d2768dd919553843de34bb094a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:58 +0000 Subject: [PATCH 24/32] dm: use non reentrant workqueues if equivalent kmirrord_wq, kcopyd_work and md->wq are created per dm instance and serve only a single work item from the dm instance, so non-reentrant workqueues would provide the same ordering guarantees as ordered ones while allowing CPU affinity and use of the workqueues for other purposes. Switch them to non-reentrant workqueues. 
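A sketch of the argument (illustrative comment, not from the patch):

	/*
	 * Each of these queues serves exactly one work item, so "ordered"
	 * only ever meant "this one item never runs concurrently with
	 * itself". WQ_NON_REENTRANT gives precisely that guarantee -- a
	 * re-queued item waits for the running instance instead of
	 * starting on another CPU -- while dropping the single-thread
	 * restriction, so the queue keeps CPU affinity and stays usable
	 * for other purposes.
	 */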
Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-kcopyd.c | 3 ++- drivers/md/dm-raid1.c | 5 +++-- drivers/md/dm.c | 3 ++- include/linux/dm-ioctl.h | 4 ++-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 63d67169c7f4..924f5f0084c2 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -672,7 +672,8 @@ int dm_kcopyd_client_create(unsigned int nr_pages, goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = alloc_ordered_workqueue("kcopyd", WQ_MEM_RECLAIM); + kc->kcopyd_wq = alloc_workqueue("kcopyd", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!kc->kcopyd_wq) goto bad_workqueue; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 39917430f9f8..dee326775c60 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1085,7 +1085,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_requests = 1; ti->num_discard_requests = 1; - ms->kmirrord_wq = alloc_ordered_workqueue("kmirrord", WQ_MEM_RECLAIM); + ms->kmirrord_wq = alloc_workqueue("kmirrord", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; @@ -1414,7 +1415,7 @@ static int mirror_iterate_devices(struct dm_target *ti, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 12, 0}, + .version = {1, 12, 1}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 39aaa9290128..e504bb40d60e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1883,7 +1883,8 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = alloc_ordered_workqueue("kdmflush", WQ_MEM_RECLAIM); + md->wq = alloc_workqueue("kdmflush", + WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); if (!md->wq) goto bad_thread; diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index e453e1174ae1..78bbf47bbb96 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -268,8 +268,8 @@ enum { #define DM_VERSION_MAJOR 4 #define DM_VERSION_MINOR 19 -#define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2010-10-14)" +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2011-01-07)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ From 239c8dd533e74de4a7f3c85c4f9f430eb08867c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jan 2011 19:59:59 +0000 Subject: [PATCH 25/32] dm snapshot: persistent make metadata_wq multithreaded metadata_wq serves on-stack work items from chunk_io(). Even if multiple chunk_io() are simultaneously in progress, each is independent and queued only once, so multithreaded workqueue can be safely used. Switch metadata_wq to multithread and flush the work item instead of the workqueue in chunk_io(). 
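The resulting chunk_io() pattern, sketched with surrounding details
elided:

	struct mdata_req req;	/* lives on chunk_io()'s stack */

	INIT_WORK_ONSTACK(&req.work, do_metadata);
	queue_work(ps->metadata_wq, &req.work);
	flush_work(&req.work);	/* wait for *this* item, not the queue */
	return req.result;

Because each caller owns its own on-stack request, concurrent chunk_io()
calls never collide, so a multithreaded queue is safe.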
Signed-off-by: Tejun Heo Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap-persistent.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index d3021a69c857..95891dfcbca0 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, */ INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); - flush_workqueue(ps->metadata_wq); + flush_work(&req.work); return req.result; } @@ -818,7 +818,7 @@ static int persistent_ctr(struct dm_exception_store *store, atomic_set(&ps->pending_count, 0); ps->callbacks = NULL; - ps->metadata_wq = alloc_ordered_workqueue("ksnaphd", WQ_MEM_RECLAIM); + ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); if (!ps->metadata_wq) { kfree(ps); DMERR("couldn't start header metadata update thread"); From b83b2f295aec418c7501c05df4dfd168a79d165a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 13 Jan 2011 19:59:59 +0000 Subject: [PATCH 26/32] dm snapshot: avoid storing private suspended state Use dm_suspended() rather than having each snapshot target maintain a private 'suspended' flag in struct dm_snapshot. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 0f47698beafa..fdde53cd12b7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -79,9 +79,6 @@ struct dm_snapshot { /* Origin writes don't trigger exceptions until this is set */ int active; - /* Whether or not owning mapped_device is suspended */ - int suspended; - atomic_t pending_exceptions_count; mempool_t *pending_pool; @@ -1102,7 +1099,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->ti = ti; s->valid = 1; s->active = 0; - s->suspended = 0; atomic_set(&s->pending_exceptions_count, 0); init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); @@ -1733,15 +1729,6 @@ static void snapshot_merge_presuspend(struct dm_target *ti) stop_merge(s); } -static void snapshot_postsuspend(struct dm_target *ti) -{ - struct dm_snapshot *s = ti->private; - - down_write(&s->lock); - s->suspended = 1; - up_write(&s->lock); -} - static int snapshot_preresume(struct dm_target *ti) { int r = 0; @@ -1756,7 +1743,7 @@ static int snapshot_preresume(struct dm_target *ti) DMERR("Unable to resume snapshot source until " "handover completes."); r = -EINVAL; - } else if (!snap_src->suspended) { + } else if (!dm_suspended(snap_src->ti)) { DMERR("Unable to perform snapshot handover until " "source is suspended."); r = -EINVAL; @@ -1789,7 +1776,6 @@ static void snapshot_resume(struct dm_target *ti) down_write(&s->lock); s->active = 1; - s->suspended = 0; up_write(&s->lock); } @@ -2167,7 +2153,7 @@ static int origin_iterate_devices(struct dm_target *ti, static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 7, 0}, + .version = {1, 7, 1}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -2180,13 +2166,12 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_map, .end_io = snapshot_end_io, - .postsuspend = snapshot_postsuspend, 
.preresume = snapshot_preresume, .resume = snapshot_resume, .status = snapshot_status, @@ -2195,14 +2180,13 @@ static struct target_type snapshot_target = { static struct target_type merge_target = { .name = dm_snapshot_merge_target_name, - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_merge_map, .end_io = snapshot_end_io, .presuspend = snapshot_merge_presuspend, - .postsuspend = snapshot_postsuspend, .preresume = snapshot_preresume, .resume = snapshot_merge_resume, .status = snapshot_status, From dbc883f1570d992ba926a8c9e22140ba473c6cc1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 13 Jan 2011 20:00:00 +0000 Subject: [PATCH 27/32] dm log: use PTR_ERR value instead of ENOMEM It's nicer to return the PTR_ERR() value instead of just returning -ENOMEM. In the current code the PTR_ERR() value is always equal to -ENOMEM so this doesn't actually affect anything, but still... In addition, dm_dirty_log_create() doesn't check for a specific -ENOMEM return. So this change is safe relative to potential for a non -ENOMEM return in the future. Signed-off-by: Dan Carpenter Acked-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 33420e68d153..6951536ea29c 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -455,7 +455,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, r = PTR_ERR(lc->io_req.client); DMWARN("couldn't allocate disk io client"); kfree(lc); - return -ENOMEM; + return r; } lc->disk_header = vmalloc(buf_size); From 052189a2ec956810feefb6a681416c5e6a207646 Mon Sep 17 00:00:00 2001 From: Kiyoshi Ueda Date: Thu, 13 Jan 2011 20:00:00 +0000 Subject: [PATCH 28/32] dm: remove superfluous irq disablement in dm_request_fn This patch changes spin_lock_irq() to spin_lock() in dm_request_fn(). This patch is just a clean-up and no functional change. The spin_lock_irq() was leftover from the early request-based dm code, where map_request() used to enable interrupts. Since current map_request() never enables interrupts, we can change it to spin_lock() to match the prior spin_unlock(). Auditing through the dm and block-layer code called from map_request(), I confirmed all functions save/restore interrupt status, so no function returning with interrupts enabled. Also I haven't observed any problem on my test environment which uses scsi and lpfc driver after heavy I/O testing with occasional path down/up. Added BUG_ON() to detect breakage in future. 
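The invariant being relied on (and now asserted), sketched with
surrounding code abbreviated:

	spin_unlock(q->queue_lock);	/* irqs remain disabled */
	if (map_request(ti, clone, md))	/* never re-enables them */
		goto requeued;

	BUG_ON(!irqs_disabled());	/* catch future breakage */
	spin_lock(q->queue_lock);	/* plain lock now suffices */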
Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e504bb40d60e..eaa3af0e0632 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1637,13 +1637,15 @@ static void dm_request_fn(struct request_queue *q) if (map_request(ti, clone, md)) goto requeued; - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); } goto out; requeued: - spin_lock_irq(q->queue_lock); + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); plug_and_out: if (!elv_queue_empty(q)) From 4e2d19e46b507018c6ed15f6c081d8f887ae229c Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Thu, 13 Jan 2011 20:00:01 +0000 Subject: [PATCH 29/32] dm mpath: delay activate_path retry on SCSI_DH_RETRY This patch adds a user-configurable 'pg_init_delay_msecs' feature. Use this feature to specify the number of milliseconds to delay before retrying scsi_dh_activate, when SCSI_DH_RETRY is returned. SCSI Device Handlers return SCSI_DH_IMM_RETRY if we could retry activation immediately and SCSI_DH_RETRY in cases where it is better to retry after some delay. Currently we immediately retry scsi_dh_activate irrespective of SCSI_DH_IMM_RETRY and SCSI_DH_RETRY. The 'pg_init_delay_msecs' feature may be provided during table create or load, e.g.: dmsetup create --table "0 20971520 multipath 3 queue_if_no_path \ pg_init_delay_msecs 2500 ..." mpatha The default for 'pg_init_delay_msecs' is 2000 milliseconds. Maximum configurable delay is 60000 milliseconds. Specifying a 'pg_init_delay_msecs' of 0 will cause immediate retry. Signed-off-by: Nikanth Karthikesan Signed-off-by: Chandra Seetharaman Acked-by: Mike Christie Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 48 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 35ab5781f88f..b82d28819e2a 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -23,6 +23,8 @@ #define DM_MSG_PREFIX "multipath" #define MESG_STR(x) x, sizeof(x) +#define DM_PG_INIT_DELAY_MSECS 2000 +#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) /* Path properties */ struct pgpath { @@ -33,7 +35,7 @@ struct pgpath { unsigned fail_count; /* Cumulative failure count */ struct dm_path path; - struct work_struct activate_path; + struct delayed_work activate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -64,11 +66,15 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; + unsigned nr_priority_groups; struct list_head priority_groups; + + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_delay_retry; /* Delay pg_init retry? 
*/ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; @@ -81,6 +87,7 @@ struct multipath { unsigned saved_queue_if_no_path;/* Saved state during suspension */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ + unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ struct work_struct process_queued_ios; struct list_head queued_ios; @@ -127,7 +134,7 @@ static struct pgpath *alloc_pgpath(void) if (pgpath) { pgpath->is_active = 1; - INIT_WORK(&pgpath->activate_path, activate_path); + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); } return pgpath; @@ -188,6 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) INIT_LIST_HEAD(&m->queued_ios); spin_lock_init(&m->lock); m->queue_io = 1; + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->process_queued_ios, process_queued_ios); INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); @@ -227,14 +235,19 @@ static void free_multipath(struct multipath *m) static void __pg_init_all_paths(struct multipath *m) { struct pgpath *pgpath; + unsigned long pg_init_delay = 0; m->pg_init_count++; m->pg_init_required = 0; + if (m->pg_init_delay_retry) + pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? + m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { /* Skip failed paths */ if (!pgpath->is_active) continue; - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, + pg_init_delay)) m->pg_init_in_progress++; } } @@ -782,8 +795,9 @@ static int parse_features(struct arg_set *as, struct multipath *m) const char *param_name; static struct param _params[] = { - {0, 3, "invalid number of feature args"}, + {0, 5, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, + {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; r = read_param(_params, shift(as), &argc, &ti->error); @@ -810,6 +824,14 @@ static int parse_features(struct arg_set *as, struct multipath *m) continue; } + if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) && + (argc >= 1)) { + r = read_param(_params + 2, shift(as), + &m->pg_init_delay_msecs, &ti->error); + argc--; + continue; + } + ti->error = "Unrecognised multipath feature request"; r = -EINVAL; } while (argc && !r); @@ -1022,7 +1044,7 @@ static int reinstate_path(struct pgpath *pgpath) m->current_pgpath = NULL; queue_work(kmultipathd, &m->process_queued_ios); } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { - if (queue_work(kmpath_handlerd, &pgpath->activate_path)) + if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) m->pg_init_in_progress++; } @@ -1157,6 +1179,7 @@ static void pg_init_done(void *data, int errors) struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; + unsigned delay_retry = 0; /* device or driver problems */ switch (errors) { @@ -1181,8 +1204,9 @@ static void pg_init_done(void *data, int errors) */ bypass_pg(m, pg, 1); break; - /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */ case SCSI_DH_RETRY: + /* Wait before retrying. 
*/ + delay_retry = 1; case SCSI_DH_IMM_RETRY: case SCSI_DH_RES_TEMP_UNAVAIL: if (pg_init_limit_reached(m, pgpath)) @@ -1215,6 +1239,7 @@ static void pg_init_done(void *data, int errors) if (!m->pg_init_required) m->queue_io = 0; + m->pg_init_delay_retry = delay_retry; queue_work(kmultipathd, &m->process_queued_ios); /* @@ -1229,7 +1254,7 @@ out: static void activate_path(struct work_struct *work) { struct pgpath *pgpath = - container_of(work, struct pgpath, activate_path); + container_of(work, struct pgpath, activate_path.work); scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), pg_init_done, pgpath); @@ -1370,11 +1395,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); else { DMEMIT("%u ", m->queue_if_no_path + - (m->pg_init_retries > 0) * 2); + (m->pg_init_retries > 0) * 2 + + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); if (m->queue_if_no_path) DMEMIT("queue_if_no_path "); if (m->pg_init_retries) DMEMIT("pg_init_retries %u ", m->pg_init_retries); + if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) + DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); } if (!m->hw_handler_name || type == STATUSTYPE_INFO) @@ -1643,7 +1671,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, From 9d357b0787bb3c91835d5e658c3bda178f9ca419 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:01 +0000 Subject: [PATCH 30/32] dm: introduce target callbacks and congestion callback DM currently implements congestion checking by checking on congestion in each component device. For raid456 we need to also check if the stripe cache is congested. Add per-target congestion checker callback support. Extending the target_callbacks structure with additional callback functions allows for establishing multiple callbacks per-target (a callback is also needed for unplug). 
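A hedged sketch of how a target constructor might use the new hook
(my_tgt and its fields are illustrative, not part of this patch):

	static int my_tgt_congested(struct dm_target_callbacks *cb, int bdi_bits)
	{
		struct my_tgt *t = container_of(cb, struct my_tgt, callbacks);

		/* report congestion while the target's internal cache is full */
		return t->cache_full ? bdi_bits : 0;
	}

	/* in the target constructor: */
	t->callbacks.congested_fn = my_tgt_congested;
	dm_table_add_target_callbacks(ti->table, &t->callbacks);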
Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 14 ++++++++++++++ include/linux/device-mapper.h | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 985c20a4f30e..7e2ec3c05550 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -71,6 +71,8 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; + + struct list_head target_callbacks; }; /* @@ -204,6 +206,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); + INIT_LIST_HEAD(&t->target_callbacks); atomic_set(&t->holders, 0); t->discards_supported = 1; @@ -1225,10 +1228,17 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) +{ + list_add(&cb->list, &t->target_callbacks); +} +EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); + int dm_table_any_congested(struct dm_table *t, int bdi_bits) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; int r = 0; list_for_each_entry(dd, devices, list) { @@ -1243,6 +1253,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) bdevname(dd->dm_dev.bdev, b)); } + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->congested_fn) + r |= cb->congested_fn(cb, bdi_bits); + return r; } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 2970022faa63..4b1c63d478ab 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -193,6 +193,12 @@ struct dm_target { char *error; }; +/* Each target can link one of these into the table */ +struct dm_target_callbacks { + struct list_head list; + int (*congested_fn) (struct dm_target_callbacks *, int); +}; + int dm_register_target(struct target_type *t); void dm_unregister_target(struct target_type *t); @@ -268,6 +274,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode, int dm_table_add_target(struct dm_table *t, const char *type, sector_t start, sector_t len, char *params); +/* + * Target_ctr should call this if it needs to add any callbacks. + */ +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb); + /* * Finally call this to make the table ready for use. */ From 99d03c141b40914b67d63c9d23b8da4386422ed7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:02 +0000 Subject: [PATCH 31/32] dm: per target unplug callback support Add per-target unplug callback support. 
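A matching illustrative sketch for the unplug hook (my_tgt and
my_tgt_kick_pending are hypothetical; the raid456 target added at the
end of this series is the intended user):

	static void my_tgt_unplug(struct dm_target_callbacks *cb)
	{
		struct my_tgt *t = container_of(cb, struct my_tgt, callbacks);

		/* kick any I/O the target itself is holding back,
		 * e.g. a stripe cache waiting for full stripes */
		my_tgt_kick_pending(t);
	}

	t->callbacks.unplug_fn = my_tgt_unplug;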
Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-table.c | 5 +++++ include/linux/device-mapper.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7e2ec3c05550..dffa0ac7c4f0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1278,6 +1278,7 @@ void dm_table_unplug_all(struct dm_table *t) { struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); @@ -1290,6 +1291,10 @@ void dm_table_unplug_all(struct dm_table *t) dm_device_name(t->md), bdevname(dd->dm_dev.bdev, b)); } + + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->unplug_fn) + cb->unplug_fn(cb); } struct mapped_device *dm_table_get_md(struct dm_table *t) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4b1c63d478ab..272496d1fae4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -197,6 +197,7 @@ struct dm_target { struct dm_target_callbacks { struct list_head list; int (*congested_fn) (struct dm_target_callbacks *, int); + void (*unplug_fn)(struct dm_target_callbacks *); }; int dm_register_target(struct target_type *t); From 9d09e663d5502c46f2d9481c04c1087e1c2da698 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Jan 2011 20:00:02 +0000 Subject: [PATCH 32/32] dm: raid456 basic support This patch is the skeleton for the DM target that will be the bridge from DM to MD (initially RAID456 and later RAID1). It provides a way to use device-mapper interfaces to the MD RAID456 drivers. As with all device-mapper targets, the nominal public interfaces are the constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO and STATUSTYPE_TABLE). The CTR table looks like the following: 1: raid \ 2: <#raid_params> \ 3: <#raid_devs> .. Line 1 contains the standard first three arguments to any device-mapper target - the start, length, and target type fields. The target type in this case is "raid". Line 2 contains the arguments that define the particular raid type/personality/level, the required arguments for that raid type, and any optional arguments. Possible raid types include: raid4, raid5_la, raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (again, raid1 is planned for the future.) The list of required and optional parameters is the same for all the current raid types. The required parameters are positional, while the optional parameters are given as key/value pairs. The possible parameters are as follows: Chunk size in sectors. [[no]sync] Force/Prevent RAID initialization [rebuild ] Rebuild the drive indicated by the index [daemon_sleep ] Time between bitmap daemon work to clear bits [min_recovery_rate ] Throttle RAID initialization [max_recovery_rate ] Throttle RAID initialization [max_write_behind ] See '-write-behind=' (man mdadm) [stripe_cache ] Stripe cache size for higher RAIDs Line 3 contains the list of devices that compose the array in metadata/data device pairs. If the metadata is stored separately, a '-' is given for the metadata device position. If a drive has failed or is missing at creation time, a '-' can be given for both the metadata and data drives for a given position. 
Examples: # RAID4 - 4 data drives, 1 parity # No metadata devices specified to hold superblock/bitmap info # Chunk size of 1MiB # (Lines separated for easy reading) 0 1960893648 raid \ raid4 1 2048 \ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 # RAID4 - 4 data drives, 1 parity (no metadata devices) # Chunk size of 1MiB, force RAID initialization, # min recovery rate at 20 kiB/sec/disk 0 1960893648 raid \ raid4 4 2048 min_recovery_rate 20 sync\ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 Performing a 'dmsetup table' should display the CTR table used to construct the mapping (with possible reordering of optional parameters). Performing a 'dmsetup status' will yield information on the state and health of the array. The output is as follows: 1: raid \ 2: <#devices> <1 health char for each dev> Line 1 is standard DM output. Line 2 is best shown by example: 0 1960893648 raid raid4 5 AAAAA 2/490221568 Here we can see the RAID type is raid4, there are 5 devices - all of which are 'A'live, and the array is 2/490221568 complete with recovery. Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Jonathan Brassow Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/dm-raid.txt | 70 +++ drivers/md/Kconfig | 24 + drivers/md/Makefile | 1 + drivers/md/dm-raid.c | 697 ++++++++++++++++++++++++ 4 files changed, 792 insertions(+) create mode 100644 Documentation/device-mapper/dm-raid.txt create mode 100644 drivers/md/dm-raid.c diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt new file mode 100644 index 000000000000..33b6b7071ac8 --- /dev/null +++ b/Documentation/device-mapper/dm-raid.txt @@ -0,0 +1,70 @@ +Device-mapper RAID (dm-raid) is a bridge from DM to MD. It +provides a way to use device-mapper interfaces to access the MD RAID +drivers. + +As with all device-mapper targets, the nominal public interfaces are the +constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO +and STATUSTYPE_TABLE). The CTR table looks like the following: + +1: raid \ +2: <#raid_params> \ +3: <#raid_devs> .. + +Line 1 contains the standard first three arguments to any device-mapper +target - the start, length, and target type fields. The target type in +this case is "raid". + +Line 2 contains the arguments that define the particular raid +type/personality/level, the required arguments for that raid type, and +any optional arguments. Possible raid types include: raid4, raid5_la, +raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (raid1 is +planned for the future.) The list of required and optional parameters +is the same for all the current raid types. The required parameters are +positional, while the optional parameters are given as key/value pairs. +The possible parameters are as follows: + Chunk size in sectors. + [[no]sync] Force/Prevent RAID initialization + [rebuild ] Rebuild the drive indicated by the index + [daemon_sleep ] Time between bitmap daemon work to clear bits + [min_recovery_rate ] Throttle RAID initialization + [max_recovery_rate ] Throttle RAID initialization + [max_write_behind ] See '-write-behind=' (man mdadm) + [stripe_cache ] Stripe cache size for higher RAIDs + +Line 3 contains the list of devices that compose the array in +metadata/data device pairs. If the metadata is stored separately, a '-' +is given for the metadata device position. If a drive has failed or is +missing at creation time, a '-' can be given for both the metadata and +data drives for a given position. + +NB. 
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
new file mode 100644
index 000000000000..33b6b7071ac8
--- /dev/null
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -0,0 +1,70 @@
+Device-mapper RAID (dm-raid) is a bridge from DM to MD.  It
+provides a way to use device-mapper interfaces to access the MD RAID
+drivers.
+
+As with all device-mapper targets, the nominal public interfaces are the
+constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
+and STATUSTYPE_TABLE).  The CTR table looks like the following:
+
+1: <start> <length> raid \
+2:	<raid_type> <#raid_params> <raid_params> \
+3:	<#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
+
+Line 1 contains the standard first three arguments to any device-mapper
+target - the start, length, and target type fields.  The target type in
+this case is "raid".
+
+Line 2 contains the arguments that define the particular raid
+type/personality/level, the required arguments for that raid type, and
+any optional arguments.  Possible raid types include: raid4, raid5_la,
+raid5_ra, raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc.  (raid1
+is planned for the future.)  The list of required and optional
+parameters is the same for all the current raid types.  The required
+parameters are positional, while the optional parameters are given as
+key/value pairs.  The possible parameters are as follows:
+ <chunk_size>			Chunk size in sectors.
+ [[no]sync]			Force/Prevent RAID initialization
+ [rebuild <idx>]		Rebuild the drive indicated by the index
+ [daemon_sleep <ms>]		Time between bitmap daemon work to clear bits
+ [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ [max_write_behind <value>]		See '--write-behind=' (man mdadm)
+ [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
+
+Line 3 contains the list of devices that compose the array in
+metadata/data device pairs.  If the metadata is stored separately, a '-'
+is given for the metadata device position.  If a drive has failed or is
+missing at creation time, a '-' can be given for both the metadata and
+data drives for a given position.
+
+NB. Currently all metadata devices must be specified as '-'.
+
+Examples:
+# RAID4 - 4 data drives, 1 parity
+# No metadata devices specified to hold superblock/bitmap info
+# Chunk size of 1MiB
+# (Lines separated for easy reading)
+0 1960893648 raid \
+	raid4 1 2048 \
+	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# Chunk size of 1MiB, force RAID initialization,
+#	min recovery rate at 20 kiB/sec/disk
+0 1960893648 raid \
+	raid4 4 2048 min_recovery_rate 20 sync \
+	5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+
+Performing a 'dmsetup table' should display the CTR table used to
+construct the mapping (with possible reordering of optional
+parameters).
+
+Performing a 'dmsetup status' will yield information on the state and
+health of the array.  The output is as follows:
+1: <start> <length> raid \
+2:	<raid_type> <#devices> <1 health char for each dev> <resync_ratio>
+
+Line 1 is standard DM output.  Line 2 is best shown by example:
+	0 1960893648 raid raid4 5 AAAAA 2/490221568
+
+Here we can see the RAID type is raid4, there are 5 devices - all of
+which are 'A'live - and the array's recovery is 2/490221568 complete.
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e31559..98d9ec85e0eb 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR
 	  Allow volume managers to mirror logical volumes, also needed
 	  for live data migration tools such as 'pvmove'.
 
+config DM_RAID
+	tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	select MD_RAID456
+	select BLK_DEV_MD
+	---help---
+	  A dm target that supports RAID4, RAID5 and RAID6 mappings.
+
+	  A RAID-5 set of N drives with a capacity of C MB per drive provides
+	  the capacity of C * (N - 1) MB, and protects against a failure
+	  of a single drive.  For a given sector (row) number, (N - 1) drives
+	  contain data sectors, and one drive contains the parity protection.
+	  For a RAID-4 set, the parity blocks are present on a single drive,
+	  while a RAID-5 set distributes the parity across the drives in one
+	  of the available parity distribution methods.
+
+	  A RAID-6 set of N drives with a capacity of C MB per drive
+	  provides the capacity of C * (N - 2) MB, and protects
+	  against a failure of any two drives.  For a given sector
+	  (row) number, (N - 2) drives contain data sectors, and two
+	  drives contain two independent redundancy syndromes.  Like
+	  RAID-5, RAID-6 distributes the syndromes across the drives
+	  in one of the available parity distribution methods.
+
 config DM_LOG_USERSPACE
 	tristate "Mirror userspace logging (EXPERIMENTAL)"
 	depends on DM_MIRROR && EXPERIMENTAL && NET
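The capacity arithmetic in this help text reduces to a single line.  A
throwaway sketch, illustrative only and not part of the patch:

	/* Usable size of an N-drive set with per_drive sectors each:
	 * RAID-4/5 dedicate one drive's worth of space to parity,
	 * RAID-6 two. */
	static inline unsigned long long usable_sectors(unsigned n_drives,
						unsigned long long per_drive,
						unsigned parity_devs)
	{
		return (n_drives - parity_devs) * per_drive;
	}

For the five-device raid4 examples in this patch, n_drives = 5 and
parity_devs = 1, so four drives' worth of space is usable.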
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919d..d0138606c2e8 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_DM_RAID)		+= dm-raid.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 000000000000..b9e1e15ef11c
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (C) 2010-2011 Neil Brown
+ * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/slab.h>
+
+#include "md.h"
+#include "raid5.h"
+#include "dm.h"
+#include "bitmap.h"
+
+#define DM_MSG_PREFIX "raid"
+
+/*
+ * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
+ * make it so the flag doesn't set anything.
+ */
+#ifndef MD_SYNC_STATE_FORCED
+#define MD_SYNC_STATE_FORCED 0
+#endif
+
+struct raid_dev {
+	/*
+	 * Two DM devices, one to hold metadata and one to hold the
+	 * actual data/parity.  The reason for this is to not confuse
+	 * ti->len and give more flexibility in altering size and
+	 * characteristics.
+	 *
+	 * While it is possible for this device to be associated
+	 * with a different physical device than the data_dev, it
+	 * is intended for it to be the same.
+	 *    |--------- Physical Device ---------|
+	 *    |- meta_dev -|------ data_dev ------|
+	 */
+	struct dm_dev *meta_dev;
+	struct dm_dev *data_dev;
+	struct mdk_rdev_s rdev;
+};
+
+/*
+ * Flags for rs->print_flags field.
+ */
+#define DMPF_DAEMON_SLEEP      0x1
+#define DMPF_MAX_WRITE_BEHIND  0x2
+#define DMPF_SYNC              0x4
+#define DMPF_NOSYNC            0x8
+#define DMPF_STRIPE_CACHE      0x10
+#define DMPF_MIN_RECOVERY_RATE 0x20
+#define DMPF_MAX_RECOVERY_RATE 0x40
+
+struct raid_set {
+	struct dm_target *ti;
+
+	uint64_t print_flags;
+
+	struct mddev_s md;
+	struct raid_type *raid_type;
+	struct dm_target_callbacks callbacks;
+
+	struct raid_dev dev[0];
+};
+
+/* Supported raid types and properties. */
+static struct raid_type {
+	const char *name;		/* RAID algorithm. */
+	const char *descr;		/* Descriptor text for logging. */
+	const unsigned parity_devs;	/* # of parity devices. */
+	const unsigned minimal_devs;	/* minimal # of devices in set. */
+	const unsigned level;		/* RAID level. */
+	const unsigned algorithm;	/* RAID algorithm. */
+} raid_types[] = {
+	{"raid4",    "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
+	{"raid5_la", "RAID5 (left asymmetric)",       1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
+	{"raid5_ra", "RAID5 (right asymmetric)",      1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
+	{"raid5_ls", "RAID5 (left symmetric)",        1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
+	{"raid5_rs", "RAID5 (right symmetric)",       1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
+	{"raid6_zr", "RAID6 (zero restart)",          2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
+	{"raid6_nr", "RAID6 (N restart)",             2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
+	{"raid6_nc", "RAID6 (N continue)",            2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
+};
+
+static struct raid_type *get_raid_type(char *name)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(raid_types); i++)
+		if (!strcmp(raid_types[i].name, name))
+			return &raid_types[i];
+
+	return NULL;
+}
+
+static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
+{
+	unsigned i;
+	struct raid_set *rs;
+	sector_t sectors_per_dev;
+
+	if (raid_devs <= raid_type->parity_devs) {
+		ti->error = "Insufficient number of devices";
+		return ERR_PTR(-EINVAL);
+	}
+
+	sectors_per_dev = ti->len;
+	if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
+		ti->error = "Target length not divisible by number of data devices";
+		return ERR_PTR(-EINVAL);
+	}
+
+	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
+	if (!rs) {
+		ti->error = "Cannot allocate raid context";
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mddev_init(&rs->md);
+
+	rs->ti = ti;
+	rs->raid_type = raid_type;
+	rs->md.raid_disks = raid_devs;
+	rs->md.level = raid_type->level;
+	rs->md.new_level = rs->md.level;
+	rs->md.dev_sectors = sectors_per_dev;
+	rs->md.layout = raid_type->algorithm;
+	rs->md.new_layout = rs->md.layout;
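+	/*
+	 * No reshape is in progress (delta_disks = 0), and recovery_cp = 0
+	 * assumes the array is out-of-sync until parse_raid_params() sees
+	 * a 'nosync' argument.
+	 */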
+	rs->md.delta_disks = 0;
+	rs->md.recovery_cp = 0;
+
+	for (i = 0; i < raid_devs; i++)
+		md_rdev_init(&rs->dev[i].rdev);
+
+	/*
+	 * Remaining items to be initialized by further RAID params:
+	 *  rs->md.persistent
+	 *  rs->md.external
+	 *  rs->md.chunk_sectors
+	 *  rs->md.new_chunk_sectors
+	 */
+
+	return rs;
+}
+
+static void context_free(struct raid_set *rs)
+{
+	int i;
+
+	for (i = 0; i < rs->md.raid_disks; i++)
+		if (rs->dev[i].data_dev)
+			dm_put_device(rs->ti, rs->dev[i].data_dev);
+
+	kfree(rs);
+}
+
+/*
+ * For every device we have two words
+ *  <meta_dev>: meta device name or '-' if missing
+ *  <data_dev>: data device name or '-' if missing
+ *
+ * This code parses those words.
+ */
+static int dev_parms(struct raid_set *rs, char **argv)
+{
+	int i;
+	int rebuild = 0;
+	int metadata_available = 0;
+	int ret = 0;
+
+	for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
+		rs->dev[i].rdev.raid_disk = i;
+
+		rs->dev[i].meta_dev = NULL;
+		rs->dev[i].data_dev = NULL;
+
+		/*
+		 * There are no offsets, since there is a separate device
+		 * for data and metadata.
+		 */
+		rs->dev[i].rdev.data_offset = 0;
+		rs->dev[i].rdev.mddev = &rs->md;
+
+		if (strcmp(argv[0], "-")) {
+			rs->ti->error = "Metadata devices not supported";
+			return -EINVAL;
+		}
+
+		if (!strcmp(argv[1], "-")) {
+			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
+			    (!rs->dev[i].rdev.recovery_offset)) {
+				rs->ti->error = "Drive designated for rebuild not specified";
+				return -EINVAL;
+			}
+
+			continue;
+		}
+
+		ret = dm_get_device(rs->ti, argv[1],
+				    dm_table_get_mode(rs->ti->table),
+				    &rs->dev[i].data_dev);
+		if (ret) {
+			rs->ti->error = "RAID device lookup failure";
+			return ret;
+		}
+
+		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
+		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
+		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+			rebuild++;
+	}
+
+	/* Never true yet: metadata devices are rejected above. */
+	if (metadata_available) {
+		rs->md.external = 0;
+		rs->md.persistent = 1;
+		rs->md.major_version = 2;
+	} else if (rebuild && !rs->md.recovery_cp) {
+		/*
+		 * Without metadata, we will not be able to tell if the array
+		 * is in-sync or not - we must assume it is not.  Therefore,
+		 * it is impossible to rebuild a drive.
+		 *
+		 * Even if there is metadata, the on-disk information may
+		 * indicate that the array is not in-sync and it will then
+		 * fail at that time.
+		 *
+		 * User could specify 'nosync' option if desperate.
+		 */
+		DMERR("Unable to rebuild drive while array is not in-sync");
+		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Possible arguments are...
+ * RAID456:
+ *	<chunk_size> [optional_args]
+ *
+ * Optional args:
+ *    [[no]sync]			Force or prevent recovery of the
+ *					entire array
+ *    [rebuild <idx>]			Rebuild the drive indicated by the index
+ *    [daemon_sleep <ms>]		Time between bitmap daemon work to clear bits
+ *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ *    [max_write_behind <value>]	See '--write-behind=' (man mdadm)
+ *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
+ */
+static int parse_raid_params(struct raid_set *rs, char **argv,
+			     unsigned num_raid_params)
+{
+	unsigned i, rebuild_cnt = 0;
+	unsigned long value;
+	char *key;
+
+	/*
+	 * First, parse the in-order required arguments
+	 */
+	if ((strict_strtoul(argv[0], 10, &value) < 0) ||
+	    !is_power_of_2(value) || (value < 8)) {
+		rs->ti->error = "Bad chunk size";
+		return -EINVAL;
+	}
+
+	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
+	argv++;
+	num_raid_params--;
+
+	/*
+	 * Second, parse the unordered optional arguments.
+	 * Start by assuming every device is in-sync; a 'rebuild'
+	 * argument below clears the flag for the given index.
+	 */
+	for (i = 0; i < rs->md.raid_disks; i++)
+		set_bit(In_sync, &rs->dev[i].rdev.flags);
+
+	for (i = 0; i < num_raid_params; i++) {
+		if (!strcmp(argv[i], "nosync")) {
+			rs->md.recovery_cp = MaxSector;
+			rs->print_flags |= DMPF_NOSYNC;
+			rs->md.flags |= MD_SYNC_STATE_FORCED;
+			continue;
+		}
+		if (!strcmp(argv[i], "sync")) {
+			rs->md.recovery_cp = 0;
+			rs->print_flags |= DMPF_SYNC;
+			rs->md.flags |= MD_SYNC_STATE_FORCED;
+			continue;
+		}
+
+		/* The rest of the optional arguments come in key/value pairs */
+		if ((i + 1) >= num_raid_params) {
+			rs->ti->error = "Wrong number of raid parameters given";
+			return -EINVAL;
+		}
+
+		key = argv[i++];
+		if (strict_strtoul(argv[i], 10, &value) < 0) {
+			rs->ti->error = "Bad numerical argument given in raid params";
+			return -EINVAL;
+		}
+
+		if (!strcmp(key, "rebuild")) {
+			if (++rebuild_cnt > rs->raid_type->parity_devs) {
+				rs->ti->error = "Too many rebuild drives given";
+				return -EINVAL;
+			}
+			if (value >= rs->md.raid_disks) {
+				rs->ti->error = "Invalid rebuild index given";
+				return -EINVAL;
+			}
+			clear_bit(In_sync, &rs->dev[value].rdev.flags);
+			rs->dev[value].rdev.recovery_offset = 0;
+		} else if (!strcmp(key, "max_write_behind")) {
+			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+
+			/*
+			 * In device-mapper, we specify things in sectors, but
+			 * MD records this value in kB
+			 */
+			value /= 2;
+			if (value > COUNTER_MAX) {
+				rs->ti->error = "Max write-behind limit out of range";
+				return -EINVAL;
+			}
+			rs->md.bitmap_info.max_write_behind = value;
+		} else if (!strcmp(key, "daemon_sleep")) {
+			rs->print_flags |= DMPF_DAEMON_SLEEP;
+			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
+				rs->ti->error = "daemon sleep period out of range";
+				return -EINVAL;
+			}
+			rs->md.bitmap_info.daemon_sleep = value;
+		} else if (!strcmp(key, "stripe_cache")) {
+			rs->print_flags |= DMPF_STRIPE_CACHE;
+
+			/*
+			 * In device-mapper, we specify things in sectors, but
+			 * MD records this value in kB
+			 */
+			value /= 2;
+
+			if (rs->raid_type->level < 5) {
+				rs->ti->error = "Inappropriate argument: stripe_cache";
+				return -EINVAL;
+			}
+			if (raid5_set_cache_size(&rs->md, (int)value)) {
+				rs->ti->error = "Bad stripe_cache size";
+				return -EINVAL;
+			}
+		} else if (!strcmp(key, "min_recovery_rate")) {
+			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+			if (value > INT_MAX) {
+				rs->ti->error = "min_recovery_rate out of range";
+				return -EINVAL;
+			}
+			rs->md.sync_speed_min = (int)value;
+		} else if (!strcmp(key, "max_recovery_rate")) {
+			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+			if (value > INT_MAX) {
+				rs->ti->error = "max_recovery_rate out of range";
+				return -EINVAL;
+			}
+			rs->md.sync_speed_max = (int)value;
+		} else {
+			DMERR("Unable to parse RAID parameter: %s", key);
+			rs->ti->error = "Unable to parse RAID parameters";
+			return -EINVAL;
+		}
+	}
+
+	/* Assume there are no metadata devices until the drives are parsed */
+	rs->md.persistent = 0;
+	rs->md.external = 1;
+
+	return 0;
+}
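+
+/*
+ * Example of a complete table line exercising the optional parameters
+ * parsed above (the device numbers are hypothetical):
+ *
+ *   0 125829120 raid raid6_zr 6 128 nosync stripe_cache 2048 \
+ *       max_write_behind 512 4 - 8:16 - 8:32 - 8:48 - 8:64
+ */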
"max_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_max = (int)value; + } else { + DMERR("Unable to parse RAID parameter: %s", key); + rs->ti->error = "Unable to parse RAID parameters"; + return -EINVAL; + } + } + + /* Assume there are no metadata devices until the drives are parsed */ + rs->md.persistent = 0; + rs->md.external = 1; + + return 0; +} + +static void do_table_event(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); + + dm_table_event(rs->ti->table); +} + +static int raid_is_congested(struct dm_target_callbacks *cb, int bits) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + return md_raid5_congested(&rs->md, bits); +} + +static void raid_unplug(struct dm_target_callbacks *cb) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + md_raid5_unplug_device(rs->md.private); +} + +/* + * Construct a RAID4/5/6 mapping: + * Args: + * <#raid_params> \ + * <#raid_devs> { .. } + * + * ** metadata devices are not supported yet, use '-' instead ** + * + * varies by . See 'parse_raid_params' for + * details on possible . + */ +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int ret; + struct raid_type *rt; + unsigned long num_raid_params, num_raid_devs; + struct raid_set *rs = NULL; + + /* Must have at least <#raid_params> */ + if (argc < 2) { + ti->error = "Too few arguments"; + return -EINVAL; + } + + /* raid type */ + rt = get_raid_type(argv[0]); + if (!rt) { + ti->error = "Unrecognised raid_type"; + return -EINVAL; + } + argc--; + argv++; + + /* number of RAID parameters */ + if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) { + ti->error = "Cannot understand number of RAID parameters"; + return -EINVAL; + } + argc--; + argv++; + + /* Skip over RAID params for now and find out # of devices */ + if (num_raid_params + 1 > argc) { + ti->error = "Arguments do not agree with counts given"; + return -EINVAL; + } + + if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || + (num_raid_devs >= INT_MAX)) { + ti->error = "Cannot understand number of raid devices"; + return -EINVAL; + } + + rs = context_alloc(ti, rt, (unsigned)num_raid_devs); + if (IS_ERR(rs)) + return PTR_ERR(rs); + + ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); + if (ret) + goto bad; + + ret = -EINVAL; + + argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ + argv += num_raid_params + 1; + + if (argc != (num_raid_devs * 2)) { + ti->error = "Supplied RAID devices does not match the count given"; + goto bad; + } + + ret = dev_parms(rs, argv); + if (ret) + goto bad; + + INIT_WORK(&rs->md.event_work, do_table_event); + ti->split_io = rs->md.chunk_sectors; + ti->private = rs; + + mutex_lock(&rs->md.reconfig_mutex); + ret = md_run(&rs->md); + rs->md.in_sync = 0; /* Assume already marked dirty */ + mutex_unlock(&rs->md.reconfig_mutex); + + if (ret) { + ti->error = "Fail to run raid array"; + goto bad; + } + + rs->callbacks.congested_fn = raid_is_congested; + rs->callbacks.unplug_fn = raid_unplug; + dm_table_add_target_callbacks(ti->table, &rs->callbacks); + + return 0; + +bad: + context_free(rs); + + return ret; +} + +static void raid_dtr(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + list_del_init(&rs->callbacks.list); + md_stop(&rs->md); + context_free(rs); +} + +static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) +{ + struct raid_set *rs = ti->private; + mddev_t *mddev = 
+
+static int raid_status(struct dm_target *ti, status_type_t type,
+		       char *result, unsigned maxlen)
+{
+	struct raid_set *rs = ti->private;
+	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
+	unsigned sz = 0;
+	int i;
+	sector_t sync;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
+
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if (test_bit(Faulty, &rs->dev[i].rdev.flags))
+				DMEMIT("D");
+			else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
+				DMEMIT("A");
+			else
+				DMEMIT("a");
+		}
+
+		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+			sync = rs->md.curr_resync_completed;
+		else
+			sync = rs->md.recovery_cp;
+
+		if (sync > rs->md.resync_max_sectors)
+			sync = rs->md.resync_max_sectors;
+
+		DMEMIT(" %llu/%llu",
+		       (unsigned long long) sync,
+		       (unsigned long long) rs->md.resync_max_sectors);
+
+		break;
+	case STATUSTYPE_TABLE:
+		/* The string you would use to construct this array */
+		for (i = 0; i < rs->md.raid_disks; i++)
+			if (rs->dev[i].data_dev &&
+			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
+				raid_param_cnt++; /* for rebuilds */
+
+		/*
+		 * Each print flag is a key/value pair except 'sync' and
+		 * 'nosync', which are single words (at most one of the
+		 * two can be set).
+		 */
+		raid_param_cnt += (hweight64(rs->print_flags) * 2);
+		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+			raid_param_cnt--;
+
+		DMEMIT("%s %u %u", rs->raid_type->name,
+		       raid_param_cnt, rs->md.chunk_sectors);
+
+		if ((rs->print_flags & DMPF_SYNC) &&
+		    (rs->md.recovery_cp == MaxSector))
+			DMEMIT(" sync");
+		if (rs->print_flags & DMPF_NOSYNC)
+			DMEMIT(" nosync");
+
+		for (i = 0; i < rs->md.raid_disks; i++)
+			if (rs->dev[i].data_dev &&
+			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
+				DMEMIT(" rebuild %u", i);
+
+		if (rs->print_flags & DMPF_DAEMON_SLEEP)
+			DMEMIT(" daemon_sleep %lu",
+			       rs->md.bitmap_info.daemon_sleep);
+
+		if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
+
+		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
+
+		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+			DMEMIT(" max_write_behind %lu",
+			       rs->md.bitmap_info.max_write_behind);
+
+		if (rs->print_flags & DMPF_STRIPE_CACHE) {
+			raid5_conf_t *conf = rs->md.private;
+
+			/* convert from kiB to sectors */
+			DMEMIT(" stripe_cache %d",
+			       conf ? conf->max_nr_stripes * 2 : 0);
+		}
+
+		DMEMIT(" %d", rs->md.raid_disks);
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			DMEMIT(" -"); /* metadata device */
+
+			if (rs->dev[i].data_dev)
+				DMEMIT(" %s", rs->dev[i].data_dev->name);
+			else
+				DMEMIT(" -");
+		}
+	}
+
+	return 0;
+}
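+
+/*
+ * Sample STATUSTYPE_INFO output for a healthy five-device raid4 array
+ * early in its resync, matching the dm-raid.txt example:
+ *
+ *   raid4 5 AAAAA 2/490221568
+ */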
+
+static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+{
+	struct raid_set *rs = ti->private;
+	unsigned i;
+	int ret = 0;
+
+	for (i = 0; !ret && i < rs->md.raid_disks; i++)
+		if (rs->dev[i].data_dev)
+			ret = fn(ti,
+				 rs->dev[i].data_dev,
+				 0, /* No offset on data devs */
+				 rs->md.dev_sectors,
+				 data);
+
+	return ret;
+}
+
+static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct raid_set *rs = ti->private;
+	unsigned chunk_size = rs->md.chunk_sectors << 9;
+	raid5_conf_t *conf = rs->md.private;
+
+	blk_limits_io_min(limits, chunk_size);
+	blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
+}
+
+static void raid_presuspend(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+
+	md_stop_writes(&rs->md);
+}
+
+static void raid_postsuspend(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+
+	mddev_suspend(&rs->md);
+}
+
+static void raid_resume(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+
+	mddev_resume(&rs->md);
+}
+
+static struct target_type raid_target = {
+	.name = "raid",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = raid_ctr,
+	.dtr = raid_dtr,
+	.map = raid_map,
+	.status = raid_status,
+	.iterate_devices = raid_iterate_devices,
+	.io_hints = raid_io_hints,
+	.presuspend = raid_presuspend,
+	.postsuspend = raid_postsuspend,
+	.resume = raid_resume,
+};
+
+static int __init dm_raid_init(void)
+{
+	return dm_register_target(&raid_target);
+}
+
+static void __exit dm_raid_exit(void)
+{
+	dm_unregister_target(&raid_target);
+}
+
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
+MODULE_ALIAS("dm-raid6");
+MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");