From 49fd524f95cb4cc699d435e0ebb08b1c6220da6d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 16 Apr 2014 10:57:18 -0600
Subject: [PATCH 01/54] bsg: update check for rq based driver for blk-mq

bsg currently checks ->request_fn to check whether a queue can
handle struct request. But with blk-mq, we don't have a request_fn
yet are request based. Add a queue_is_rq_based() helper and use
that in bsg, I'm guessing this is not the last place we need to
update for this. Besides, it better explains what is being
checked.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bsg.c            | 2 +-
 include/linux/blkdev.h | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/block/bsg.c b/block/bsg.c
index 420a5a9f1b23..e5214c148096 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -1008,7 +1008,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 	/*
 	 * we need a proper transport to send commands, not a stacked device
 	 */
-	if (!q->request_fn)
+	if (!queue_is_rq_based(q))
 		return 0;
 
 	bcd = &q->bsg_dev;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 20b26d4e53a2..74ee55fefcf0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -612,6 +612,15 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 
 #define rq_data_dir(rq)		(((rq)->cmd_flags & 1) != 0)
 
+/*
+ * Driver can handle struct request, if it either has an old style
+ * request_fn defined, or is blk-mq based.
+ */
+static inline bool queue_is_rq_based(struct request_queue *q)
+{
+	return q->request_fn || q->mq_ops;
+}
+
 static inline unsigned int blk_queue_cluster(struct request_queue *q)
 {
 	return q->limits.cluster;

From 01aad3f0de67281f099e253b404fe08db2814c28 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Mon, 21 Apr 2014 17:19:31 +0200
Subject: [PATCH 02/54] skd: Use pci_enable_msix_exact() instead of
 pci_enable_msix_range()

Function pci_enable_msix_exact() is a variation of
pci_enable_msix_range() that allows a device driver
to request a particular number of MSI-X interrupts,
rather than any number within a specified range.

Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Cc: Kyungmin Park <kyungmin.park@samsung.com>
Cc: linux-pci@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/skd_main.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 36bcedfd930c..1dcf9067cffa 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -3944,15 +3944,14 @@ static int skd_acquire_msix(struct skd_device *skdev)
 	for (i = 0; i < SKD_MAX_MSIX_COUNT; i++)
 		entries[i].entry = i;
 
-	rc = pci_enable_msix_range(pdev, entries,
-				   SKD_MIN_MSIX_COUNT, SKD_MAX_MSIX_COUNT);
-	if (rc < 0) {
+	rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT);
+	if (rc) {
 		pr_err("(%s): failed to enable MSI-X %d\n",
 		       skd_name(skdev), rc);
 		goto msix_out;
 	}
 
-	skdev->msix_count = rc;
+	skdev->msix_count = SKD_MAX_MSIX_COUNT;
 	skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) *
 				      skdev->msix_count, GFP_KERNEL);
 	if (!skdev->msix_entries) {

From 24cddb83b4ae3a601f807e96cde9bf24fb90e5f5 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Mon, 21 Apr 2014 17:19:32 +0200
Subject: [PATCH 03/54] cciss: Use pci_enable_msix_exact() instead of
 pci_enable_msix()

As result of deprecation of MSI-X/MSI enablement functions
pci_enable_msix() and pci_enable_msi_block() all drivers
using these two interfaces need to be updated to use the
new pci_enable_msi_range()  or pci_enable_msi_exact()
and pci_enable_msix_range() or pci_enable_msix_exact()
interfaces.

Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Cc: Mike Miller <mike.miller@hp.com>
Cc: iss_storagedev@hp.com
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-pci@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/cciss.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 73894ca33956..4595c22f33f7 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -4080,7 +4080,7 @@ static void cciss_interrupt_mode(ctlr_info_t *h)
 		goto default_int_mode;
 
 	if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) {
-		err = pci_enable_msix(h->pdev, cciss_msix_entries, 4);
+		err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4);
 		if (!err) {
 			h->intr[0] = cciss_msix_entries[0].vector;
 			h->intr[1] = cciss_msix_entries[1].vector;
@@ -4088,10 +4088,6 @@ static void cciss_interrupt_mode(ctlr_info_t *h)
 			h->intr[3] = cciss_msix_entries[3].vector;
 			h->msix_vector = 1;
 			return;
-		}
-		if (err > 0) {
-			dev_warn(&h->pdev->dev,
-				"only %d MSI-X vectors available\n", err);
 		} else {
 			dev_warn(&h->pdev->dev,
 				"MSI-X init failed %d\n", err);

From 670a641420a3d9586eebe7429dfeec4e7ed447aa Mon Sep 17 00:00:00 2001
From: Asai Thambi S P <asamymuthupa@micron.com>
Date: Wed, 16 Apr 2014 20:27:54 -0700
Subject: [PATCH 04/54] mtip32xx: Increase timeout for STANDBY IMMEDIATE
 command

Increased timeout for STANDBY IMMEDIATE command to 2 minutes.

Signed-off-by: Selvan Mani <smani@micron.com>
Signed-off-by: Asai Thambi S P <asamymuthupa@micron.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 66 ++++++++++++++++---------------
 1 file changed, 35 insertions(+), 31 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 59c5abe32f06..51628eb6f445 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -1529,6 +1529,37 @@ static inline void ata_swap_string(u16 *buf, unsigned int len)
 		be16_to_cpus(&buf[i]);
 }
 
+static void mtip_set_timeout(struct driver_data *dd,
+					struct host_to_dev_fis *fis,
+					unsigned int *timeout, u8 erasemode)
+{
+	switch (fis->command) {
+	case ATA_CMD_DOWNLOAD_MICRO:
+		*timeout = 120000; /* 2 minutes */
+		break;
+	case ATA_CMD_SEC_ERASE_UNIT:
+	case 0xFC:
+		if (erasemode)
+			*timeout = ((*(dd->port->identify + 90) * 2) * 60000);
+		else
+			*timeout = ((*(dd->port->identify + 89) * 2) * 60000);
+		break;
+	case ATA_CMD_STANDBYNOW1:
+		*timeout = 120000;  /* 2 minutes */
+		break;
+	case 0xF7:
+	case 0xFA:
+		*timeout = 60000;  /* 60 seconds */
+		break;
+	case ATA_CMD_SMART:
+		*timeout = 15000;  /* 15 seconds */
+		break;
+	default:
+		*timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+		break;
+	}
+}
+
 /*
  * Request the device identity information.
  *
@@ -1644,6 +1675,7 @@ static int mtip_standby_immediate(struct mtip_port *port)
 	int rv;
 	struct host_to_dev_fis	fis;
 	unsigned long start;
+	unsigned int timeout;
 
 	/* Build the FIS. */
 	memset(&fis, 0, sizeof(struct host_to_dev_fis));
@@ -1651,6 +1683,8 @@ static int mtip_standby_immediate(struct mtip_port *port)
 	fis.opts	= 1 << 7;
 	fis.command	= ATA_CMD_STANDBYNOW1;
 
+	mtip_set_timeout(port->dd, &fis, &timeout, 0);
+
 	start = jiffies;
 	rv = mtip_exec_internal_command(port,
 					&fis,
@@ -1659,7 +1693,7 @@ static int mtip_standby_immediate(struct mtip_port *port)
 					0,
 					0,
 					GFP_ATOMIC,
-					15000);
+					timeout);
 	dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n",
 			jiffies_to_msecs(jiffies - start));
 	if (rv)
@@ -2202,36 +2236,6 @@ static unsigned int implicit_sector(unsigned char command,
 	}
 	return rv;
 }
-static void mtip_set_timeout(struct driver_data *dd,
-					struct host_to_dev_fis *fis,
-					unsigned int *timeout, u8 erasemode)
-{
-	switch (fis->command) {
-	case ATA_CMD_DOWNLOAD_MICRO:
-		*timeout = 120000; /* 2 minutes */
-		break;
-	case ATA_CMD_SEC_ERASE_UNIT:
-	case 0xFC:
-		if (erasemode)
-			*timeout = ((*(dd->port->identify + 90) * 2) * 60000);
-		else
-			*timeout = ((*(dd->port->identify + 89) * 2) * 60000);
-		break;
-	case ATA_CMD_STANDBYNOW1:
-		*timeout = 120000;  /* 2 minutes */
-		break;
-	case 0xF7:
-	case 0xFA:
-		*timeout = 60000;  /* 60 seconds */
-		break;
-	case ATA_CMD_SMART:
-		*timeout = 15000;  /* 15 seconds */
-		break;
-	default:
-		*timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
-		break;
-	}
-}
 
 /*
  * Executes a taskfile

From af5ded8ccf21627f9614afc03b356712666ed225 Mon Sep 17 00:00:00 2001
From: Asai Thambi S P <asamymuthupa@micron.com>
Date: Wed, 16 Apr 2014 20:30:16 -0700
Subject: [PATCH 05/54] mtip32xx: Remove dfs_parent after pci unregister

In module exit, dfs_parent and it's subtree were removed before
unregistering with pci. When debugfs entry for each device is attempted
to remove in pci_remove() context, they don't exist, as dfs_parent and
its children were already ripped apart.

Modified to first unregister with pci and then remove dfs_parent.

Signed-off-by: Asai Thambi S P <asamymuthupa@micron.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 51628eb6f445..27641bc83962 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4939,13 +4939,13 @@ static int __init mtip_init(void)
  */
 static void __exit mtip_exit(void)
 {
-	debugfs_remove_recursive(dfs_parent);
-
 	/* Release the allocated major block device number. */
 	unregister_blkdev(mtip_major, MTIP_DRV_NAME);
 
 	/* Unregister the PCI driver. */
 	pci_unregister_driver(&mtip_pci_driver);
+
+	debugfs_remove_recursive(dfs_parent);
 }
 
 MODULE_AUTHOR("Micron Technology, Inc");

From d1e714db8129a1d3670e449b87719c78e2c76f9f Mon Sep 17 00:00:00 2001
From: Asai Thambi S P <asamymuthupa@micron.com>
Date: Thu, 13 Mar 2014 18:45:15 -0700
Subject: [PATCH 06/54] mtip32xx: Fix ERO and NoSnoop values in PCIe upstream
 on AMD systems

A hardware quirk in P320h/P420m interfere with PCIe transactions on some
AMD chipsets, making P320h/P420m unusable. This workaround is to disable
ERO and NoSnoop bits in the parent and root complex for normal
functioning of these devices

NOTE: This workaround is specific to AMD chipset with a PCIe upstream
device with device id 0x5aXX

Signed-off-by: Asai Thambi S P <asamymuthupa@micron.com>
Signed-off-by: Sam Bradshaw <sbradshaw@micron.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 53 +++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 27641bc83962..fb624469d0ee 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4483,6 +4483,57 @@ static DEFINE_HANDLER(5);
 static DEFINE_HANDLER(6);
 static DEFINE_HANDLER(7);
 
+static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
+{
+	int pos;
+	unsigned short pcie_dev_ctrl;
+
+	pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+	if (pos) {
+		pci_read_config_word(pdev,
+			pos + PCI_EXP_DEVCTL,
+			&pcie_dev_ctrl);
+		if (pcie_dev_ctrl & (1 << 11) ||
+		    pcie_dev_ctrl & (1 << 4)) {
+			dev_info(&dd->pdev->dev,
+				"Disabling ERO/No-Snoop on bridge device %04x:%04x\n",
+					pdev->vendor, pdev->device);
+			pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN |
+						PCI_EXP_DEVCTL_RELAX_EN);
+			pci_write_config_word(pdev,
+				pos + PCI_EXP_DEVCTL,
+				pcie_dev_ctrl);
+		}
+	}
+}
+
+static void mtip_fix_ero_nosnoop(struct driver_data *dd, struct pci_dev *pdev)
+{
+	/*
+	 * This workaround is specific to AMD/ATI chipset with a PCI upstream
+	 * device with device id 0x5aXX
+	 */
+	if (pdev->bus && pdev->bus->self) {
+		if (pdev->bus->self->vendor == PCI_VENDOR_ID_ATI &&
+		    ((pdev->bus->self->device & 0xff00) == 0x5a00)) {
+			mtip_disable_link_opts(dd, pdev->bus->self);
+		} else {
+			/* Check further up the topology */
+			struct pci_dev *parent_dev = pdev->bus->self;
+			if (parent_dev->bus &&
+				parent_dev->bus->parent &&
+				parent_dev->bus->parent->self &&
+				parent_dev->bus->parent->self->vendor ==
+					 PCI_VENDOR_ID_ATI &&
+				(parent_dev->bus->parent->self->device &
+					0xff00) == 0x5a00) {
+				mtip_disable_link_opts(dd,
+					parent_dev->bus->parent->self);
+			}
+		}
+	}
+}
+
 /*
  * Called for each supported PCI device detected.
  *
@@ -4634,6 +4685,8 @@ static int mtip_pci_probe(struct pci_dev *pdev,
 		goto msi_initialize_err;
 	}
 
+	mtip_fix_ero_nosnoop(dd, pdev);
+
 	/* Initialize the block layer. */
 	rv = mtip_block_initialize(dd);
 	if (rv < 0) {

From 31007745a5f328b8d70d865c4a6118be01421b8c Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 28 Apr 2014 18:43:12 +0200
Subject: [PATCH 07/54] drbd: Break a deadlock while concurrent fencing and
 establishing a connection

When we need to outdate the peer while being promoted to primary,
and the connection gets established at the same time, we deadlock
in drbd_try_outdate_peer() when trying to clear the susp_fen
bit.

Fix this by setting the STATE_SENT bit while holding the mutex.

Using drbd_change_state(.. , CS_HARD, ..) which does not block
until STATE_SENT is cleared, is only for clearness. It does
not contribute anything to the fix.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_receiver.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 68e3992e8838..125c9e89388f 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1026,24 +1026,27 @@ randomize:
 	if (drbd_send_protocol(connection) == -EOPNOTSUPP)
 		return -1;
 
+	/* Prevent a race between resync-handshake and
+	 * being promoted to Primary.
+	 *
+	 * Grab and release the state mutex, so we know that any current
+	 * drbd_set_role() is finished, and any incoming drbd_set_role
+	 * will see the STATE_SENT flag, and wait for it to be cleared.
+	 */
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		mutex_lock(peer_device->device->state_mutex);
+
 	set_bit(STATE_SENT, &connection->flags);
 
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		mutex_unlock(peer_device->device->state_mutex);
+
 	rcu_read_lock();
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 		struct drbd_device *device = peer_device->device;
 		kref_get(&device->kref);
 		rcu_read_unlock();
 
-		/* Prevent a race between resync-handshake and
-		 * being promoted to Primary.
-		 *
-		 * Grab and release the state mutex, so we know that any current
-		 * drbd_set_role() is finished, and any incoming drbd_set_role
-		 * will see the STATE_SENT flag, and wait for it to be cleared.
-		 */
-		mutex_lock(device->state_mutex);
-		mutex_unlock(device->state_mutex);
-
 		if (discard_my_data)
 			set_bit(DISCARD_MY_DATA, &device->flags);
 		else

From d7fe69c6a1940c3f9c5bed08634b6dd868612cdf Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 28 Apr 2014 18:43:13 +0200
Subject: [PATCH 08/54] drbd: Leave IO suspended if the fence handler find the
 peer primary

Actually we are clearing the susp_fen flag if we are not going
to call a fencing handler.

For setting the susp_fen flag needs to be edge-triggerd, and not
level triggered.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c    |  8 ++++++++
 drivers/block/drbd/drbd_state.c | 24 +++++++++++++-----------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 526414bc2cab..5ccbd7164c78 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -426,6 +426,14 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec
 	}
 	rcu_read_unlock();
 
+	if (fp == FP_NOT_AVAIL) {
+		/* IO Suspending works on the whole resource.
+		   Do it only for one device. */
+		vnr = 0;
+		peer_device = idr_get_next(&connection->peer_devices, &vnr);
+		drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
+	}
+
 	return fp;
 }
 
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 1a84345a3868..379666246569 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -54,8 +54,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
 static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
 static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
-static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns,
-				       enum sanitize_state_warnings *warn);
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn);
 
 static inline bool is_susp(union drbd_state s)
 {
@@ -287,7 +287,7 @@ _req_st_cond(struct drbd_device *device, union drbd_state mask,
 
 	spin_lock_irqsave(&device->resource->req_lock, flags);
 	os = drbd_read_state(device);
-	ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL);
+	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
 	rv = is_valid_transition(os, ns);
 	if (rv >= SS_SUCCESS)
 		rv = SS_UNKNOWN_ERROR;  /* cont waiting, otherwise fail. */
@@ -333,7 +333,7 @@ drbd_req_state(struct drbd_device *device, union drbd_state mask,
 
 	spin_lock_irqsave(&device->resource->req_lock, flags);
 	os = drbd_read_state(device);
-	ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL);
+	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
 	rv = is_valid_transition(os, ns);
 	if (rv < SS_SUCCESS) {
 		spin_unlock_irqrestore(&device->resource->req_lock, flags);
@@ -740,8 +740,8 @@ static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_st
  * When we loose connection, we have to set the state of the peers disk (pdsk)
  * to D_UNKNOWN. This rule and many more along those lines are in this function.
  */
-static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns,
-				       enum sanitize_state_warnings *warn)
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn)
 {
 	enum drbd_fencing_p fp;
 	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
@@ -882,11 +882,13 @@ static union drbd_state sanitize_state(struct drbd_device *device, union drbd_st
 	}
 
 	if (fp == FP_STONITH &&
-	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED))
+	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
 		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
 
 	if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO &&
-	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
 		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
 
 	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
@@ -958,7 +960,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
 
 	os = drbd_read_state(device);
 
-	ns = sanitize_state(device, ns, &ssw);
+	ns = sanitize_state(device, os, ns, &ssw);
 	if (ns.i == os.i)
 		return SS_NOTHING_TO_DO;
 
@@ -1656,7 +1658,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 		struct drbd_device *device = peer_device->device;
 		os = drbd_read_state(device);
-		ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL);
+		ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
 
 		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
 			ns.disk = os.disk;
@@ -1718,7 +1720,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
 		number_of_volumes++;
 		os = drbd_read_state(device);
 		ns = apply_mask_val(os, mask, val);
-		ns = sanitize_state(device, ns, NULL);
+		ns = sanitize_state(device, os, ns, NULL);
 
 		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
 			ns.disk = os.disk;

From d40e567149c7ac250344d1537261c87b2c3e852c Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 28 Apr 2014 18:43:14 +0200
Subject: [PATCH 09/54] drbd: Remove drbd_wrappers.h

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_actlog.c   |  1 -
 drivers/block/drbd/drbd_int.h      | 35 +++++++++++++++++++
 drivers/block/drbd/drbd_nl.c       |  1 -
 drivers/block/drbd/drbd_nla.c      |  1 -
 drivers/block/drbd/drbd_req.h      |  1 -
 drivers/block/drbd/drbd_wrappers.h | 54 ------------------------------
 6 files changed, 35 insertions(+), 58 deletions(-)
 delete mode 100644 drivers/block/drbd/drbd_wrappers.h

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 90ae4ba8f9ee..7e7b0e143655 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -29,7 +29,6 @@
 #include <linux/drbd_limits.h>
 #include <linux/dynamic_debug.h>
 #include "drbd_int.h"
-#include "drbd_wrappers.h"
 
 
 enum al_transaction_types {
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e7093d4291f1..361a2e9cd727 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1283,6 +1283,10 @@ extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
 extern int drbd_khelper(struct drbd_device *device, char *cmd);
 
 /* drbd_worker.c */
+/* bi_end_io handlers */
+extern void drbd_md_io_complete(struct bio *bio, int error);
+extern void drbd_peer_request_endio(struct bio *bio, int error);
+extern void drbd_request_endio(struct bio *bio, int error);
 extern int drbd_worker(struct drbd_thread *thi);
 enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
 void drbd_resync_after_changed(struct drbd_device *device);
@@ -1401,6 +1405,37 @@ static inline void drbd_tcp_quickack(struct socket *sock)
 			(char*)&val, sizeof(val));
 }
 
+/* sets the number of 512 byte sectors of our virtual device */
+static inline void drbd_set_my_capacity(struct drbd_device *device,
+					sector_t size)
+{
+	/* set_capacity(device->this_bdev->bd_disk, size); */
+	set_capacity(device->vdisk, size);
+	device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
+}
+
+/*
+ * used to submit our private bio
+ */
+static inline void drbd_generic_make_request(struct drbd_device *device,
+					     int fault_type, struct bio *bio)
+{
+	__release(local);
+	if (!bio->bi_bdev) {
+		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
+				"bio->bi_bdev == NULL\n",
+		       device_to_minor(device));
+		dump_stack();
+		bio_endio(bio, -ENODEV);
+		return;
+	}
+
+	if (drbd_insert_fault(device, fault_type))
+		bio_endio(bio, -EIO);
+	else
+		generic_make_request(bio);
+}
+
 void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo);
 
 /* drbd_proc.c */
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 5ccbd7164c78..e891cb0a83c9 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -34,7 +34,6 @@
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h"
-#include "drbd_wrappers.h"
 #include <asm/unaligned.h>
 #include <linux/drbd_limits.h>
 #include <linux/kthread.h>
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
index fa672b6df8d6..b2d4791498a6 100644
--- a/drivers/block/drbd/drbd_nla.c
+++ b/drivers/block/drbd/drbd_nla.c
@@ -1,4 +1,3 @@
-#include "drbd_wrappers.h"
 #include <linux/kernel.h>
 #include <net/netlink.h>
 #include <linux/drbd_genl_api.h>
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index c684c963538e..07cc8e8a6e2a 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -30,7 +30,6 @@
 #include <linux/slab.h>
 #include <linux/drbd.h>
 #include "drbd_int.h"
-#include "drbd_wrappers.h"
 
 /* The request callbacks will be called in irq context by the IDE drivers,
    and in Softirqs/Tasklets/BH context by the SCSI drivers,
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
deleted file mode 100644
index 3db9ebaf64f6..000000000000
--- a/drivers/block/drbd/drbd_wrappers.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef _DRBD_WRAPPERS_H
-#define _DRBD_WRAPPERS_H
-
-#include <linux/ctype.h>
-#include <linux/mm.h>
-#include "drbd_int.h"
-
-/* see get_sb_bdev and bd_claim */
-extern char *drbd_sec_holder;
-
-/* sets the number of 512 byte sectors of our virtual device */
-static inline void drbd_set_my_capacity(struct drbd_device *device,
-					sector_t size)
-{
-	/* set_capacity(device->this_bdev->bd_disk, size); */
-	set_capacity(device->vdisk, size);
-	device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
-}
-
-#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
-
-/* bi_end_io handlers */
-extern void drbd_md_io_complete(struct bio *bio, int error);
-extern void drbd_peer_request_endio(struct bio *bio, int error);
-extern void drbd_request_endio(struct bio *bio, int error);
-
-/*
- * used to submit our private bio
- */
-static inline void drbd_generic_make_request(struct drbd_device *device,
-					     int fault_type, struct bio *bio)
-{
-	__release(local);
-	if (!bio->bi_bdev) {
-		printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
-				"bio->bi_bdev == NULL\n",
-		       device_to_minor(device));
-		dump_stack();
-		bio_endio(bio, -ENODEV);
-		return;
-	}
-
-	if (drbd_insert_fault(device, fault_type))
-		bio_endio(bio, -EIO);
-	else
-		generic_make_request(bio);
-}
-
-#ifndef __CHECKER__
-# undef __cond_lock
-# define __cond_lock(x,c) (c)
-#endif
-
-#endif

From cdc6af8df42313d632cb9ed79dd455daecbd8478 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 28 Apr 2014 18:43:15 +0200
Subject: [PATCH 10/54] drbd: Allow online layout change of AL while peer is
 not connected

If a user forces the operation he takes the blame in case
the peer does not have enough space. No reason to dey this...

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e891cb0a83c9..d219210cca1e 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -2489,7 +2489,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 			goto fail_ldev;
 		}
 
-		if (device->state.conn != C_CONNECTED) {
+		if (device->state.conn != C_CONNECTED && !rs.resize_force) {
 			retcode = ERR_MD_LAYOUT_CONNECTED;
 			goto fail_ldev;
 		}

From 9ae472605ad333d4db07da95cc42c68063d2cc0d Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:16 +0200
Subject: [PATCH 11/54] drbd: fix stalled resync detection in /proc/drbd

If we don't make resync or verify progress for "too long",
we want to flag it as "stalled".

Since 2010, "use rolling marks for resync speed calculation"
this "too long" was wrong by a factor of HZ.
With HZ 250, it would have been flagged as stalled
after 100 minutes.

Hardcode 3 minutes instead.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 2f26e8ffa45b..89736bdbbc70 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -116,7 +116,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
 	/* ------------------------ ~18s average ------------------------ */
 	i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS;
 	dt = (jiffies - device->rs_mark_time[i]) / HZ;
-	if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
+	if (dt > 180)
 		stalled = 1;
 
 	if (!dt)

From 6377b9235056452cd5d592c3739baa379a8735fe Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:17 +0200
Subject: [PATCH 12/54] drbd: resync: fix too large bursts for very slow rates

While merging adjacent dirty blocks into resync requests,
the resync rate throttle was disregarded.
For very low resync rates, the effective rate may have exceeded
the intended rate by a larger margin.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_worker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 2c4ce42c3657..dcd95a61a8d0 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -647,7 +647,7 @@ next_sector:
 		 */
 		align = 1;
 		rollback_i = i;
-		for (;;) {
+		while (i < number) {
 			if (size + BM_BLOCK_SIZE > max_bio_size)
 				break;
 

From 0e49d7b014c5d591a053d08888a455bd74a88646 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:18 +0200
Subject: [PATCH 13/54] drbd: fix potential distributed deadlock during verify
 or resync

If max-buffers and socket buffer sizes are "too small" for the chosen
resync rate, this could lead potentially lead to a distributed deadlock,
which may or may not resolve itself via the "ko-count" and request
timeout mechanism, or could be resolved by forced disconnect.

One option to deal with this is proper configuration:
use larger max-buffer and socket buffers settings,
or reduce the resync rate.

But even with bad configuration we should not deadlock,
but "gracefully" recover.

The issue is avoided by using only up to max-buffers/2 for resync
requests, and by using max-buffers not as a hard limit for data buffer
allocations, but as a throttle threshold only.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_receiver.c | 17 ++++++++++++-----
 drivers/block/drbd/drbd_worker.c   | 27 +++++++++++++++++----------
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 125c9e89388f..6ffbc22eba0b 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -234,9 +234,17 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
  * @retry:	whether to retry, if not enough pages are available right now
  *
  * Tries to allocate number pages, first from our own page pool, then from
- * the kernel, unless this allocation would exceed the max_buffers setting.
+ * the kernel.
  * Possibly retry until DRBD frees sufficient pages somewhere else.
  *
+ * If this allocation would exceed the max_buffers setting, we throttle
+ * allocation (schedule_timeout) to give the system some room to breathe.
+ *
+ * We do not use max-buffers as hard limit, because it could lead to
+ * congestion and further to a distributed deadlock during online-verify or
+ * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * resync-rate settings are mis-configured.
+ *
  * Returns a page chain linked via page->private.
  */
 struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
@@ -246,10 +254,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
 	struct page *page = NULL;
 	struct net_conf *nc;
 	DEFINE_WAIT(wait);
-	int mxb;
+	unsigned int mxb;
 
-	/* Yes, we may run up to @number over max_buffers. If we
-	 * follow it strictly, the admin will get it wrong anyways. */
 	rcu_read_lock();
 	nc = rcu_dereference(peer_device->connection->net_conf);
 	mxb = nc ? nc->max_buffers : 1000000;
@@ -277,7 +283,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
 			break;
 		}
 
-		schedule();
+		if (schedule_timeout(HZ/10) == 0)
+			mxb = UINT_MAX;
 	}
 	finish_wait(&drbd_pp_wait, &wait);
 
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index dcd95a61a8d0..7e633cad745a 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -492,10 +492,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size)
 	return fb;
 }
 
-static int drbd_rs_controller(struct drbd_device *device)
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
 {
 	struct disk_conf *dc;
-	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
 	unsigned int want;     /* The number of sectors we want in the proxy */
 	int req_sect; /* Number of sectors to request in this turn */
 	int correction; /* Number of sectors more we need in the proxy*/
@@ -505,9 +504,6 @@ static int drbd_rs_controller(struct drbd_device *device)
 	int max_sect;
 	struct fifo_buffer *plan;
 
-	sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */
-	device->rs_in_flight -= sect_in;
-
 	dc = rcu_dereference(device->ldev->disk_conf);
 	plan = rcu_dereference(device->rs_plan_s);
 
@@ -550,11 +546,16 @@ static int drbd_rs_controller(struct drbd_device *device)
 
 static int drbd_rs_number_requests(struct drbd_device *device)
 {
-	int number;
+	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
+	int number, mxb;
+
+	sect_in = atomic_xchg(&device->rs_sect_in, 0);
+	device->rs_in_flight -= sect_in;
 
 	rcu_read_lock();
+	mxb = drbd_get_max_buffers(device) / 2;
 	if (rcu_dereference(device->rs_plan_s)->size) {
-		number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9);
+		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
 		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
 	} else {
 		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
@@ -562,8 +563,14 @@ static int drbd_rs_number_requests(struct drbd_device *device)
 	}
 	rcu_read_unlock();
 
-	/* ignore the amount of pending requests, the resync controller should
-	 * throttle down to incoming reply rate soon enough anyways. */
+	/* Don't have more than "max-buffers"/2 in-flight.
+	 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+	 * potentially causing a distributed deadlock on congestion during
+	 * online-verify or (checksum-based) resync, if max-buffers,
+	 * socket buffer sizes and resync rate settings are mis-configured. */
+	if (mxb - device->rs_in_flight < number)
+		number = mxb - device->rs_in_flight;
+
 	return number;
 }
 
@@ -597,7 +604,7 @@ static int make_resync_request(struct drbd_device *device, int cancel)
 
 	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
 	number = drbd_rs_number_requests(device);
-	if (number == 0)
+	if (number <= 0)
 		goto requeue;
 
 	for (i = 0; i < number; i++) {

From e82998743385ca861b9ec919eb2ba8177ce72180 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:19 +0200
Subject: [PATCH 14/54] drbd: don't let application IO pre-empt resync too
 often

Before, application IO could pre-empt resync activity
for up to hardcoded 20 seconds per resync request.
A very busy server could throttle the effective resync bandwidth
down to one request per 20 seconds.

Now, we only let application IO pre-empt resync traffic
while the current resync rate estimate is above c-min-rate.

If you disable the c-min-rate throttle feature (set c-min-rate = 0),
application IO will no longer pre-empt resync traffic at all.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_actlog.c   | 13 ++++-----
 drivers/block/drbd/drbd_int.h      |  3 +-
 drivers/block/drbd/drbd_receiver.c | 47 +++++++++++++++++-------------
 3 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 7e7b0e143655..8dd09a7f23c6 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -1022,8 +1022,7 @@ int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
 	unsigned int enr = BM_SECT_TO_EXT(sector);
 	struct bm_extent *bm_ext;
 	int i, sig;
-	int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
-			 200 times -> 20 seconds. */
+	bool sa;
 
 retry:
 	sig = wait_event_interruptible(device->al_wait,
@@ -1034,12 +1033,15 @@ retry:
 	if (test_bit(BME_LOCKED, &bm_ext->flags))
 		return 0;
 
+	/* step aside only while we are above c-min-rate; unless disabled. */
+	sa = drbd_rs_c_min_rate_throttle(device);
+
 	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
 		sig = wait_event_interruptible(device->al_wait,
 					       !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) ||
-					       test_bit(BME_PRIORITY, &bm_ext->flags));
+					       (sa && test_bit(BME_PRIORITY, &bm_ext->flags)));
 
-		if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
+		if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) {
 			spin_lock_irq(&device->al_lock);
 			if (lc_put(device->resync, &bm_ext->lce) == 0) {
 				bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
@@ -1051,9 +1053,6 @@ retry:
 				return -EINTR;
 			if (schedule_timeout_interruptible(HZ/10))
 				return -EINTR;
-			if (sa && --sa == 0)
-				drbd_warn(device, "drbd_rs_begin_io() stepped aside for 20sec."
-					 "Resync stalled?\n");
 			goto retry;
 		}
 	}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 361a2e9cd727..f0cabea5cda2 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1339,7 +1339,8 @@ extern void start_resync_timer_fn(unsigned long data);
 /* drbd_receiver.c */
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_asender(struct drbd_thread *thi);
-extern int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
+extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
 extern int drbd_submit_peer_request(struct drbd_device *,
 				    struct drbd_peer_request *, const unsigned,
 				    const int);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 6ffbc22eba0b..10d2dcb16bff 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2323,14 +2323,33 @@ out_interrupted:
  * The current sync rate used here uses only the most recent two step marks,
  * to have a short time average so we can react faster.
  */
-int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
+{
+	struct lc_element *tmp;
+	bool throttle = true;
+
+	if (!drbd_rs_c_min_rate_throttle(device))
+		return false;
+
+	spin_lock_irq(&device->al_lock);
+	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
+	if (tmp) {
+		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_PRIORITY, &bm_ext->flags))
+			throttle = false;
+		/* Do not slow down if app IO is already waiting for this extent */
+	}
+	spin_unlock_irq(&device->al_lock);
+
+	return throttle;
+}
+
+bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
 {
 	struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
 	unsigned long db, dt, dbdt;
-	struct lc_element *tmp;
-	int curr_events;
-	int throttle = 0;
 	unsigned int c_min_rate;
+	int curr_events;
 
 	rcu_read_lock();
 	c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
@@ -2338,24 +2357,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
 
 	/* feature disabled? */
 	if (c_min_rate == 0)
-		return 0;
-
-	spin_lock_irq(&device->al_lock);
-	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
-	if (tmp) {
-		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
-		if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
-			spin_unlock_irq(&device->al_lock);
-			return 0;
-		}
-		/* Do not slow down if app IO is already waiting for this extent */
-	}
-	spin_unlock_irq(&device->al_lock);
+		return false;
 
 	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
 		      (int)part_stat_read(&disk->part0, sectors[1]) -
 			atomic_read(&device->rs_sect_ev);
-
 	if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
 		unsigned long rs_left;
 		int i;
@@ -2378,12 +2384,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
 		dbdt = Bit2KB(db/dt);
 
 		if (dbdt > c_min_rate)
-			throttle = 1;
+			return true;
 	}
-	return throttle;
+	return false;
 }
 
-
 static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
 {
 	struct drbd_peer_device *peer_device;

From 88ea685d33c75932fe90ed0b5d2cf68845462876 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 28 Apr 2014 18:43:20 +0200
Subject: [PATCH 15/54] drbd: Do not BUG() when connection breaks in a special
 way

When a 'cluster wide' disconnect executes, the result comes back
from the peer, and immediately after that the connection breaks
then _conn_rq_cond() reported back SS_CW_SUCCESS.
Therefore _conn_request_state() calls conn_set_state(), which
has a BUG() in it.
The BUG() is hit because conn_is_valid_transition() does not like
the transaction. Which goes back to is_valid_soft_transition()
returning SS_OUTDATE_WO_CONN.

This fix is to consider an error reported by is_valid_soft_transition()
even when the peer agreed to the transaction.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_state.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 379666246569..a5d8aae00e04 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1765,19 +1765,19 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
 static enum drbd_state_rv
 _conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
 {
-	enum drbd_state_rv rv;
+	enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */;
 
 	if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags))
-		return SS_CW_SUCCESS;
+		rv = SS_CW_SUCCESS;
 
 	if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags))
-		return SS_CW_FAILED_BY_PEER;
+		rv = SS_CW_FAILED_BY_PEER;
 
-	rv = conn_is_valid_transition(connection, mask, val, 0);
-	if (rv == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
-		rv = SS_UNKNOWN_ERROR; /* continue waiting */
+	err = conn_is_valid_transition(connection, mask, val, 0);
+	if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
+		return rv;
 
-	return rv;
+	return err;
 }
 
 enum drbd_state_rv

From a910b12352f5ddee712c3423c31fbb8b312dde88 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:21 +0200
Subject: [PATCH 16/54] drbd: perpare for genetlink parallel_ops

Because all administrative requests via genetlink have been globally
serialized via genl_lock(), we used to have one static struct
drbd_config_context "admin context".

Move this on-stack to the respective callback functions.

This will allow us to selectively drop the genl_lock()
(or use genl_family->parallel_ops) in the future.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h  |  28 ++-
 drivers/block/drbd/drbd_main.c |  12 +-
 drivers/block/drbd/drbd_nl.c   | 333 ++++++++++++++++-----------------
 3 files changed, 198 insertions(+), 175 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f0cabea5cda2..20a1772b245c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -814,6 +814,28 @@ struct drbd_device {
 	struct submit_worker submit;
 };
 
+struct drbd_config_context {
+	/* assigned from drbd_genlmsghdr */
+	unsigned int minor;
+	/* assigned from request attributes, if present */
+	unsigned int volume;
+#define VOLUME_UNSPECIFIED		(-1U)
+	/* pointer into the request skb,
+	 * limited lifetime! */
+	char *resource_name;
+	struct nlattr *my_addr;
+	struct nlattr *peer_addr;
+
+	/* reply buffer */
+	struct sk_buff *reply_skb;
+	/* pointer into reply buffer */
+	struct drbd_genlmsghdr *reply_dh;
+	/* resolved from attributes, if possible */
+	struct drbd_device *device;
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+};
+
 static inline struct drbd_device *minor_to_device(unsigned int minor)
 {
 	return (struct drbd_device *)idr_find(&drbd_devices, minor);
@@ -1229,9 +1251,9 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
 extern rwlock_t global_state_lock;
 
 extern int conn_lowest_minor(struct drbd_connection *connection);
-enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr);
+extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
 extern void drbd_destroy_device(struct kref *kref);
-extern void drbd_delete_device(struct drbd_device *mdev);
+extern void drbd_delete_device(struct drbd_device *device);
 
 extern struct drbd_resource *drbd_create_resource(const char *name);
 extern void drbd_free_resource(struct drbd_resource *resource);
@@ -1257,7 +1279,7 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
 
 
 /* drbd_nl.c */
-extern int drbd_msg_put_info(const char *info);
+extern int drbd_msg_put_info(struct sk_buff *skb, const char *info);
 extern void drbd_suspend_io(struct drbd_device *device);
 extern void drbd_resume_io(struct drbd_device *device);
 extern char *ppsize(char *buf, unsigned long long size);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 331e5cc1227d..865fdb3b0ce0 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2687,14 +2687,16 @@ static int init_submitter(struct drbd_device *device)
 	return 0;
 }
 
-enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr)
+enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
 {
+	struct drbd_resource *resource = adm_ctx->resource;
 	struct drbd_connection *connection;
 	struct drbd_device *device;
 	struct drbd_peer_device *peer_device, *tmp_peer_device;
 	struct gendisk *disk;
 	struct request_queue *q;
 	int id;
+	int vnr = adm_ctx->volume;
 	enum drbd_ret_code err = ERR_NOMEM;
 
 	device = minor_to_device(minor);
@@ -2763,7 +2765,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
 	if (id < 0) {
 		if (id == -ENOSPC) {
 			err = ERR_MINOR_EXISTS;
-			drbd_msg_put_info("requested minor exists already");
+			drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
 		}
 		goto out_no_minor_idr;
 	}
@@ -2773,7 +2775,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
 	if (id < 0) {
 		if (id == -ENOSPC) {
 			err = ERR_MINOR_EXISTS;
-			drbd_msg_put_info("requested minor exists already");
+			drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
 		}
 		goto out_idr_remove_minor;
 	}
@@ -2794,7 +2796,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
 		if (id < 0) {
 			if (id == -ENOSPC) {
 				err = ERR_INVALID_REQUEST;
-				drbd_msg_put_info("requested volume exists already");
+				drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already");
 			}
 			goto out_idr_remove_from_resource;
 		}
@@ -2803,7 +2805,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
 
 	if (init_submitter(device)) {
 		err = ERR_NOMEM;
-		drbd_msg_put_info("unable to create submit workqueue");
+		drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue");
 		goto out_idr_remove_vol;
 	}
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index d219210cca1e..118ac72f8699 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -81,32 +81,6 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
 /* used blkdev_get_by_path, to claim our meta data device(s) */
 static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
 
-/* Configuration is strictly serialized, because generic netlink message
- * processing is strictly serialized by the genl_lock().
- * Which means we can use one static global drbd_config_context struct.
- */
-static struct drbd_config_context {
-	/* assigned from drbd_genlmsghdr */
-	unsigned int minor;
-	/* assigned from request attributes, if present */
-	unsigned int volume;
-#define VOLUME_UNSPECIFIED		(-1U)
-	/* pointer into the request skb,
-	 * limited lifetime! */
-	char *resource_name;
-	struct nlattr *my_addr;
-	struct nlattr *peer_addr;
-
-	/* reply buffer */
-	struct sk_buff *reply_skb;
-	/* pointer into reply buffer */
-	struct drbd_genlmsghdr *reply_dh;
-	/* resolved from attributes, if possible */
-	struct drbd_device *device;
-	struct drbd_resource *resource;
-	struct drbd_connection *connection;
-} adm_ctx;
-
 static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 {
 	genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
@@ -116,9 +90,8 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 
 /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
  * reason it could fail was no space in skb, and there are 4k available. */
-int drbd_msg_put_info(const char *info)
+int drbd_msg_put_info(struct sk_buff *skb, const char *info)
 {
-	struct sk_buff *skb = adm_ctx.reply_skb;
 	struct nlattr *nla;
 	int err = -EMSGSIZE;
 
@@ -146,38 +119,38 @@ int drbd_msg_put_info(const char *info)
 #define DRBD_ADM_NEED_MINOR	1
 #define DRBD_ADM_NEED_RESOURCE	2
 #define DRBD_ADM_NEED_CONNECTION 4
-static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
-		unsigned flags)
+static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
+	struct sk_buff *skb, struct genl_info *info, unsigned flags)
 {
 	struct drbd_genlmsghdr *d_in = info->userhdr;
 	const u8 cmd = info->genlhdr->cmd;
 	int err;
 
-	memset(&adm_ctx, 0, sizeof(adm_ctx));
+	memset(adm_ctx, 0, sizeof(*adm_ctx));
 
 	/* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
 	if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
 	       return -EPERM;
 
-	adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
-	if (!adm_ctx.reply_skb) {
+	adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!adm_ctx->reply_skb) {
 		err = -ENOMEM;
 		goto fail;
 	}
 
-	adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
+	adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb,
 					info, &drbd_genl_family, 0, cmd);
 	/* put of a few bytes into a fresh skb of >= 4k will always succeed.
 	 * but anyways */
-	if (!adm_ctx.reply_dh) {
+	if (!adm_ctx->reply_dh) {
 		err = -ENOMEM;
 		goto fail;
 	}
 
-	adm_ctx.reply_dh->minor = d_in->minor;
-	adm_ctx.reply_dh->ret_code = NO_ERROR;
+	adm_ctx->reply_dh->minor = d_in->minor;
+	adm_ctx->reply_dh->ret_code = NO_ERROR;
 
-	adm_ctx.volume = VOLUME_UNSPECIFIED;
+	adm_ctx->volume = VOLUME_UNSPECIFIED;
 	if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
 		struct nlattr *nla;
 		/* parse and validate only */
@@ -187,7 +160,7 @@ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
 
 		/* It was present, and valid,
 		 * copy it over to the reply skb. */
-		err = nla_put_nohdr(adm_ctx.reply_skb,
+		err = nla_put_nohdr(adm_ctx->reply_skb,
 				info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
 				info->attrs[DRBD_NLA_CFG_CONTEXT]);
 		if (err)
@@ -196,102 +169,103 @@ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
 		/* and assign stuff to the global adm_ctx */
 		nla = nested_attr_tb[__nla_type(T_ctx_volume)];
 		if (nla)
-			adm_ctx.volume = nla_get_u32(nla);
+			adm_ctx->volume = nla_get_u32(nla);
 		nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
 		if (nla)
-			adm_ctx.resource_name = nla_data(nla);
-		adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
-		adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
-		if ((adm_ctx.my_addr &&
-		     nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.connection->my_addr)) ||
-		    (adm_ctx.peer_addr &&
-		     nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.connection->peer_addr))) {
+			adm_ctx->resource_name = nla_data(nla);
+		adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+		adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+		if ((adm_ctx->my_addr &&
+		     nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) ||
+		    (adm_ctx->peer_addr &&
+		     nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) {
 			err = -EINVAL;
 			goto fail;
 		}
 	}
 
-	adm_ctx.minor = d_in->minor;
-	adm_ctx.device = minor_to_device(d_in->minor);
-	if (adm_ctx.resource_name) {
-		adm_ctx.resource = drbd_find_resource(adm_ctx.resource_name);
+	adm_ctx->minor = d_in->minor;
+	adm_ctx->device = minor_to_device(d_in->minor);
+	if (adm_ctx->resource_name) {
+		adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name);
 	}
 
-	if (!adm_ctx.device && (flags & DRBD_ADM_NEED_MINOR)) {
-		drbd_msg_put_info("unknown minor");
+	if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) {
+		drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor");
 		return ERR_MINOR_INVALID;
 	}
-	if (!adm_ctx.resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
-		drbd_msg_put_info("unknown resource");
-		if (adm_ctx.resource_name)
+	if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
+		drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource");
+		if (adm_ctx->resource_name)
 			return ERR_RES_NOT_KNOWN;
 		return ERR_INVALID_REQUEST;
 	}
 
 	if (flags & DRBD_ADM_NEED_CONNECTION) {
-		if (adm_ctx.resource) {
-			drbd_msg_put_info("no resource name expected");
+		if (adm_ctx->resource) {
+			drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected");
 			return ERR_INVALID_REQUEST;
 		}
-		if (adm_ctx.device) {
-			drbd_msg_put_info("no minor number expected");
+		if (adm_ctx->device) {
+			drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected");
 			return ERR_INVALID_REQUEST;
 		}
-		if (adm_ctx.my_addr && adm_ctx.peer_addr)
-			adm_ctx.connection = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
-							  nla_len(adm_ctx.my_addr),
-							  nla_data(adm_ctx.peer_addr),
-							  nla_len(adm_ctx.peer_addr));
-		if (!adm_ctx.connection) {
-			drbd_msg_put_info("unknown connection");
+		if (adm_ctx->my_addr && adm_ctx->peer_addr)
+			adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr),
+							  nla_len(adm_ctx->my_addr),
+							  nla_data(adm_ctx->peer_addr),
+							  nla_len(adm_ctx->peer_addr));
+		if (!adm_ctx->connection) {
+			drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection");
 			return ERR_INVALID_REQUEST;
 		}
 	}
 
 	/* some more paranoia, if the request was over-determined */
-	if (adm_ctx.device && adm_ctx.resource &&
-	    adm_ctx.device->resource != adm_ctx.resource) {
+	if (adm_ctx->device && adm_ctx->resource &&
+	    adm_ctx->device->resource != adm_ctx->resource) {
 		pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
-				adm_ctx.minor, adm_ctx.resource->name,
-				adm_ctx.device->resource->name);
-		drbd_msg_put_info("minor exists in different resource");
+				adm_ctx->minor, adm_ctx->resource->name,
+				adm_ctx->device->resource->name);
+		drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource");
 		return ERR_INVALID_REQUEST;
 	}
-	if (adm_ctx.device &&
-	    adm_ctx.volume != VOLUME_UNSPECIFIED &&
-	    adm_ctx.volume != adm_ctx.device->vnr) {
+	if (adm_ctx->device &&
+	    adm_ctx->volume != VOLUME_UNSPECIFIED &&
+	    adm_ctx->volume != adm_ctx->device->vnr) {
 		pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
-				adm_ctx.minor, adm_ctx.volume,
-				adm_ctx.device->vnr,
-				adm_ctx.device->resource->name);
-		drbd_msg_put_info("minor exists as different volume");
+				adm_ctx->minor, adm_ctx->volume,
+				adm_ctx->device->vnr,
+				adm_ctx->device->resource->name);
+		drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume");
 		return ERR_INVALID_REQUEST;
 	}
 
 	return NO_ERROR;
 
 fail:
-	nlmsg_free(adm_ctx.reply_skb);
-	adm_ctx.reply_skb = NULL;
+	nlmsg_free(adm_ctx->reply_skb);
+	adm_ctx->reply_skb = NULL;
 	return err;
 }
 
-static int drbd_adm_finish(struct genl_info *info, int retcode)
+static int drbd_adm_finish(struct drbd_config_context *adm_ctx,
+	struct genl_info *info, int retcode)
 {
-	if (adm_ctx.connection) {
-		kref_put(&adm_ctx.connection->kref, drbd_destroy_connection);
-		adm_ctx.connection = NULL;
+	if (adm_ctx->connection) {
+		kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection);
+		adm_ctx->connection = NULL;
 	}
-	if (adm_ctx.resource) {
-		kref_put(&adm_ctx.resource->kref, drbd_destroy_resource);
-		adm_ctx.resource = NULL;
+	if (adm_ctx->resource) {
+		kref_put(&adm_ctx->resource->kref, drbd_destroy_resource);
+		adm_ctx->resource = NULL;
 	}
 
-	if (!adm_ctx.reply_skb)
+	if (!adm_ctx->reply_skb)
 		return -ENOMEM;
 
-	adm_ctx.reply_dh->ret_code = retcode;
-	drbd_adm_send_reply(adm_ctx.reply_skb, info);
+	adm_ctx->reply_dh->ret_code = retcode;
+	drbd_adm_send_reply(adm_ctx->reply_skb, info);
 	return 0;
 }
 
@@ -707,11 +681,12 @@ static const char *from_attrs_err_to_txt(int err)
 
 int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct set_role_parms parms;
 	int err;
 	enum drbd_ret_code retcode;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -722,7 +697,7 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
 		err = set_role_parms_from_attrs(&parms, info);
 		if (err) {
 			retcode = ERR_MANDATORY_TAG;
-			drbd_msg_put_info(from_attrs_err_to_txt(err));
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 			goto out;
 		}
 	}
@@ -732,7 +707,7 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
 	else
 		retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -1265,13 +1240,14 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
 
 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 	struct drbd_device *device;
 	struct disk_conf *new_disk_conf, *old_disk_conf;
 	struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
 	int err, fifo_size;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -1301,7 +1277,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 	err = disk_conf_from_attrs_for_change(new_disk_conf, info);
 	if (err && err != -ENOMSG) {
 		retcode = ERR_MANDATORY_TAG;
-		drbd_msg_put_info(from_attrs_err_to_txt(err));
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 		goto fail_unlock;
 	}
 
@@ -1392,12 +1368,13 @@ fail_unlock:
 success:
 	put_ldev(device);
  out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct drbd_device *device;
 	int err;
 	enum drbd_ret_code retcode;
@@ -1413,7 +1390,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	enum drbd_state_rv rv;
 	struct net_conf *nc;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -1462,7 +1439,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	err = disk_conf_from_attrs(new_disk_conf, info);
 	if (err) {
 		retcode = ERR_MANDATORY_TAG;
-		drbd_msg_put_info(from_attrs_err_to_txt(err));
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 		goto fail;
 	}
 
@@ -1804,7 +1781,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 	put_ldev(device);
 	conn_reconfig_done(first_peer_device(device)->connection);
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 
  force_diskless_dec:
@@ -1828,7 +1805,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	kfree(new_plan);
 
  finish:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -1867,11 +1844,12 @@ out:
  * Only then we have finally detached. */
 int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 	struct detach_parms parms = { };
 	int err;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -1881,14 +1859,14 @@ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
 		err = detach_parms_from_attrs(&parms, info);
 		if (err) {
 			retcode = ERR_MANDATORY_TAG;
-			drbd_msg_put_info(from_attrs_err_to_txt(err));
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 			goto out;
 		}
 	}
 
 	retcode = adm_detach(adm_ctx.device, parms.force_detach);
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -2062,6 +2040,7 @@ static void free_crypto(struct crypto *crypto)
 
 int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 	struct drbd_connection *connection;
 	struct net_conf *old_net_conf, *new_net_conf = NULL;
@@ -2070,7 +2049,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
 	int rsr; /* re-sync running */
 	struct crypto crypto = { };
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2091,7 +2070,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
 	old_net_conf = connection->net_conf;
 
 	if (!old_net_conf) {
-		drbd_msg_put_info("net conf missing, try connect");
+		drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect");
 		retcode = ERR_INVALID_REQUEST;
 		goto fail;
 	}
@@ -2103,7 +2082,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
 	err = net_conf_from_attrs_for_change(new_net_conf, info);
 	if (err && err != -ENOMSG) {
 		retcode = ERR_MANDATORY_TAG;
-		drbd_msg_put_info(from_attrs_err_to_txt(err));
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 		goto fail;
 	}
 
@@ -2174,12 +2153,13 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
  done:
 	conn_reconfig_done(connection);
  out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct drbd_peer_device *peer_device;
 	struct net_conf *old_net_conf, *new_net_conf = NULL;
 	struct crypto crypto = { };
@@ -2189,14 +2169,14 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
 	int i;
 	int err;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
 
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
 		goto out;
 	if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
-		drbd_msg_put_info("connection endpoint(s) missing");
+		drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing");
 		retcode = ERR_INVALID_REQUEST;
 		goto out;
 	}
@@ -2242,7 +2222,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
 	err = net_conf_from_attrs(new_net_conf, info);
 	if (err && err != -ENOMSG) {
 		retcode = ERR_MANDATORY_TAG;
-		drbd_msg_put_info(from_attrs_err_to_txt(err));
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 		goto fail;
 	}
 
@@ -2291,7 +2271,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
 	retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
 
 	conn_reconfig_done(connection);
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 
 fail:
@@ -2300,7 +2280,7 @@ fail:
 
 	conn_reconfig_done(connection);
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -2363,13 +2343,14 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection
 
 int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct disconnect_parms parms;
 	struct drbd_connection *connection;
 	enum drbd_state_rv rv;
 	enum drbd_ret_code retcode;
 	int err;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2381,7 +2362,7 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
 		err = disconnect_parms_from_attrs(&parms, info);
 		if (err) {
 			retcode = ERR_MANDATORY_TAG;
-			drbd_msg_put_info(from_attrs_err_to_txt(err));
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 			goto fail;
 		}
 	}
@@ -2392,7 +2373,7 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
 	else
 		retcode = NO_ERROR;
  fail:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -2414,6 +2395,7 @@ void resync_after_online_grow(struct drbd_device *device)
 
 int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
 	struct resize_parms rs;
 	struct drbd_device *device;
@@ -2424,7 +2406,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 	sector_t u_size;
 	int err;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2443,7 +2425,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 		err = resize_parms_from_attrs(&rs, info);
 		if (err) {
 			retcode = ERR_MANDATORY_TAG;
-			drbd_msg_put_info(from_attrs_err_to_txt(err));
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 			goto fail_ldev;
 		}
 	}
@@ -2535,7 +2517,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 	}
 
  fail:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 
  fail_ldev:
@@ -2545,11 +2527,12 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 
 int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 	struct res_opts res_opts;
 	int err;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2562,7 +2545,7 @@ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
 	err = res_opts_from_attrs(&res_opts, info);
 	if (err && err != -ENOMSG) {
 		retcode = ERR_MANDATORY_TAG;
-		drbd_msg_put_info(from_attrs_err_to_txt(err));
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 		goto fail;
 	}
 
@@ -2574,16 +2557,17 @@ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
 	}
 
 fail:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct drbd_device *device;
 	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2614,16 +2598,17 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 	drbd_resume_io(device);
 
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
 		union drbd_state mask, union drbd_state val)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2631,7 +2616,7 @@ static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *
 
 	retcode = drbd_request_state(adm_ctx.device, mask, val);
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -2646,10 +2631,11 @@ static int drbd_bmio_set_susp_al(struct drbd_device *device)
 
 int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	int retcode; /* drbd_ret_code, drbd_state_rv */
 	struct drbd_device *device;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2683,15 +2669,16 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 	drbd_resume_io(device);
 
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2700,16 +2687,17 @@ int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
 	if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
 		retcode = ERR_PAUSE_IS_SET;
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	union drbd_dev_state s;
 	enum drbd_ret_code retcode;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2726,7 +2714,7 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
 	}
 
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -2737,10 +2725,11 @@ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
 
 int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct drbd_device *device;
 	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2762,7 +2751,7 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
 	drbd_resume_io(device);
 
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -2938,10 +2927,11 @@ nla_put_failure:
 
 int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 	int err;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -2953,7 +2943,7 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
 		return err;
 	}
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -3140,11 +3130,12 @@ dump:
 
 int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 	struct timeout_parms tp;
 	int err;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -3161,17 +3152,18 @@ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
 		return err;
 	}
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct drbd_device *device;
 	enum drbd_ret_code retcode;
 	struct start_ov_parms parms;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -3186,7 +3178,7 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
 		int err = start_ov_parms_from_attrs(&parms, info);
 		if (err) {
 			retcode = ERR_MANDATORY_TAG;
-			drbd_msg_put_info(from_attrs_err_to_txt(err));
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 			goto out;
 		}
 	}
@@ -3201,20 +3193,21 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
 	retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
 	drbd_resume_io(device);
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 
 int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct drbd_device *device;
 	enum drbd_ret_code retcode;
 	int skip_initial_sync = 0;
 	int err;
 	struct new_c_uuid_parms args;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -3226,7 +3219,7 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
 		err = new_c_uuid_parms_from_attrs(&args, info);
 		if (err) {
 			retcode = ERR_MANDATORY_TAG;
-			drbd_msg_put_info(from_attrs_err_to_txt(err));
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 			goto out_nolock;
 		}
 	}
@@ -3276,21 +3269,22 @@ out_dec:
 out:
 	mutex_unlock(device->state_mutex);
 out_nolock:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 static enum drbd_ret_code
-drbd_check_resource_name(const char *name)
+drbd_check_resource_name(struct drbd_config_context *adm_ctx)
 {
+	const char *name = adm_ctx->resource_name;
 	if (!name || !name[0]) {
-		drbd_msg_put_info("resource name missing");
+		drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing");
 		return ERR_MANDATORY_TAG;
 	}
 	/* if we want to use these in sysfs/configfs/debugfs some day,
 	 * we must not allow slashes */
 	if (strchr(name, '/')) {
-		drbd_msg_put_info("invalid resource name");
+		drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name");
 		return ERR_INVALID_REQUEST;
 	}
 	return NO_ERROR;
@@ -3298,11 +3292,12 @@ drbd_check_resource_name(const char *name)
 
 int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 	struct res_opts res_opts;
 	int err;
 
-	retcode = drbd_adm_prepare(skb, info, 0);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -3312,18 +3307,18 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
 	err = res_opts_from_attrs(&res_opts, info);
 	if (err && err != -ENOMSG) {
 		retcode = ERR_MANDATORY_TAG;
-		drbd_msg_put_info(from_attrs_err_to_txt(err));
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
 		goto out;
 	}
 
-	retcode = drbd_check_resource_name(adm_ctx.resource_name);
+	retcode = drbd_check_resource_name(&adm_ctx);
 	if (retcode != NO_ERROR)
 		goto out;
 
 	if (adm_ctx.resource) {
 		if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
 			retcode = ERR_INVALID_REQUEST;
-			drbd_msg_put_info("resource exists");
+			drbd_msg_put_info(adm_ctx.reply_skb, "resource exists");
 		}
 		/* else: still NO_ERROR */
 		goto out;
@@ -3332,28 +3327,29 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
 	if (!conn_create(adm_ctx.resource_name, &res_opts))
 		retcode = ERR_NOMEM;
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct drbd_genlmsghdr *dh = info->userhdr;
 	enum drbd_ret_code retcode;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
 		goto out;
 
 	if (dh->minor > MINORMASK) {
-		drbd_msg_put_info("requested minor out of range");
+		drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range");
 		retcode = ERR_INVALID_REQUEST;
 		goto out;
 	}
 	if (adm_ctx.volume > DRBD_VOLUME_MAX) {
-		drbd_msg_put_info("requested volume id out of range");
+		drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range");
 		retcode = ERR_INVALID_REQUEST;
 		goto out;
 	}
@@ -3367,9 +3363,9 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
 		goto out;
 	}
 
-	retcode = drbd_create_device(adm_ctx.resource, dh->minor, adm_ctx.volume);
+	retcode = drbd_create_device(&adm_ctx, dh->minor);
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
@@ -3390,9 +3386,10 @@ static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
 
 int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	enum drbd_ret_code retcode;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -3400,19 +3397,20 @@ int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
 
 	retcode = adm_del_minor(adm_ctx.device);
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct drbd_resource *resource;
 	struct drbd_connection *connection;
 	struct drbd_device *device;
 	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
 	unsigned i;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -3426,14 +3424,14 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
 		idr_for_each_entry(&connection->peer_devices, peer_device, i) {
 			retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0);
 			if (retcode < SS_SUCCESS) {
-				drbd_msg_put_info("failed to demote");
+				drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote");
 				goto out;
 			}
 		}
 
 		retcode = conn_try_disconnect(connection, 0);
 		if (retcode < SS_SUCCESS) {
-			drbd_msg_put_info("failed to disconnect");
+			drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect");
 			goto out;
 		}
 	}
@@ -3442,7 +3440,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
 	idr_for_each_entry(&resource->devices, device, i) {
 		retcode = adm_detach(device, 0);
 		if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
-			drbd_msg_put_info("failed to detach");
+			drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach");
 			goto out;
 		}
 	}
@@ -3460,7 +3458,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
 		retcode = adm_del_minor(device);
 		if (retcode != NO_ERROR) {
 			/* "can not happen" */
-			drbd_msg_put_info("failed to delete volume");
+			drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume");
 			goto out;
 		}
 	}
@@ -3471,17 +3469,18 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
 	retcode = NO_ERROR;
 
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 
 int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
 {
+	struct drbd_config_context adm_ctx;
 	struct drbd_resource *resource;
 	struct drbd_connection *connection;
 	enum drbd_ret_code retcode;
 
-	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
@@ -3506,7 +3505,7 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
 	drbd_free_resource(resource);
 	retcode = NO_ERROR;
 out:
-	drbd_adm_finish(info, retcode);
+	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
 

From 9e276872fe1665ea158f0c6f40df13008fed2908 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:22 +0200
Subject: [PATCH 17/54] drbd: allow parallel promote/demote actions

We plan to use genl_family->parallel_ops = true in the future,
but need to review all possible interactions first.

For now, only selectively drop genl_lock() in drbd_set_role(),
instead serializing on our own internal resource->conf_update mutex.

We now can be promoted/demoted on many resources in parallel,
which may significantly improve cluster failover times
when fencing is required.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h  |   1 +
 drivers/block/drbd/drbd_main.c |   1 +
 drivers/block/drbd/drbd_nl.c   | 100 ++++++++++++++++++++++++++++-----
 3 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 20a1772b245c..fe6295c30a27 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -579,6 +579,7 @@ struct drbd_resource {
 	struct list_head resources;
 	struct res_opts res_opts;
 	struct mutex conf_update;	/* mutex for ready-copy-update of net_conf and disk_conf */
+	struct mutex adm_mutex;		/* mutex to serialize administrative requests */
 	spinlock_t req_lock;
 
 	unsigned susp:1;		/* IO suspended by user */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 865fdb3b0ce0..05db597401e6 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2570,6 +2570,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
 	INIT_LIST_HEAD(&resource->connections);
 	list_add_tail_rcu(&resource->resources, &drbd_resources);
 	mutex_init(&resource->conf_update);
+	mutex_init(&resource->adm_mutex);
 	spin_lock_init(&resource->req_lock);
 	return resource;
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 118ac72f8699..bb3679263ea7 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -115,6 +115,10 @@ int drbd_msg_put_info(struct sk_buff *skb, const char *info)
  * and per-family private info->pointers.
  * But we need to stay compatible with older kernels.
  * If it returns successfully, adm_ctx members are valid.
+ *
+ * At this point, we still rely on the global genl_lock().
+ * If we want to avoid that, and allow "genl_family.parallel_ops", we may need
+ * to add additional synchronization against object destruction/modification.
  */
 #define DRBD_ADM_NEED_MINOR	1
 #define DRBD_ADM_NEED_RESOURCE	2
@@ -166,7 +170,7 @@ static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
 		if (err)
 			goto fail;
 
-		/* and assign stuff to the global adm_ctx */
+		/* and assign stuff to the adm_ctx */
 		nla = nested_attr_tb[__nla_type(T_ctx_volume)];
 		if (nla)
 			adm_ctx->volume = nla_get_u32(nla);
@@ -186,6 +190,13 @@ static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
 
 	adm_ctx->minor = d_in->minor;
 	adm_ctx->device = minor_to_device(d_in->minor);
+
+	/* We are protected by the global genl_lock().
+	 * But we may explicitly drop it/retake it in drbd_adm_set_role(),
+	 * so make sure this object stays around. */
+	if (adm_ctx->device)
+		kref_get(&adm_ctx->device->kref);
+
 	if (adm_ctx->resource_name) {
 		adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name);
 	}
@@ -241,6 +252,14 @@ static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
 		return ERR_INVALID_REQUEST;
 	}
 
+	/* still, provide adm_ctx->resource always, if possible. */
+	if (!adm_ctx->resource) {
+		adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource
+			: adm_ctx->connection ? adm_ctx->connection->resource : NULL;
+		if (adm_ctx->resource)
+			kref_get(&adm_ctx->resource->kref);
+	}
+
 	return NO_ERROR;
 
 fail:
@@ -252,6 +271,10 @@ fail:
 static int drbd_adm_finish(struct drbd_config_context *adm_ctx,
 	struct genl_info *info, int retcode)
 {
+	if (adm_ctx->device) {
+		kref_put(&adm_ctx->device->kref, drbd_destroy_device);
+		adm_ctx->device = NULL;
+	}
 	if (adm_ctx->connection) {
 		kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection);
 		adm_ctx->connection = NULL;
@@ -635,11 +658,11 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 			put_ldev(device);
 		}
 	} else {
-		mutex_lock(&device->resource->conf_update);
+		/* Called from drbd_adm_set_role only.
+		 * We are still holding the conf_update mutex. */
 		nc = first_peer_device(device)->connection->net_conf;
 		if (nc)
 			nc->discard_my_data = 0; /* without copy; single bit op is atomic */
-		mutex_unlock(&device->resource->conf_update);
 
 		set_disk_ro(device->vdisk, false);
 		if (get_ldev(device)) {
@@ -701,11 +724,16 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
 			goto out;
 		}
 	}
+	genl_unlock();
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 
 	if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
 		retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
 	else
 		retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
+
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	genl_lock();
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -1251,9 +1279,10 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
-		goto out;
+		goto finish;
 
 	device = adm_ctx.device;
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 
 	/* we also need a disk
 	 * to change the options on */
@@ -1368,6 +1397,8 @@ fail_unlock:
 success:
 	put_ldev(device);
  out:
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
@@ -1397,6 +1428,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		goto finish;
 
 	device = adm_ctx.device;
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	conn_reconfig_start(first_peer_device(device)->connection);
 
 	/* if you want to reconfigure, please tear down first */
@@ -1781,6 +1813,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 	put_ldev(device);
 	conn_reconfig_done(first_peer_device(device)->connection);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 
@@ -1803,7 +1836,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	kfree(new_disk_conf);
 	lc_destroy(resync_lru);
 	kfree(new_plan);
-
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
  finish:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -1864,7 +1897,9 @@ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	retcode = adm_detach(adm_ctx.device, parms.force_detach);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2053,9 +2088,10 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
-		goto out;
+		goto finish;
 
 	connection = adm_ctx.connection;
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 
 	new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
 	if (!new_net_conf) {
@@ -2153,6 +2189,8 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
  done:
 	conn_reconfig_done(connection);
  out:
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
@@ -2202,6 +2240,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	connection = first_connection(adm_ctx.resource);
 	conn_reconfig_start(connection);
 
@@ -2271,6 +2310,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
 	retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
 
 	conn_reconfig_done(connection);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 
@@ -2279,6 +2319,7 @@ fail:
 	kfree(new_net_conf);
 
 	conn_reconfig_done(connection);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2367,11 +2408,13 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	rv = conn_try_disconnect(connection, parms.force_disconnect);
 	if (rv < SS_SUCCESS)
 		retcode = rv;  /* FIXME: Type mismatch. */
 	else
 		retcode = NO_ERROR;
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
  fail:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2410,8 +2453,9 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
-		goto fail;
+		goto finish;
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	device = adm_ctx.device;
 	if (!get_ldev(device)) {
 		retcode = ERR_NO_DISK;
@@ -2517,6 +2561,8 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 	}
 
  fail:
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 
@@ -2549,12 +2595,14 @@ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
 		goto fail;
 	}
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	err = set_resource_options(adm_ctx.resource, &res_opts);
 	if (err) {
 		retcode = ERR_INVALID_REQUEST;
 		if (err == -ENOMEM)
 			retcode = ERR_NOMEM;
 	}
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 
 fail:
 	drbd_adm_finish(&adm_ctx, info, retcode);
@@ -2573,6 +2621,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	device = adm_ctx.device;
 
 	/* If there is still bitmap IO pending, probably because of a previous
@@ -2596,7 +2645,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 	} else
 		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
 	drbd_resume_io(device);
-
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2614,7 +2663,9 @@ static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *
 	if (retcode != NO_ERROR)
 		goto out;
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	retcode = drbd_request_state(adm_ctx.device, mask, val);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2641,6 +2692,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	device = adm_ctx.device;
 
 	/* If there is still bitmap IO pending, probably because of a previous
@@ -2667,7 +2719,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 	} else
 		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
 	drbd_resume_io(device);
-
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2684,8 +2736,10 @@ int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
 		retcode = ERR_PAUSE_IS_SET;
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2703,6 +2757,7 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
 		s = adm_ctx.device->state;
 		if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
@@ -2712,7 +2767,7 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
 			retcode = ERR_PAUSE_IS_CLEAR;
 		}
 	}
-
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -2735,6 +2790,7 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	device = adm_ctx.device;
 	if (test_bit(NEW_CUR_UUID, &device->flags)) {
 		drbd_uuid_new_current(device);
@@ -2749,7 +2805,7 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
 			tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
 	}
 	drbd_resume_io(device);
-
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -3182,6 +3238,8 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
 			goto out;
 		}
 	}
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
 	/* w_make_ov_request expects position to be aligned */
 	device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
 	device->ov_stop_sector = parms.ov_stop_sector;
@@ -3192,6 +3250,8 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
 	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
 	retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
 	drbd_resume_io(device);
+
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -3224,6 +3284,7 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */
 
 	if (!get_ldev(device)) {
@@ -3268,6 +3329,7 @@ out_dec:
 	put_ldev(device);
 out:
 	mutex_unlock(device->state_mutex);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out_nolock:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -3324,6 +3386,7 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
 		goto out;
 	}
 
+	/* not yet safe for genl_family.parallel_ops */
 	if (!conn_create(adm_ctx.resource_name, &res_opts))
 		retcode = ERR_NOMEM;
 out:
@@ -3363,7 +3426,9 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
 		goto out;
 	}
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	retcode = drbd_create_device(&adm_ctx, dh->minor);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -3395,7 +3460,9 @@ int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto out;
 
+	mutex_lock(&adm_ctx.resource->adm_mutex);
 	retcode = adm_del_minor(adm_ctx.device);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
 out:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
@@ -3414,9 +3481,10 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
-		goto out;
+		goto finish;
 
 	resource = adm_ctx.resource;
+	mutex_lock(&resource->adm_mutex);
 	/* demote */
 	for_each_connection(connection, resource) {
 		struct drbd_peer_device *peer_device;
@@ -3467,8 +3535,9 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
 	synchronize_rcu();
 	drbd_free_resource(resource);
 	retcode = NO_ERROR;
-
 out:
+	mutex_unlock(&resource->adm_mutex);
+finish:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }
@@ -3484,9 +3553,10 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
 	if (!adm_ctx.reply_skb)
 		return retcode;
 	if (retcode != NO_ERROR)
-		goto out;
+		goto finish;
 
 	resource = adm_ctx.resource;
+	mutex_lock(&resource->adm_mutex);
 	for_each_connection(connection, resource) {
 		if (connection->cstate > C_STANDALONE) {
 			retcode = ERR_NET_CONFIGURED;
@@ -3505,6 +3575,8 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
 	drbd_free_resource(resource);
 	retcode = NO_ERROR;
 out:
+	mutex_unlock(&resource->adm_mutex);
+finish:
 	drbd_adm_finish(&adm_ctx, info, retcode);
 	return 0;
 }

From a0fb3c47a1aae5d38a88ea858f14d6d088d05e07 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:23 +0200
Subject: [PATCH 18/54] drbd: prepare receiving side for REQ_DISCARD

If the receiver needs to serve a discard request on a queue that does
not announce to be discard cabable, it falls back to do synchronous
blkdev_issue_zeroout().

We expect only "reasonably" large (up to one activity log extent?)
discard requests.

We do this to not to not block the receiver for too long in this
fallback code path, and to not set/clear too many bits inside one
spinlock_irq_save() in drbd_set_in_sync/drbd_set_out_of_sync,

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_actlog.c   |  6 +--
 drivers/block/drbd/drbd_int.h      | 19 ++++++-
 drivers/block/drbd/drbd_protocol.h | 10 ++++
 drivers/block/drbd/drbd_receiver.c | 79 +++++++++++++++++++++++-------
 drivers/block/drbd/drbd_worker.c   | 12 +++--
 5 files changed, 100 insertions(+), 26 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 8dd09a7f23c6..8fc5d71077bf 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -845,7 +845,7 @@ void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
 	int wake_up = 0;
 	unsigned long flags;
 
-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
 		drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
 				(unsigned long long)sector, size);
 		return;
@@ -919,7 +919,7 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size
 	if (size == 0)
 		return 0;
 
-	if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+	if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
 		drbd_err(device, "sector: %llus, size: %d\n",
 			(unsigned long long)sector, size);
 		return 0;
@@ -1286,7 +1286,7 @@ void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
 	sector_t esector, nr_sectors;
 	int wake_up = 0;
 
-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
 		drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
 				(unsigned long long)sector, size);
 		return;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index fe6295c30a27..29123dfb5f84 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -382,6 +382,12 @@ enum {
 	__EE_CALL_AL_COMPLETE_IO,
 	__EE_MAY_SET_IN_SYNC,
 
+	/* is this a TRIM aka REQ_DISCARD? */
+	__EE_IS_TRIM,
+	/* our lower level cannot handle trim,
+	 * and we want to fall back to zeroout instead */
+	__EE_IS_TRIM_USE_ZEROOUT,
+
 	/* In case a barrier failed,
 	 * we need to resubmit without the barrier flag. */
 	__EE_RESUBMITTED,
@@ -405,7 +411,9 @@ enum {
 };
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
-#define	EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
+#define EE_IS_TRIM             (1<<__EE_IS_TRIM)
+#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
+#define EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
 #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
 #define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
 #define EE_RESTART_REQUESTS	(1<<__EE_RESTART_REQUESTS)
@@ -1162,6 +1170,12 @@ struct bm_extent {
 #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
 #define DRBD_MAX_BIO_SIZE_P95    (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
 
+/* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+#define DRBD_MAX_DISCARD_SIZE	AL_EXTENT_SIZE
+#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+
 extern int  drbd_bm_init(struct drbd_device *device);
 extern int  drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
 extern void drbd_bm_cleanup(struct drbd_device *device);
@@ -1359,6 +1373,8 @@ extern int w_start_resync(struct drbd_work *, int);
 extern void resync_timer_fn(unsigned long data);
 extern void start_resync_timer_fn(unsigned long data);
 
+extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
+
 /* drbd_receiver.c */
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_asender(struct drbd_thread *thi);
@@ -1370,6 +1386,7 @@ extern int drbd_submit_peer_request(struct drbd_device *,
 extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
 extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
 						     sector_t, unsigned int,
+						     bool,
 						     gfp_t) __must_hold(local);
 extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
 				 int);
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index 3c04ec0ea333..478c63d0b6f3 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -54,6 +54,11 @@ enum drbd_packet {
 	P_CONN_ST_CHG_REPLY   = 0x2b, /* meta sock: Connection side state req reply */
 	P_RETRY_WRITE	      = 0x2c, /* Protocol C: retry conflicting write request */
 	P_PROTOCOL_UPDATE     = 0x2d, /* data sock: is used in established connections */
+        /* 0x2e to 0x30 reserved, used in drbd 9 */
+
+	/* REQ_DISCARD. We used "discard" in different contexts before,
+	 * which is why I chose TRIM here, to disambiguate. */
+	P_TRIM                = 0x31,
 
 	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
 	P_MAX_OPT_CMD	      = 0x101,
@@ -119,6 +124,11 @@ struct p_data {
 	u32	    dp_flags;
 } __packed;
 
+struct p_trim {
+	struct p_data p_data;
+	u32	    size;	/* == bio->bi_size */
+} __packed;
+
 /*
  * commands which share a struct:
  *  p_block_ack:
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 10d2dcb16bff..9640b645d5ba 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -65,7 +65,7 @@ enum finish_epoch {
 static int drbd_do_features(struct drbd_connection *connection);
 static int drbd_do_auth(struct drbd_connection *connection);
 static int drbd_disconnected(struct drbd_peer_device *);
-
+static void conn_wait_active_ee_empty(struct drbd_connection *connection);
 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
 static int e_end_block(struct drbd_work *, int);
 
@@ -338,7 +338,7 @@ You must not have the req_lock:
 
 struct drbd_peer_request *
 drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
-		    unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
+		    unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
 {
 	struct drbd_device *device = peer_device->device;
 	struct drbd_peer_request *peer_req;
@@ -355,7 +355,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 		return NULL;
 	}
 
-	if (data_size) {
+	if (has_payload && data_size) {
 		page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
 		if (!page)
 			goto fail;
@@ -1325,6 +1325,20 @@ int drbd_submit_peer_request(struct drbd_device *device,
 	unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
 	int err = -ENOMEM;
 
+	if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
+		/* wait for all pending IO completions, before we start
+		 * zeroing things out. */
+		conn_wait_active_ee_empty(first_peer_device(device)->connection);
+		if (blkdev_issue_zeroout(device->ldev->backing_bdev,
+			sector, ds >> 9, GFP_NOIO))
+			peer_req->flags |= EE_WAS_ERROR;
+		drbd_endio_write_sec_final(peer_req);
+		return 0;
+	}
+
+	if (peer_req->flags & EE_IS_TRIM)
+		nr_pages = 0; /* discards don't have any payload. */
+
 	/* In most cases, we will only need one bio.  But in case the lower
 	 * level restrictions happen to be different at this offset on this
 	 * side than those of the sending peer, we may need to submit the
@@ -1336,7 +1350,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
 next_bio:
 	bio = bio_alloc(GFP_NOIO, nr_pages);
 	if (!bio) {
-		drbd_err(device, "submit_ee: Allocation of a bio failed\n");
+		drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
 		goto fail;
 	}
 	/* > peer_req->i.sector, unless this is the first bio */
@@ -1350,6 +1364,11 @@ next_bio:
 	bios = bio;
 	++n_bios;
 
+	if (rw & REQ_DISCARD) {
+		bio->bi_iter.bi_size = ds;
+		goto submit;
+	}
+
 	page_chain_for_each(page) {
 		unsigned len = min_t(unsigned, ds, PAGE_SIZE);
 		if (!bio_add_page(bio, page, len, 0)) {
@@ -1370,8 +1389,9 @@ next_bio:
 		sector += len >> 9;
 		--nr_pages;
 	}
-	D_ASSERT(device, page == NULL);
 	D_ASSERT(device, ds == 0);
+submit:
+	D_ASSERT(device, page == NULL);
 
 	atomic_set(&peer_req->pending_bios, n_bios);
 	do {
@@ -1500,19 +1520,21 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
  * and from receive_Data */
 static struct drbd_peer_request *
 read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
-	      int data_size) __must_hold(local)
+	      struct packet_info *pi) __must_hold(local)
 {
 	struct drbd_device *device = peer_device->device;
 	const sector_t capacity = drbd_get_capacity(device->this_bdev);
 	struct drbd_peer_request *peer_req;
 	struct page *page;
 	int dgs, ds, err;
+	int data_size = pi->size;
 	void *dig_in = peer_device->connection->int_dig_in;
 	void *dig_vv = peer_device->connection->int_dig_vv;
 	unsigned long *data;
+	struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
 
 	dgs = 0;
-	if (peer_device->connection->peer_integrity_tfm) {
+	if (!trim && peer_device->connection->peer_integrity_tfm) {
 		dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
 		/*
 		 * FIXME: Receive the incoming digest into the receive buffer
@@ -1524,9 +1546,15 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 		data_size -= dgs;
 	}
 
+	if (trim) {
+		D_ASSERT(peer_device, data_size == 0);
+		data_size = be32_to_cpu(trim->size);
+	}
+
 	if (!expect(IS_ALIGNED(data_size, 512)))
 		return NULL;
-	if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
+	/* prepare for larger trim requests. */
+	if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
 		return NULL;
 
 	/* even though we trust out peer,
@@ -1542,11 +1570,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
 	 * "criss-cross" setup, that might cause write-out on some other DRBD,
 	 * which in turn might block on the other node at this very place.  */
-	peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO);
+	peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
 	if (!peer_req)
 		return NULL;
 
-	if (!data_size)
+	if (trim)
 		return peer_req;
 
 	ds = data_size;
@@ -1686,12 +1714,12 @@ static int e_end_resync_block(struct drbd_work *w, int unused)
 }
 
 static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
-			    int data_size) __releases(local)
+			    struct packet_info *pi) __releases(local)
 {
 	struct drbd_device *device = peer_device->device;
 	struct drbd_peer_request *peer_req;
 
-	peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size);
+	peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
 	if (!peer_req)
 		goto fail;
 
@@ -1707,7 +1735,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
 	list_add(&peer_req->w.list, &device->sync_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
-	atomic_add(data_size >> 9, &device->rs_sect_ev);
+	atomic_add(pi->size >> 9, &device->rs_sect_ev);
 	if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
 		return 0;
 
@@ -1795,7 +1823,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet
 		/* data is submitted to disk within recv_resync_read.
 		 * corresponding put_ldev done below on error,
 		 * or in drbd_peer_request_endio. */
-		err = recv_resync_read(peer_device, sector, pi->size);
+		err = recv_resync_read(peer_device, sector, pi);
 	} else {
 		if (__ratelimit(&drbd_ratelimit_state))
 			drbd_err(device, "Can not write resync data to local disk.\n");
@@ -2206,7 +2234,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	 */
 
 	sector = be64_to_cpu(p->sector);
-	peer_req = read_in_block(peer_device, p->block_id, sector, pi->size);
+	peer_req = read_in_block(peer_device, p->block_id, sector, pi);
 	if (!peer_req) {
 		put_ldev(device);
 		return -EIO;
@@ -2216,7 +2244,15 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 
 	dp_flags = be32_to_cpu(p->dp_flags);
 	rw |= wire_flags_to_bio(dp_flags);
-	if (peer_req->pages == NULL) {
+	if (pi->cmd == P_TRIM) {
+		struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+		peer_req->flags |= EE_IS_TRIM;
+		if (!blk_queue_discard(q))
+			peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+		D_ASSERT(peer_device, peer_req->i.size > 0);
+		D_ASSERT(peer_device, rw & REQ_DISCARD);
+		D_ASSERT(peer_device, peer_req->pages == NULL);
+	} else if (peer_req->pages == NULL) {
 		D_ASSERT(device, peer_req->i.size == 0);
 		D_ASSERT(device, dp_flags & DP_FLUSH);
 	}
@@ -2252,7 +2288,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 		update_peer_seq(peer_device, peer_seq);
 		spin_lock_irq(&device->resource->req_lock);
 	}
-	list_add(&peer_req->w.list, &device->active_ee);
+	/* if we use the zeroout fallback code, we process synchronously
+	 * and we wait for all pending requests, respectively wait for
+	 * active_ee to become empty in drbd_submit_peer_request();
+	 * better not add ourselves here. */
+	if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
+		list_add(&peer_req->w.list, &device->active_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	if (device->state.conn == C_SYNC_TARGET)
@@ -2451,7 +2492,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
 	 * "criss-cross" setup, that might cause write-out on some other DRBD,
 	 * which in turn might block on the other node at this very place.  */
-	peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO);
+	peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
+			true /* has real payload */, GFP_NOIO);
 	if (!peer_req) {
 		put_ldev(device);
 		return -ENOMEM;
@@ -4438,6 +4480,7 @@ static struct data_cmd drbd_cmd_handler[] = {
 	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
 	[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
 	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
 };
 
 static void drbdd(struct drbd_connection *connection)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 7e633cad745a..5fd4eaee49bc 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -118,7 +118,7 @@ static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __rele
 
 /* writes on behalf of the partner, or resync writes,
  * "submitted" by the receiver, final stage.  */
-static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
 {
 	unsigned long flags = 0;
 	struct drbd_peer_device *peer_device = peer_req->peer_device;
@@ -150,7 +150,9 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel
 
 	do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
 
-	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+	/* FIXME do we want to detach for failed REQ_DISCARD?
+	 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
+	if (peer_req->flags & EE_WAS_ERROR)
 		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
 
@@ -176,10 +178,12 @@ void drbd_peer_request_endio(struct bio *bio, int error)
 	struct drbd_device *device = peer_req->peer_device->device;
 	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 	int is_write = bio_data_dir(bio) == WRITE;
+	int is_discard = !!(bio->bi_rw & REQ_DISCARD);
 
 	if (error && __ratelimit(&drbd_ratelimit_state))
 		drbd_warn(device, "%s: error=%d s=%llus\n",
-				is_write ? "write" : "read", error,
+				is_write ? (is_discard ? "discard" : "write")
+					: "read", error,
 				(unsigned long long)peer_req->i.sector);
 	if (!error && !uptodate) {
 		if (__ratelimit(&drbd_ratelimit_state))
@@ -395,7 +399,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
 	/* GFP_TRY, because if there is no memory available right now, this may
 	 * be rescheduled for later. It is "only" background resync, after all. */
 	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
-				       size, GFP_TRY);
+				       size, true /* has real payload */, GFP_TRY);
 	if (!peer_req)
 		goto defer;
 

From 2f632aeb5302da93f760d965e970600b35907026 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:24 +0200
Subject: [PATCH 19/54] drbd: prepare sending side for REQ_DISCARD

Note that I do NOT call __drbd_chk_io_error for failed REQ_DISCARD.
That may be wrong, though, or needs to differ between EOPNOTSUPP and
other errors...

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_main.c   | 15 +++++++++++++--
 drivers/block/drbd/drbd_req.c    |  7 +++++++
 drivers/block/drbd/drbd_req.h    |  5 ++++-
 drivers/block/drbd/drbd_worker.c |  7 ++++++-
 4 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 05db597401e6..960645c26e6f 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1607,8 +1607,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long b
 		return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
 }
 
-/* Used to send write requests
- * R_PRIMARY -> Peer	(P_DATA)
+/* Used to send write or TRIM aka REQ_DISCARD requests
+ * R_PRIMARY -> Peer	(P_DATA, P_TRIM)
  */
 int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
 {
@@ -1640,6 +1640,16 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 			dp_flags |= DP_SEND_WRITE_ACK;
 	}
 	p->dp_flags = cpu_to_be32(dp_flags);
+
+	if (dp_flags & DP_DISCARD) {
+		struct p_trim *t = (struct p_trim*)p;
+		t->size = cpu_to_be32(req->i.size);
+		err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
+		goto out;
+	}
+
+	/* our digest is still only over the payload.
+	 * TRIM does not carry any payload. */
 	if (dgs)
 		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
 	err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
@@ -1675,6 +1685,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 		     ... Be noisy about digest too large ...
 		} */
 	}
+out:
 	mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */
 
 	return err;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3779c8d2875b..7aadd0906b01 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -522,6 +522,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
 		break;
 
+	case DISCARD_COMPLETED_NOTSUPP:
+	case DISCARD_COMPLETED_WITH_ERROR:
+		/* I'd rather not detach from local disk just because it
+		 * failed a REQ_DISCARD. */
+		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+		break;
+
 	case QUEUE_FOR_NET_READ:
 		/* READ or READA, and
 		 * no local disk,
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 07cc8e8a6e2a..8566cd5866b4 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -110,11 +110,14 @@ enum drbd_req_event {
 	BARRIER_ACKED, /* in protocol A and B */
 	DATA_RECEIVED, /* (remote read) */
 
+	COMPLETED_OK,
 	READ_COMPLETED_WITH_ERROR,
 	READ_AHEAD_COMPLETED_WITH_ERROR,
 	WRITE_COMPLETED_WITH_ERROR,
+	DISCARD_COMPLETED_NOTSUPP,
+	DISCARD_COMPLETED_WITH_ERROR,
+
 	ABORT_DISK_IO,
-	COMPLETED_OK,
 	RESEND,
 	FAIL_FROZEN_DISK_IO,
 	RESTART_FROZEN_DISK_IO,
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 5fd4eaee49bc..26338bedb25b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -267,7 +267,12 @@ void drbd_request_endio(struct bio *bio, int error)
 
 	/* to avoid recursion in __req_mod */
 	if (unlikely(error)) {
-		what = (bio_data_dir(bio) == WRITE)
+		if (bio->bi_rw & REQ_DISCARD)
+			what = (error == -EOPNOTSUPP)
+				? DISCARD_COMPLETED_NOTSUPP
+				: DISCARD_COMPLETED_WITH_ERROR;
+		else
+			what = (bio_data_dir(bio) == WRITE)
 			? WRITE_COMPLETED_WITH_ERROR
 			: (bio_rw(bio) == READ)
 			  ? READ_COMPLETED_WITH_ERROR

From 20c68fdea1646ed746abf19122d7699493927005 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:25 +0200
Subject: [PATCH 20/54] drbd: Enable QUEUE_FLAG_DISCARD only if the peer can
 recieve P_TRIM

Allow the user of REQ_DISCARD.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h      |  1 +
 drivers/block/drbd/drbd_nl.c       | 20 ++++++++++++++++++++
 drivers/block/drbd/drbd_protocol.h |  2 ++
 drivers/block/drbd/drbd_receiver.c | 18 ++++++++++++++----
 4 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 29123dfb5f84..d5f39fda6462 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -618,6 +618,7 @@ struct drbd_connection {
 	struct drbd_socket data;	/* data/barrier/cstate/parameter packets */
 	struct drbd_socket meta;	/* ping/ack (metadata) packets */
 	int agreed_pro_version;		/* actually used protocol version */
+	u32 agreed_features;
 	unsigned long last_received;	/* in jiffies, either socket */
 	unsigned int ko_count;
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index bb3679263ea7..2364b781d9ac 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1133,6 +1133,26 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
 
 	if (get_ldev_if_state(device, D_ATTACHING)) {
 		struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue;
+		struct drbd_connection *connection = first_peer_device(device)->connection;
+
+		if (blk_queue_discard(b) &&
+		    (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
+			/* inherit from backing queue */
+			q->limits.discard_zeroes_data = 1;
+			/* For now, don't allow more than one activity log extent worth of data
+			 * to be discarded in one go. We may need to rework drbd_al_begin_io()
+			 * to allow for even larger discard ranges */
+			q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
+
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+			/* REALLY? Is stacking secdiscard "legal"? */
+			if (blk_queue_secdiscard(b))
+				queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
+		} else {
+			q->limits.max_discard_sectors = 0;
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+			queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q);
+		}
 
 		blk_queue_stack_limits(q, b);
 
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index 478c63d0b6f3..2da9104a3851 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -160,6 +160,8 @@ struct p_block_req {
  *   ReportParams
  */
 
+#define FF_TRIM      1
+
 struct p_connection_features {
 	u32 protocol_min;
 	u32 feature_flags;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 9640b645d5ba..3fa3b78e2ea1 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -46,9 +46,10 @@
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h"
-
 #include "drbd_vli.h"
 
+#define PRO_FEATURES (FF_TRIM)
+
 struct packet_info {
 	enum drbd_packet cmd;
 	unsigned int size;
@@ -3705,6 +3706,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 		put_ldev(device);
 	}
 
+	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
+	drbd_reconsider_max_bio_size(device);
+	/* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
+	   In case we cleared the QUEUE_FLAG_DISCARD from our queue in
+	   drbd_reconsider_max_bio_size(), we can be sure that after
+	   drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
+
 	ddsf = be16_to_cpu(p->dds_flags);
 	if (get_ldev(device)) {
 		dd = drbd_determine_dev_size(device, ddsf, NULL);
@@ -3717,9 +3725,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 		drbd_set_my_capacity(device, p_size);
 	}
 
-	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
-	drbd_reconsider_max_bio_size(device);
-
 	if (get_ldev(device)) {
 		if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
 			device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
@@ -4688,6 +4693,7 @@ static int drbd_send_features(struct drbd_connection *connection)
 	memset(p, 0, sizeof(*p));
 	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
 	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+	p->feature_flags = cpu_to_be32(PRO_FEATURES);
 	return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
 }
 
@@ -4741,10 +4747,14 @@ static int drbd_do_features(struct drbd_connection *connection)
 		goto incompat;
 
 	connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+	connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
 
 	drbd_info(connection, "Handshake successful: "
 	     "Agreed network protocol version %d\n", connection->agreed_pro_version);
 
+	drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
+		  connection->agreed_features & FF_TRIM ? " " : " not ");
+
 	return 1;
 
  incompat:

From 074f4afeb2277bd5ecb9fa7f91eaffa55e262126 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars@linbit.com>
Date: Mon, 28 Apr 2014 18:43:26 +0200
Subject: [PATCH 21/54] drbd: fix a race between start_resync and
 send_and_submit

In the drbd make request function, specifically in
drbd_send_and_submit(), we decide whether we want to send the actual
write request, or only a "set this block out of sync" information.

We do so based on the current connection state, while holding the req_lock.
The connection state is not supposed to change while holding the req_lock.

But in drbd_start_resync, we did change that state anyways,
while only holding the global_state_lock, which is enough to change
sync-after dependencies (paused vs active resync), but
not good enough to change the connection state.

Fix: in drbd_start_resync, first grab the req_lock to serialize with
drbd_send_and_submit(), before grabbing the global_state_lock
to be able to evaluate the sync-after dependencies.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_worker.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 26338bedb25b..34dde10fae48 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1686,11 +1686,15 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 	}
 	clear_bit(B_RS_H_DONE, &device->flags);
 
-	write_lock_irq(&global_state_lock);
+	/* req_lock: serialize with drbd_send_and_submit() and others
+	 * global_state_lock: for stable sync-after dependencies */
+	spin_lock_irq(&device->resource->req_lock);
+	write_lock(&global_state_lock);
 	/* Did some connection breakage or IO error race with us? */
 	if (device->state.conn < C_CONNECTED
 	|| !get_ldev_if_state(device, D_NEGOTIATING)) {
-		write_unlock_irq(&global_state_lock);
+		write_unlock(&global_state_lock);
+		spin_unlock_irq(&device->resource->req_lock);
 		mutex_unlock(device->state_mutex);
 		return;
 	}
@@ -1730,7 +1734,8 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 		}
 		_drbd_pause_after(device);
 	}
-	write_unlock_irq(&global_state_lock);
+	write_unlock(&global_state_lock);
+	spin_unlock_irq(&device->resource->req_lock);
 
 	if (r == SS_SUCCESS) {
 		/* reset rs_last_bcast when a resync or verify is started,

From fa090e708a72f0ea9cbe067fba28cfb2b8b787af Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars@linbit.com>
Date: Mon, 28 Apr 2014 18:43:27 +0200
Subject: [PATCH 22/54] drbd: keep max-bio size during detach/attach on
 disconnected primary

We want to store in persistent meta data what the peer DRBD can handle,
which, due to spreading requests to multiple bios,
may be more than its backing device can handle.

Otherwise, if a disconnected Primary temporarily loses access to its local data
as well, we may accidentally shrink the max-bio setting, portentially causing
already assembled, but not yet processed, application bios to be spuriously
failed due to device limits.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 2364b781d9ac..a187c5b0da27 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1194,8 +1194,14 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
 			peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
 		else
 			peer = DRBD_MAX_BIO_SIZE;
-	}
 
+		/* We may later detach and re-attach on a disconnected Primary.
+		 * Avoid this setting to jump back in that case.
+		 * We want to store what we know the peer DRBD can handle,
+		 * not what the peer IO backend can handle. */
+		if (peer > device->peer_max_bio_size)
+			device->peer_max_bio_size = peer;
+	}
 	new = min(local, peer);
 
 	if (device->state.role == R_PRIMARY && new < now)

From e4d7d6f4d36daff6aad84f96e48debde8e6ed09e Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars@linbit.com>
Date: Mon, 28 Apr 2014 18:43:28 +0200
Subject: [PATCH 23/54] drbd: add back some fairness to AL transactions

When batching more updates to the activity log into single transactions,
we lost the ability for new requests to force themselves into the active
set: all preparation steps became non-blocking, and if all currently
hot extents keep busy, they could starve out new incoming requests
to cold extents for quite a while.

This can only happen if your IO backend accepts more IO operations per
average DRBD replication round trip time than you have al-extents
configured.

If we have incoming requests to cold extents,
at least do one blocking update per transaction.

In an artificial worst-case workload on SSD with an asynchronous 600 ms
replication link, with al-extents = 7 (the minimum we allow), and
concurrent full resynch, without this patch, some write requests have
been observed to be starved for 40 seconds.
With this patch, application observed a worst case latency of twice the
replication round trip time.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_actlog.c |  3 +--
 drivers/block/drbd/drbd_int.h    |  1 +
 drivers/block/drbd/drbd_req.c    | 20 ++++++++++++++++++++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 8fc5d71077bf..05a1780ffa85 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -203,7 +203,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
 
 	BUG_ON(!bdev->md_bdev);
 
-	drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
+	dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
 	     current->comm, current->pid, __func__,
 	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
 	     (void*)_RET_IP_ );
@@ -275,7 +275,6 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval
 	return _al_get(device, first, true);
 }
 
-static
 bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
 {
 	/* for bios crossing activity log extent boundaries,
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index d5f39fda6462..297ba406cc2b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1487,6 +1487,7 @@ extern const char *drbd_conn_str(enum drbd_conns s);
 extern const char *drbd_role_str(enum drbd_role s);
 
 /* drbd_actlog.c */
+extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
 extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate);
 extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 7aadd0906b01..99411bf9d901 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1242,6 +1242,7 @@ void do_submit(struct work_struct *ws)
 		if (list_empty(&incoming))
 			break;
 
+skip_fast_path:
 		wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
 		/* Maybe more was queued, while we prepared the transaction?
 		 * Try to stuff them into this transaction as well.
@@ -1280,6 +1281,25 @@ void do_submit(struct work_struct *ws)
 			list_del_init(&req->tl_requests);
 			drbd_send_and_submit(device, req);
 		}
+
+		/* If all currently hot activity log extents are kept busy by
+		 * incoming requests, we still must not totally starve new
+		 * requests to cold extents. In that case, prepare one request
+		 * in blocking mode. */
+		list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
+			list_del_init(&req->tl_requests);
+			req->rq_state |= RQ_IN_ACT_LOG;
+			if (!drbd_al_begin_io_prepare(device, &req->i)) {
+				/* Corresponding extent was hot after all? */
+				drbd_send_and_submit(device, req);
+			} else {
+				/* Found a request to a cold extent.
+				 * Put on "pending" list,
+				 * and try to cumulate with more. */
+				list_add(&req->tl_requests, &pending);
+				goto skip_fast_path;
+			}
+		}
 	}
 }
 

From f9c78128f833ae3057884ca219259c8ae5db8898 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars@linbit.com>
Date: Mon, 28 Apr 2014 18:43:29 +0200
Subject: [PATCH 24/54] drbd: always implicitly close last epoch when idle

Once our sender thread needs to wait_for_work(),
and actually needs to schedule(), just before we do that,
we already check if it is useful to implicitly close the last epoch.

The condition was too strict: only implicitly close the epoch,
if there have been no new (write) requests at all.

The assumption was that if there were new requests, they would
always be communicated one way or another, and would send necessary
epoch separating barriers explicitly.

This is not always true, e.g. when becoming diskless,
or while explicitly starting a full resync.

The last communicated epoch could stay open for a long time,
locking down corresponding activity log extents.

It is safe to always implicitly send that last barrier, as soon as we
determin that there cannot be more requests in the last communicated
epoch, even if there have been (uncommunicated) new requests in new
epochs meanwhile.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_worker.c | 48 ++++++++++----------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 34dde10fae48..d8f57b6305cd 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1799,34 +1799,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
 	mutex_unlock(device->state_mutex);
 }
 
-/* If the resource already closed the current epoch, but we did not
- * (because we have not yet seen new requests), we should send the
- * corresponding barrier now.  Must be checked within the same spinlock
- * that is used to check for new requests. */
-static bool need_to_send_barrier(struct drbd_connection *connection)
-{
-	if (!connection->send.seen_any_write_yet)
-		return false;
-
-	/* Skip barriers that do not contain any writes.
-	 * This may happen during AHEAD mode. */
-	if (!connection->send.current_epoch_writes)
-		return false;
-
-	/* ->req_lock is held when requests are queued on
-	 * connection->sender_work, and put into ->transfer_log.
-	 * It is also held when ->current_tle_nr is increased.
-	 * So either there are already new requests queued,
-	 * and corresponding barriers will be send there.
-	 * Or nothing new is queued yet, so the difference will be 1.
-	 */
-	if (atomic_read(&connection->current_tle_nr) !=
-	    connection->send.current_epoch_nr + 1)
-		return false;
-
-	return true;
-}
-
 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
 {
 	spin_lock_irq(&queue->q_lock);
@@ -1885,12 +1857,22 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
 			spin_unlock_irq(&connection->resource->req_lock);
 			break;
 		}
-		send_barrier = need_to_send_barrier(connection);
+
+		/* We found nothing new to do, no to-be-communicated request,
+		 * no other work item.  We may still need to close the last
+		 * epoch.  Next incoming request epoch will be connection ->
+		 * current transfer log epoch number.  If that is different
+		 * from the epoch of the last request we communicated, it is
+		 * safe to send the epoch separating barrier now.
+		 */
+		send_barrier =
+			atomic_read(&connection->current_tle_nr) !=
+			connection->send.current_epoch_nr;
 		spin_unlock_irq(&connection->resource->req_lock);
-		if (send_barrier) {
-			drbd_send_barrier(connection);
-			connection->send.current_epoch_nr++;
-		}
+
+		if (send_barrier)
+			maybe_send_barrier(connection,
+					connection->send.current_epoch_nr + 1);
 		schedule();
 		/* may be woken up for other things but new work, too,
 		 * e.g. if the current epoch got closed.

From 67cca286caa6e33f3134bd36834d2484538f4f78 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 28 Apr 2014 18:43:30 +0200
Subject: [PATCH 25/54] drbd: Fix a hole in the challange-response connection
 authentication

In the implementation as it was, the two peers sent each other
a challenge, and expects the challenge hashed with the shared
secret back.

A attacker could simply wait for the challenge of the peer, and
send the same challenge back. Then it waits for the response, and
sends the same response back.

Prevent this by not accepting a challenge from the peer that is
the same as the challenge sent to the peer.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_receiver.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 3fa3b78e2ea1..b6c8aaf4931b 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -4846,6 +4846,12 @@ static int drbd_do_auth(struct drbd_connection *connection)
 		goto fail;
 	}
 
+	if (pi.size < CHALLENGE_LEN) {
+		drbd_err(connection, "AuthChallenge payload too small.\n");
+		rv = -1;
+		goto fail;
+	}
+
 	peers_ch = kmalloc(pi.size, GFP_NOIO);
 	if (peers_ch == NULL) {
 		drbd_err(connection, "kmalloc of peers_ch failed\n");
@@ -4859,6 +4865,12 @@ static int drbd_do_auth(struct drbd_connection *connection)
 		goto fail;
 	}
 
+	if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
+		drbd_err(connection, "Peer presented the same challenge!\n");
+		rv = -1;
+		goto fail;
+	}
+
 	resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
 	response = kmalloc(resp_size, GFP_NOIO);
 	if (response == NULL) {

From 08535466bce6bd91320990b9a614d52a3dc0f21d Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:31 +0200
Subject: [PATCH 26/54] drbd: evaluate disk and network timeout on different
 requests

Just because it is the oldest not yet completed request
does not make it the oldest request waiting for disk.
Or waiting for the peer.

And we completely missed already completed requests
that would still hold references to activity log extents,
waiting only for the barrier ack.

Find two oldest not yet completely processed requests,
one that is still waiting for local completion,
and one that is still waiting for some response from the peer.
These may or may not be the same request object.

Then separately apply the network and disk timeouts, respectively.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_req.c | 47 ++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 99411bf9d901..09803d0d5207 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1353,23 +1353,35 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
 	return limit;
 }
 
-static struct drbd_request *find_oldest_request(struct drbd_connection *connection)
+static void find_oldest_requests(
+		struct drbd_connection *connection,
+		struct drbd_device *device,
+		struct drbd_request **oldest_req_waiting_for_peer,
+		struct drbd_request **oldest_req_waiting_for_disk)
 {
-	/* Walk the transfer log,
-	 * and find the oldest not yet completed request */
 	struct drbd_request *r;
+	*oldest_req_waiting_for_peer = NULL;
+	*oldest_req_waiting_for_disk = NULL;
 	list_for_each_entry(r, &connection->transfer_log, tl_requests) {
-		if (atomic_read(&r->completion_ref))
-			return r;
+		const unsigned s = r->rq_state;
+		if (!*oldest_req_waiting_for_peer
+		&& ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
+			*oldest_req_waiting_for_peer = r;
+
+		if (!*oldest_req_waiting_for_disk
+		&& (s & RQ_LOCAL_PENDING) && r->device == device)
+			*oldest_req_waiting_for_disk = r;
+
+		if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
+			break;
 	}
-	return NULL;
 }
 
 void request_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
 	struct drbd_connection *connection = first_peer_device(device)->connection;
-	struct drbd_request *req; /* oldest request */
+	struct drbd_request *req_disk, *req_peer; /* oldest request */
 	struct net_conf *nc;
 	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
 	unsigned long now;
@@ -1393,8 +1405,8 @@ void request_timer_fn(unsigned long data)
 	now = jiffies;
 
 	spin_lock_irq(&device->resource->req_lock);
-	req = find_oldest_request(connection);
-	if (!req) {
+	find_oldest_requests(connection, device, &req_peer, &req_disk);
+	if (req_peer == NULL && req_disk == NULL) {
 		spin_unlock_irq(&device->resource->req_lock);
 		mod_timer(&device->request_timer, now + et);
 		return;
@@ -1416,19 +1428,26 @@ void request_timer_fn(unsigned long data)
 	 * ~198 days with 250 HZ, we have a window where the timeout would need
 	 * to expire twice (worst case) to become effective. Good enough.
 	 */
-	if (ent && req->rq_state & RQ_NET_PENDING &&
-		 time_after(now, req->start_time + ent) &&
+	if (ent && req_peer &&
+		 time_after(now, req_peer->start_time + ent) &&
 		!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
 		drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
 		_drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
 	}
-	if (dt && req->rq_state & RQ_LOCAL_PENDING && req->device == device &&
-		 time_after(now, req->start_time + dt) &&
+	if (dt && req_disk &&
+		 time_after(now, req_disk->start_time + dt) &&
 		!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
 		drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
 		__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
 	}
-	nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
+
+	/* Reschedule timer for the nearest not already expired timeout.
+	 * Fallback to now + min(effective network timeout, disk timeout). */
+	ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
+		? req_peer->start_time + ent : now + et;
+	dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
+		? req_disk->start_time + dt : now + et;
+	nt = time_before(ent, dt) ? ent : dt;
 	spin_unlock_irq(&connection->resource->req_lock);
 	mod_timer(&device->request_timer, nt);
 }

From c1b3156f121fd301191e0b4c5fa2fec42cd17871 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 28 Apr 2014 18:43:32 +0200
Subject: [PATCH 27/54] drbd: use blk_set_stacking_limits()

...instead directly assigning to q->limits.discard_zeroes_data

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 12 ++++++------
 include/linux/drbd.h         |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index a187c5b0da27..b4fc401b587f 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1114,15 +1114,18 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
 	struct request_queue * const q = device->rq_queue;
 	unsigned int max_hw_sectors = max_bio_size >> 9;
 	unsigned int max_segments = 0;
+	struct request_queue *b = NULL;
 
 	if (get_ldev_if_state(device, D_ATTACHING)) {
-		struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue;
+		b = device->ldev->backing_bdev->bd_disk->queue;
 
 		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
 		rcu_read_lock();
 		max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
 		rcu_read_unlock();
-		put_ldev(device);
+
+		blk_set_stacking_limits(&q->limits);
+		blk_queue_max_write_same_sectors(q, 0);
 	}
 
 	blk_queue_logical_block_size(q, 512);
@@ -1131,14 +1134,11 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
 	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
 	blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
 
-	if (get_ldev_if_state(device, D_ATTACHING)) {
-		struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue;
+	if (b) {
 		struct drbd_connection *connection = first_peer_device(device)->connection;
 
 		if (blk_queue_discard(b) &&
 		    (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
-			/* inherit from backing queue */
-			q->limits.discard_zeroes_data = 1;
 			/* For now, don't allow more than one activity log extent worth of data
 			 * to be discarded in one go. We may need to rework drbd_al_begin_io()
 			 * to allow for even larger discard ranges */
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 3dbe9bd57a09..fffd4b8563cb 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -52,7 +52,7 @@
 #endif
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.3"
+#define REL_VERSION "8.4.4"
 #define API_VERSION 1
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 101

From 02df6fe145715f1d3858c0c65aed991f148b70b4 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 28 Apr 2014 18:43:33 +0200
Subject: [PATCH 28/54] drbd: Test cstate while holding req_lock

In case a connection transitions into C_TIMEOUT within the timer
function (request_timer_fn()) we need to make sure that the receiver
thread (potentially running on a different CPU) sees the updated
cstate later on.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 3 ++-
 include/linux/drbd.h         | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index b4fc401b587f..f4d3aff89aa1 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -442,12 +442,13 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
 	char *ex_to_string;
 	int r;
 
+	spin_lock_irq(&connection->resource->req_lock);
 	if (connection->cstate >= C_WF_REPORT_PARAMS) {
 		drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
+		spin_unlock_irq(&connection->resource->req_lock);
 		return false;
 	}
 
-	spin_lock_irq(&connection->resource->req_lock);
 	connect_cnt = connection->connect_cnt;
 	spin_unlock_irq(&connection->resource->req_lock);
 
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index fffd4b8563cb..3dbe9bd57a09 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -52,7 +52,7 @@
 #endif
 
 extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.4.4"
+#define REL_VERSION "8.4.3"
 #define API_VERSION 1
 #define PRO_VERSION_MIN 86
 #define PRO_VERSION_MAX 101

From babea49ebe010d0a533b5db20fa63c327402a71c Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Mon, 28 Apr 2014 18:43:34 +0200
Subject: [PATCH 29/54] drbd: Allow attaching of a newly created device to any
 backing device

A newly created device was never exposed before, i.e. has a
exposed_data_uuid of 0. Then it is valid to attach to any current_uuid
of a backing device (of course also to a newly created one (4))

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_nl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index f4d3aff89aa1..1b35c45c92b7 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1662,7 +1662,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (device->state.conn < C_CONNECTED &&
-	    device->state.role == R_PRIMARY &&
+	    device->state.role == R_PRIMARY && device->ed_uuid &&
 	    (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
 		drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
 		    (unsigned long long)device->ed_uuid);

From ec4a340789be16831ae96be5f7552238a7a6e903 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Mon, 28 Apr 2014 18:43:35 +0200
Subject: [PATCH 30/54] drbd: use list_first_entry_or_null in
 first_peer_device/first_connection

If there are no peer_devices or connections, I'd rather have NULL
than some "arbitrary" address pretending to point to a struct.

Helps to avoid hard to debug symptoms, in case we ever try to use
and dereference a drbd_connection or drbd_peer_device
where we in fact don't have any connection at all.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/drbd/drbd_int.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 297ba406cc2b..a76ceb344d64 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -853,7 +853,7 @@ static inline struct drbd_device *minor_to_device(unsigned int minor)
 
 static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
 {
-	return list_first_entry(&device->peer_devices, struct drbd_peer_device, peer_devices);
+	return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
 }
 
 #define for_each_resource(resource, _resources) \
@@ -2222,7 +2222,7 @@ static inline void drbd_md_flush(struct drbd_device *device)
 
 static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
 {
-	return list_first_entry(&resource->connections,
+	return list_first_entry_or_null(&resource->connections,
 				struct drbd_connection, connections);
 }
 

From fc27691f3537a0df087214322467b642d1f6dedb Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Thu, 1 May 2014 15:12:36 +0800
Subject: [PATCH 31/54] block: null_blk: fix use after free

entry(cmd->ll_list) may belong to new request once end_cmd()
returns, so fix the bug with the patch.

Without the change, it is easy to observe oops when
doing null_blk(timer) test.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 8e7e3a0b0d24..e932398588aa 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -203,8 +203,8 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
 		entry = llist_reverse_order(entry);
 		do {
 			cmd = container_of(entry, struct nullb_cmd, ll_list);
-			end_cmd(cmd);
 			entry = entry->next;
+			end_cmd(cmd);
 		} while (entry);
 	}
 

From 5944b2ce5778f2a9c5eb4805cf8569feb2b4ed49 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:02 -0700
Subject: [PATCH 32/54] cdrom: convert cdinfo to cd_dbg

It's a debugging message, mark it so.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 241 +++++++++++++++++++++---------------------
 1 file changed, 121 insertions(+), 120 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 8a3aff724d98..3828c9212d88 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -312,17 +312,17 @@ static const char *mrw_format_status[] = {
 
 static const char *mrw_address_space[] = { "DMA", "GAA" };
 
-#if (ERRLOGMASK!=CD_NOTHING)
-#define cdinfo(type, fmt, args...)			\
+#if (ERRLOGMASK != CD_NOTHING)
+#define cd_dbg(type, fmt, ...)				\
 do {							\
 	if ((ERRLOGMASK & type) || debug == 1)		\
-		pr_info(fmt, ##args);			\
+		pr_debug(fmt, ##__VA_ARGS__);		\
 } while (0)
 #else
-#define cdinfo(type, fmt, args...)			\
+#define cd_dbg(type, fmt, ...)				\
 do {							\
 	if (0 && (ERRLOGMASK & type) || debug == 1)	\
-		pr_info(fmt, ##args);			\
+		pr_debug(fmt, ##__VA_ARGS__);		\
 } while (0)
 #endif
 
@@ -395,7 +395,7 @@ int register_cdrom(struct cdrom_device_info *cdi)
         struct cdrom_device_ops *cdo = cdi->ops;
         int *change_capability = (int *)&cdo->capability; /* hack */
 
-	cdinfo(CD_OPEN, "entering register_cdrom\n"); 
+	cd_dbg(CD_OPEN, "entering register_cdrom\n");
 
 	if (cdo->open == NULL || cdo->release == NULL)
 		return -EINVAL;
@@ -439,7 +439,7 @@ int register_cdrom(struct cdrom_device_info *cdi)
 	if (!cdo->generic_packet)
 		cdo->generic_packet = cdrom_dummy_generic_packet;
 
-	cdinfo(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name);
+	cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name);
 	mutex_lock(&cdrom_mutex);
 	list_add(&cdi->list, &cdrom_list);
 	mutex_unlock(&cdrom_mutex);
@@ -449,7 +449,7 @@ int register_cdrom(struct cdrom_device_info *cdi)
 
 void unregister_cdrom(struct cdrom_device_info *cdi)
 {
-	cdinfo(CD_OPEN, "entering unregister_cdrom\n"); 
+	cd_dbg(CD_OPEN, "entering unregister_cdrom\n");
 
 	mutex_lock(&cdrom_mutex);
 	list_del(&cdi->list);
@@ -459,7 +459,7 @@ void unregister_cdrom(struct cdrom_device_info *cdi)
 		cdi->exit(cdi);
 
 	cdi->ops->n_minors--;
-	cdinfo(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name);
+	cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name);
 }
 
 int cdrom_get_media_event(struct cdrom_device_info *cdi,
@@ -839,7 +839,7 @@ static int cdrom_ram_open_write(struct cdrom_device_info *cdi)
 	else if (CDF_RWRT == be16_to_cpu(rfd.feature_code))
 		ret = !rfd.curr;
 
-	cdinfo(CD_OPEN, "can open for random write\n");
+	cd_dbg(CD_OPEN, "can open for random write\n");
 	return ret;
 }
 
@@ -928,12 +928,12 @@ static void cdrom_dvd_rw_close_write(struct cdrom_device_info *cdi)
 	struct packet_command cgc;
 
 	if (cdi->mmc3_profile != 0x1a) {
-		cdinfo(CD_CLOSE, "%s: No DVD+RW\n", cdi->name);
+		cd_dbg(CD_CLOSE, "%s: No DVD+RW\n", cdi->name);
 		return;
 	}
 
 	if (!cdi->media_written) {
-		cdinfo(CD_CLOSE, "%s: DVD+RW media clean\n", cdi->name);
+		cd_dbg(CD_CLOSE, "%s: DVD+RW media clean\n", cdi->name);
 		return;
 	}
 
@@ -981,7 +981,7 @@ int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t
 {
 	int ret;
 
-	cdinfo(CD_OPEN, "entering cdrom_open\n"); 
+	cd_dbg(CD_OPEN, "entering cdrom_open\n");
 
 	/* open is event synchronization point, check events first */
 	check_disk_change(bdev);
@@ -1010,13 +1010,13 @@ int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t
 	if (ret)
 		goto err;
 
-	cdinfo(CD_OPEN, "Use count for \"/dev/%s\" now %d\n",
-			cdi->name, cdi->use_count);
+	cd_dbg(CD_OPEN, "Use count for \"/dev/%s\" now %d\n",
+	       cdi->name, cdi->use_count);
 	return 0;
 err_release:
 	if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) {
 		cdi->ops->lock_door(cdi, 0);
-		cdinfo(CD_OPEN, "door unlocked.\n");
+		cd_dbg(CD_OPEN, "door unlocked\n");
 	}
 	cdi->ops->release(cdi);
 err:
@@ -1030,21 +1030,21 @@ int open_for_data(struct cdrom_device_info * cdi)
 	int ret;
 	struct cdrom_device_ops *cdo = cdi->ops;
 	tracktype tracks;
-	cdinfo(CD_OPEN, "entering open_for_data\n");
+	cd_dbg(CD_OPEN, "entering open_for_data\n");
 	/* Check if the driver can report drive status.  If it can, we
 	   can do clever things.  If it can't, well, we at least tried! */
 	if (cdo->drive_status != NULL) {
 		ret = cdo->drive_status(cdi, CDSL_CURRENT);
-		cdinfo(CD_OPEN, "drive_status=%d\n", ret); 
+		cd_dbg(CD_OPEN, "drive_status=%d\n", ret);
 		if (ret == CDS_TRAY_OPEN) {
-			cdinfo(CD_OPEN, "the tray is open...\n"); 
+			cd_dbg(CD_OPEN, "the tray is open...\n");
 			/* can/may i close it? */
 			if (CDROM_CAN(CDC_CLOSE_TRAY) &&
 			    cdi->options & CDO_AUTO_CLOSE) {
-				cdinfo(CD_OPEN, "trying to close the tray.\n"); 
+				cd_dbg(CD_OPEN, "trying to close the tray\n");
 				ret=cdo->tray_move(cdi,0);
 				if (ret) {
-					cdinfo(CD_OPEN, "bummer. tried to close the tray but failed.\n"); 
+					cd_dbg(CD_OPEN, "bummer. tried to close the tray but failed.\n");
 					/* Ignore the error from the low
 					level driver.  We don't care why it
 					couldn't close the tray.  We only care 
@@ -1054,19 +1054,19 @@ int open_for_data(struct cdrom_device_info * cdi)
 					goto clean_up_and_return;
 				}
 			} else {
-				cdinfo(CD_OPEN, "bummer. this drive can't close the tray.\n"); 
+				cd_dbg(CD_OPEN, "bummer. this drive can't close the tray.\n");
 				ret=-ENOMEDIUM;
 				goto clean_up_and_return;
 			}
 			/* Ok, the door should be closed now.. Check again */
 			ret = cdo->drive_status(cdi, CDSL_CURRENT);
 			if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) {
-				cdinfo(CD_OPEN, "bummer. the tray is still not closed.\n"); 
-				cdinfo(CD_OPEN, "tray might not contain a medium.\n");
+				cd_dbg(CD_OPEN, "bummer. the tray is still not closed.\n");
+				cd_dbg(CD_OPEN, "tray might not contain a medium\n");
 				ret=-ENOMEDIUM;
 				goto clean_up_and_return;
 			}
-			cdinfo(CD_OPEN, "the tray is now closed.\n"); 
+			cd_dbg(CD_OPEN, "the tray is now closed\n");
 		}
 		/* the door should be closed now, check for the disc */
 		ret = cdo->drive_status(cdi, CDSL_CURRENT);
@@ -1077,7 +1077,7 @@ int open_for_data(struct cdrom_device_info * cdi)
 	}
 	cdrom_count_tracks(cdi, &tracks);
 	if (tracks.error == CDS_NO_DISC) {
-		cdinfo(CD_OPEN, "bummer. no disc.\n");
+		cd_dbg(CD_OPEN, "bummer. no disc.\n");
 		ret=-ENOMEDIUM;
 		goto clean_up_and_return;
 	}
@@ -1087,34 +1087,34 @@ int open_for_data(struct cdrom_device_info * cdi)
 		if (cdi->options & CDO_CHECK_TYPE) {
 		    /* give people a warning shot, now that CDO_CHECK_TYPE
 		       is the default case! */
-		    cdinfo(CD_OPEN, "bummer. wrong media type.\n"); 
-		    cdinfo(CD_WARNING, "pid %d must open device O_NONBLOCK!\n",
-					(unsigned int)task_pid_nr(current));
+		    cd_dbg(CD_OPEN, "bummer. wrong media type.\n");
+		    cd_dbg(CD_WARNING, "pid %d must open device O_NONBLOCK!\n",
+			   (unsigned int)task_pid_nr(current));
 		    ret=-EMEDIUMTYPE;
 		    goto clean_up_and_return;
 		}
 		else {
-		    cdinfo(CD_OPEN, "wrong media type, but CDO_CHECK_TYPE not set.\n");
+		    cd_dbg(CD_OPEN, "wrong media type, but CDO_CHECK_TYPE not set\n");
 		}
 	}
 
-	cdinfo(CD_OPEN, "all seems well, opening the device.\n"); 
+	cd_dbg(CD_OPEN, "all seems well, opening the devicen");
 
 	/* all seems well, we can open the device */
 	ret = cdo->open(cdi, 0); /* open for data */
-	cdinfo(CD_OPEN, "opening the device gave me %d.\n", ret); 
+	cd_dbg(CD_OPEN, "opening the device gave me %d\n", ret);
 	/* After all this careful checking, we shouldn't have problems
 	   opening the device, but we don't want the device locked if 
 	   this somehow fails... */
 	if (ret) {
-		cdinfo(CD_OPEN, "open device failed.\n"); 
+		cd_dbg(CD_OPEN, "open device failed\n");
 		goto clean_up_and_return;
 	}
 	if (CDROM_CAN(CDC_LOCK) && (cdi->options & CDO_LOCK)) {
 			cdo->lock_door(cdi, 1);
-			cdinfo(CD_OPEN, "door locked.\n");
+			cd_dbg(CD_OPEN, "door locked\n");
 	}
-	cdinfo(CD_OPEN, "device opened successfully.\n"); 
+	cd_dbg(CD_OPEN, "device opened successfully\n");
 	return ret;
 
 	/* Something failed.  Try to unlock the drive, because some drivers
@@ -1123,10 +1123,10 @@ int open_for_data(struct cdrom_device_info * cdi)
 	This ensures that the drive gets unlocked after a mount fails.  This 
 	is a goto to avoid bloating the driver with redundant code. */ 
 clean_up_and_return:
-	cdinfo(CD_OPEN, "open failed.\n"); 
+	cd_dbg(CD_OPEN, "open failed\n");
 	if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) {
 			cdo->lock_door(cdi, 0);
-			cdinfo(CD_OPEN, "door unlocked.\n");
+			cd_dbg(CD_OPEN, "door unlocked\n");
 	}
 	return ret;
 }
@@ -1139,21 +1139,21 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi,
 {
         int ret;
 	tracktype tracks;
-	cdinfo(CD_OPEN, "entering check_for_audio_disc\n");
+	cd_dbg(CD_OPEN, "entering check_for_audio_disc\n");
 	if (!(cdi->options & CDO_CHECK_TYPE))
 		return 0;
 	if (cdo->drive_status != NULL) {
 		ret = cdo->drive_status(cdi, CDSL_CURRENT);
-		cdinfo(CD_OPEN, "drive_status=%d\n", ret); 
+		cd_dbg(CD_OPEN, "drive_status=%d\n", ret);
 		if (ret == CDS_TRAY_OPEN) {
-			cdinfo(CD_OPEN, "the tray is open...\n"); 
+			cd_dbg(CD_OPEN, "the tray is open...\n");
 			/* can/may i close it? */
 			if (CDROM_CAN(CDC_CLOSE_TRAY) &&
 			    cdi->options & CDO_AUTO_CLOSE) {
-				cdinfo(CD_OPEN, "trying to close the tray.\n"); 
+				cd_dbg(CD_OPEN, "trying to close the tray\n");
 				ret=cdo->tray_move(cdi,0);
 				if (ret) {
-					cdinfo(CD_OPEN, "bummer. tried to close tray but failed.\n"); 
+					cd_dbg(CD_OPEN, "bummer. tried to close tray but failed.\n");
 					/* Ignore the error from the low
 					level driver.  We don't care why it
 					couldn't close the tray.  We only care 
@@ -1162,20 +1162,20 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi,
 					return -ENOMEDIUM;
 				}
 			} else {
-				cdinfo(CD_OPEN, "bummer. this driver can't close the tray.\n"); 
+				cd_dbg(CD_OPEN, "bummer. this driver can't close the tray.\n");
 				return -ENOMEDIUM;
 			}
 			/* Ok, the door should be closed now.. Check again */
 			ret = cdo->drive_status(cdi, CDSL_CURRENT);
 			if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) {
-				cdinfo(CD_OPEN, "bummer. the tray is still not closed.\n"); 
+				cd_dbg(CD_OPEN, "bummer. the tray is still not closed.\n");
 				return -ENOMEDIUM;
 			}	
 			if (ret!=CDS_DISC_OK) {
-				cdinfo(CD_OPEN, "bummer. disc isn't ready.\n"); 
+				cd_dbg(CD_OPEN, "bummer. disc isn't ready.\n");
 				return -EIO;
 			}	
-			cdinfo(CD_OPEN, "the tray is now closed.\n"); 
+			cd_dbg(CD_OPEN, "the tray is now closed\n");
 		}	
 	}
 	cdrom_count_tracks(cdi, &tracks);
@@ -1193,17 +1193,18 @@ void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode)
 	struct cdrom_device_ops *cdo = cdi->ops;
 	int opened_for_data;
 
-	cdinfo(CD_CLOSE, "entering cdrom_release\n");
+	cd_dbg(CD_CLOSE, "entering cdrom_release\n");
 
 	if (cdi->use_count > 0)
 		cdi->use_count--;
 
 	if (cdi->use_count == 0) {
-		cdinfo(CD_CLOSE, "Use count for \"/dev/%s\" now zero\n", cdi->name);
+		cd_dbg(CD_CLOSE, "Use count for \"/dev/%s\" now zero\n",
+		       cdi->name);
 		cdrom_dvd_rw_close_write(cdi);
 
 		if ((cdo->capability & CDC_LOCK) && !cdi->keeplocked) {
-			cdinfo(CD_CLOSE, "Unlocking door!\n");
+			cd_dbg(CD_CLOSE, "Unlocking door!\n");
 			cdo->lock_door(cdi, 0);
 		}
 	}
@@ -1262,7 +1263,7 @@ static int cdrom_slot_status(struct cdrom_device_info *cdi, int slot)
 	struct cdrom_changer_info *info;
 	int ret;
 
-	cdinfo(CD_CHANGER, "entering cdrom_slot_status()\n"); 
+	cd_dbg(CD_CHANGER, "entering cdrom_slot_status()\n");
 	if (cdi->sanyo_slot)
 		return CDS_NO_INFO;
 	
@@ -1292,7 +1293,7 @@ int cdrom_number_of_slots(struct cdrom_device_info *cdi)
 	int nslots = 1;
 	struct cdrom_changer_info *info;
 
-	cdinfo(CD_CHANGER, "entering cdrom_number_of_slots()\n"); 
+	cd_dbg(CD_CHANGER, "entering cdrom_number_of_slots()\n");
 	/* cdrom_read_mech_status requires a valid value for capacity: */
 	cdi->capacity = 0; 
 
@@ -1313,7 +1314,7 @@ static int cdrom_load_unload(struct cdrom_device_info *cdi, int slot)
 {
 	struct packet_command cgc;
 
-	cdinfo(CD_CHANGER, "entering cdrom_load_unload()\n"); 
+	cd_dbg(CD_CHANGER, "entering cdrom_load_unload()\n");
 	if (cdi->sanyo_slot && slot < 0)
 		return 0;
 
@@ -1342,7 +1343,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
 	int curslot;
 	int ret;
 
-	cdinfo(CD_CHANGER, "entering cdrom_select_disc()\n"); 
+	cd_dbg(CD_CHANGER, "entering cdrom_select_disc()\n");
 	if (!CDROM_CAN(CDC_SELECT_DISC))
 		return -EDRIVE_CANT_DO_THIS;
 
@@ -1487,7 +1488,7 @@ static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype* tracks)
 	tracks->cdi=0;
 	tracks->xa=0;
 	tracks->error=0;
-	cdinfo(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n"); 
+	cd_dbg(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n");
 	/* Grab the TOC header so we can see how many tracks there are */
 	if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header))) {
 		if (ret == -ENOMEDIUM)
@@ -1513,12 +1514,12 @@ static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype* tracks)
 			tracks->data++;
 		} else
 		    tracks->audio++;
-		cdinfo(CD_COUNT_TRACKS, "track %d: format=%d, ctrl=%d\n",
+		cd_dbg(CD_COUNT_TRACKS, "track %d: format=%d, ctrl=%d\n",
 		       i, entry.cdte_format, entry.cdte_ctrl);
 	}	
-	cdinfo(CD_COUNT_TRACKS, "disc has %d tracks: %d=audio %d=data %d=Cd-I %d=XA\n", 
-		header.cdth_trk1, tracks->audio, tracks->data, 
-		tracks->cdi, tracks->xa);
+	cd_dbg(CD_COUNT_TRACKS, "disc has %d tracks: %d=audio %d=data %d=Cd-I %d=XA\n",
+	       header.cdth_trk1, tracks->audio, tracks->data,
+	       tracks->cdi, tracks->xa);
 }	
 
 /* Requests to the low-level drivers will /always/ be done in the
@@ -1632,7 +1633,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 	switch (ai->type) {
 	/* LU data send */
 	case DVD_LU_SEND_AGID:
-		cdinfo(CD_DVD, "entering DVD_LU_SEND_AGID\n"); 
+		cd_dbg(CD_DVD, "entering DVD_LU_SEND_AGID\n");
 		cgc.quiet = 1;
 		setup_report_key(&cgc, ai->lsa.agid, 0);
 
@@ -1644,7 +1645,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 		break;
 
 	case DVD_LU_SEND_KEY1:
-		cdinfo(CD_DVD, "entering DVD_LU_SEND_KEY1\n"); 
+		cd_dbg(CD_DVD, "entering DVD_LU_SEND_KEY1\n");
 		setup_report_key(&cgc, ai->lsk.agid, 2);
 
 		if ((ret = cdo->generic_packet(cdi, &cgc)))
@@ -1655,7 +1656,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 		break;
 
 	case DVD_LU_SEND_CHALLENGE:
-		cdinfo(CD_DVD, "entering DVD_LU_SEND_CHALLENGE\n"); 
+		cd_dbg(CD_DVD, "entering DVD_LU_SEND_CHALLENGE\n");
 		setup_report_key(&cgc, ai->lsc.agid, 1);
 
 		if ((ret = cdo->generic_packet(cdi, &cgc)))
@@ -1667,7 +1668,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 
 	/* Post-auth key */
 	case DVD_LU_SEND_TITLE_KEY:
-		cdinfo(CD_DVD, "entering DVD_LU_SEND_TITLE_KEY\n"); 
+		cd_dbg(CD_DVD, "entering DVD_LU_SEND_TITLE_KEY\n");
 		cgc.quiet = 1;
 		setup_report_key(&cgc, ai->lstk.agid, 4);
 		cgc.cmd[5] = ai->lstk.lba;
@@ -1686,7 +1687,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 		break;
 
 	case DVD_LU_SEND_ASF:
-		cdinfo(CD_DVD, "entering DVD_LU_SEND_ASF\n"); 
+		cd_dbg(CD_DVD, "entering DVD_LU_SEND_ASF\n");
 		setup_report_key(&cgc, ai->lsasf.agid, 5);
 		
 		if ((ret = cdo->generic_packet(cdi, &cgc)))
@@ -1697,7 +1698,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 
 	/* LU data receive (LU changes state) */
 	case DVD_HOST_SEND_CHALLENGE:
-		cdinfo(CD_DVD, "entering DVD_HOST_SEND_CHALLENGE\n"); 
+		cd_dbg(CD_DVD, "entering DVD_HOST_SEND_CHALLENGE\n");
 		setup_send_key(&cgc, ai->hsc.agid, 1);
 		buf[1] = 0xe;
 		copy_chal(&buf[4], ai->hsc.chal);
@@ -1709,7 +1710,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 		break;
 
 	case DVD_HOST_SEND_KEY2:
-		cdinfo(CD_DVD, "entering DVD_HOST_SEND_KEY2\n"); 
+		cd_dbg(CD_DVD, "entering DVD_HOST_SEND_KEY2\n");
 		setup_send_key(&cgc, ai->hsk.agid, 3);
 		buf[1] = 0xa;
 		copy_key(&buf[4], ai->hsk.key);
@@ -1724,7 +1725,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 	/* Misc */
 	case DVD_INVALIDATE_AGID:
 		cgc.quiet = 1;
-		cdinfo(CD_DVD, "entering DVD_INVALIDATE_AGID\n"); 
+		cd_dbg(CD_DVD, "entering DVD_INVALIDATE_AGID\n");
 		setup_report_key(&cgc, ai->lsa.agid, 0x3f);
 		if ((ret = cdo->generic_packet(cdi, &cgc)))
 			return ret;
@@ -1732,7 +1733,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 
 	/* Get region settings */
 	case DVD_LU_SEND_RPC_STATE:
-		cdinfo(CD_DVD, "entering DVD_LU_SEND_RPC_STATE\n");
+		cd_dbg(CD_DVD, "entering DVD_LU_SEND_RPC_STATE\n");
 		setup_report_key(&cgc, 0, 8);
 		memset(&rpc_state, 0, sizeof(rpc_state_t));
 		cgc.buffer = (char *) &rpc_state;
@@ -1749,7 +1750,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 
 	/* Set region settings */
 	case DVD_HOST_SEND_RPC_STATE:
-		cdinfo(CD_DVD, "entering DVD_HOST_SEND_RPC_STATE\n");
+		cd_dbg(CD_DVD, "entering DVD_HOST_SEND_RPC_STATE\n");
 		setup_send_key(&cgc, 0, 6);
 		buf[1] = 6;
 		buf[4] = ai->hrpcs.pdrc;
@@ -1759,7 +1760,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
 		break;
 
 	default:
-		cdinfo(CD_WARNING, "Invalid DVD key ioctl (%d)\n", ai->type);
+		cd_dbg(CD_WARNING, "Invalid DVD key ioctl (%d)\n", ai->type);
 		return -ENOTTY;
 	}
 
@@ -1891,7 +1892,8 @@ static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s,
 
 	s->bca.len = buf[0] << 8 | buf[1];
 	if (s->bca.len < 12 || s->bca.len > 188) {
-		cdinfo(CD_WARNING, "Received invalid BCA length (%d)\n", s->bca.len);
+		cd_dbg(CD_WARNING, "Received invalid BCA length (%d)\n",
+		       s->bca.len);
 		ret = -EIO;
 		goto out;
 	}
@@ -1927,14 +1929,13 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s,
 
 	s->manufact.len = buf[0] << 8 | buf[1];
 	if (s->manufact.len < 0) {
-		cdinfo(CD_WARNING, "Received invalid manufacture info length"
-				   " (%d)\n", s->manufact.len);
+		cd_dbg(CD_WARNING, "Received invalid manufacture info length (%d)\n",
+		       s->manufact.len);
 		ret = -EIO;
 	} else {
 		if (s->manufact.len > 2048) {
-			cdinfo(CD_WARNING, "Received invalid manufacture info "
-					"length (%d): truncating to 2048\n",
-					s->manufact.len);
+			cd_dbg(CD_WARNING, "Received invalid manufacture info length (%d): truncating to 2048\n",
+			       s->manufact.len);
 			s->manufact.len = 2048;
 		}
 		memcpy(s->manufact.value, &buf[4], s->manufact.len);
@@ -1965,8 +1966,8 @@ static int dvd_read_struct(struct cdrom_device_info *cdi, dvd_struct *s,
 		return dvd_read_manufact(cdi, s, cgc);
 		
 	default:
-		cdinfo(CD_WARNING, ": Invalid DVD structure read requested (%d)\n",
-					s->type);
+		cd_dbg(CD_WARNING, ": Invalid DVD structure read requested (%d)\n",
+		       s->type);
 		return -EINVAL;
 	}
 }
@@ -2255,7 +2256,7 @@ static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi,
 	u8 requested_format;
 	int ret;
 
-	cdinfo(CD_DO_IOCTL, "entering CDROMMULTISESSION\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMMULTISESSION\n");
 
 	if (!(cdi->ops->capability & CDC_MULTI_SESSION))
 		return -ENOSYS;
@@ -2277,13 +2278,13 @@ static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi,
 	if (copy_to_user(argp, &ms_info, sizeof(ms_info)))
 		return -EFAULT;
 
-	cdinfo(CD_DO_IOCTL, "CDROMMULTISESSION successful\n");
+	cd_dbg(CD_DO_IOCTL, "CDROMMULTISESSION successful\n");
 	return 0;
 }
 
 static int cdrom_ioctl_eject(struct cdrom_device_info *cdi)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROMEJECT\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMEJECT\n");
 
 	if (!CDROM_CAN(CDC_OPEN_TRAY))
 		return -ENOSYS;
@@ -2300,7 +2301,7 @@ static int cdrom_ioctl_eject(struct cdrom_device_info *cdi)
 
 static int cdrom_ioctl_closetray(struct cdrom_device_info *cdi)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROMCLOSETRAY\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMCLOSETRAY\n");
 
 	if (!CDROM_CAN(CDC_CLOSE_TRAY))
 		return -ENOSYS;
@@ -2310,7 +2311,7 @@ static int cdrom_ioctl_closetray(struct cdrom_device_info *cdi)
 static int cdrom_ioctl_eject_sw(struct cdrom_device_info *cdi,
 		unsigned long arg)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROMEJECT_SW\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMEJECT_SW\n");
 
 	if (!CDROM_CAN(CDC_OPEN_TRAY))
 		return -ENOSYS;
@@ -2329,7 +2330,7 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi,
 	struct cdrom_changer_info *info;
 	int ret;
 
-	cdinfo(CD_DO_IOCTL, "entering CDROM_MEDIA_CHANGED\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_MEDIA_CHANGED\n");
 
 	if (!CDROM_CAN(CDC_MEDIA_CHANGED))
 		return -ENOSYS;
@@ -2355,7 +2356,7 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi,
 static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi,
 		unsigned long arg)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROM_SET_OPTIONS\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_SET_OPTIONS\n");
 
 	/*
 	 * Options need to be in sync with capability.
@@ -2383,7 +2384,7 @@ static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi,
 static int cdrom_ioctl_clear_options(struct cdrom_device_info *cdi,
 		unsigned long arg)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROM_CLEAR_OPTIONS\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_CLEAR_OPTIONS\n");
 
 	cdi->options &= ~(int) arg;
 	return cdi->options;
@@ -2392,7 +2393,7 @@ static int cdrom_ioctl_clear_options(struct cdrom_device_info *cdi,
 static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi,
 		unsigned long arg)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROM_SELECT_SPEED\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_SELECT_SPEED\n");
 
 	if (!CDROM_CAN(CDC_SELECT_SPEED))
 		return -ENOSYS;
@@ -2402,7 +2403,7 @@ static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi,
 static int cdrom_ioctl_select_disc(struct cdrom_device_info *cdi,
 		unsigned long arg)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROM_SELECT_DISC\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_SELECT_DISC\n");
 
 	if (!CDROM_CAN(CDC_SELECT_DISC))
 		return -ENOSYS;
@@ -2420,14 +2421,14 @@ static int cdrom_ioctl_select_disc(struct cdrom_device_info *cdi,
 	if (cdi->ops->select_disc)
 		return cdi->ops->select_disc(cdi, arg);
 
-	cdinfo(CD_CHANGER, "Using generic cdrom_select_disc()\n");
+	cd_dbg(CD_CHANGER, "Using generic cdrom_select_disc()\n");
 	return cdrom_select_disc(cdi, arg);
 }
 
 static int cdrom_ioctl_reset(struct cdrom_device_info *cdi,
 		struct block_device *bdev)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROM_RESET\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_RESET\n");
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -2440,7 +2441,7 @@ static int cdrom_ioctl_reset(struct cdrom_device_info *cdi,
 static int cdrom_ioctl_lock_door(struct cdrom_device_info *cdi,
 		unsigned long arg)
 {
-	cdinfo(CD_DO_IOCTL, "%socking door.\n", arg ? "L" : "Unl");
+	cd_dbg(CD_DO_IOCTL, "%socking door\n", arg ? "L" : "Unl");
 
 	if (!CDROM_CAN(CDC_LOCK))
 		return -EDRIVE_CANT_DO_THIS;
@@ -2459,7 +2460,7 @@ static int cdrom_ioctl_lock_door(struct cdrom_device_info *cdi,
 static int cdrom_ioctl_debug(struct cdrom_device_info *cdi,
 		unsigned long arg)
 {
-	cdinfo(CD_DO_IOCTL, "%sabling debug.\n", arg ? "En" : "Dis");
+	cd_dbg(CD_DO_IOCTL, "%sabling debug\n", arg ? "En" : "Dis");
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -2469,7 +2470,7 @@ static int cdrom_ioctl_debug(struct cdrom_device_info *cdi,
 
 static int cdrom_ioctl_get_capability(struct cdrom_device_info *cdi)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROM_GET_CAPABILITY\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_GET_CAPABILITY\n");
 	return (cdi->ops->capability & ~cdi->mask);
 }
 
@@ -2485,7 +2486,7 @@ static int cdrom_ioctl_get_mcn(struct cdrom_device_info *cdi,
 	struct cdrom_mcn mcn;
 	int ret;
 
-	cdinfo(CD_DO_IOCTL, "entering CDROM_GET_MCN\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_GET_MCN\n");
 
 	if (!(cdi->ops->capability & CDC_MCN))
 		return -ENOSYS;
@@ -2495,14 +2496,14 @@ static int cdrom_ioctl_get_mcn(struct cdrom_device_info *cdi,
 
 	if (copy_to_user(argp, &mcn, sizeof(mcn)))
 		return -EFAULT;
-	cdinfo(CD_DO_IOCTL, "CDROM_GET_MCN successful\n");
+	cd_dbg(CD_DO_IOCTL, "CDROM_GET_MCN successful\n");
 	return 0;
 }
 
 static int cdrom_ioctl_drive_status(struct cdrom_device_info *cdi,
 		unsigned long arg)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROM_DRIVE_STATUS\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_DRIVE_STATUS\n");
 
 	if (!(cdi->ops->capability & CDC_DRIVE_STATUS))
 		return -ENOSYS;
@@ -2535,7 +2536,7 @@ static int cdrom_ioctl_disc_status(struct cdrom_device_info *cdi)
 {
 	tracktype tracks;
 
-	cdinfo(CD_DO_IOCTL, "entering CDROM_DISC_STATUS\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_DISC_STATUS\n");
 
 	cdrom_count_tracks(cdi, &tracks);
 	if (tracks.error)
@@ -2557,13 +2558,13 @@ static int cdrom_ioctl_disc_status(struct cdrom_device_info *cdi)
 		return CDS_DATA_1;
 	/* Policy mode off */
 
-	cdinfo(CD_WARNING,"This disc doesn't have any tracks I recognize!\n");
+	cd_dbg(CD_WARNING, "This disc doesn't have any tracks I recognize!\n");
 	return CDS_NO_INFO;
 }
 
 static int cdrom_ioctl_changer_nslots(struct cdrom_device_info *cdi)
 {
-	cdinfo(CD_DO_IOCTL, "entering CDROM_CHANGER_NSLOTS\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_CHANGER_NSLOTS\n");
 	return cdi->capacity;
 }
 
@@ -2574,7 +2575,7 @@ static int cdrom_ioctl_get_subchnl(struct cdrom_device_info *cdi,
 	u8 requested, back;
 	int ret;
 
-	/* cdinfo(CD_DO_IOCTL,"entering CDROMSUBCHNL\n");*/
+	/* cd_dbg(CD_DO_IOCTL,"entering CDROMSUBCHNL\n");*/
 
 	if (copy_from_user(&q, argp, sizeof(q)))
 		return -EFAULT;
@@ -2594,7 +2595,7 @@ static int cdrom_ioctl_get_subchnl(struct cdrom_device_info *cdi,
 
 	if (copy_to_user(argp, &q, sizeof(q)))
 		return -EFAULT;
-	/* cdinfo(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */
+	/* cd_dbg(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */
 	return 0;
 }
 
@@ -2604,7 +2605,7 @@ static int cdrom_ioctl_read_tochdr(struct cdrom_device_info *cdi,
 	struct cdrom_tochdr header;
 	int ret;
 
-	/* cdinfo(CD_DO_IOCTL, "entering CDROMREADTOCHDR\n"); */
+	/* cd_dbg(CD_DO_IOCTL, "entering CDROMREADTOCHDR\n"); */
 
 	if (copy_from_user(&header, argp, sizeof(header)))
 		return -EFAULT;
@@ -2615,7 +2616,7 @@ static int cdrom_ioctl_read_tochdr(struct cdrom_device_info *cdi,
 
 	if (copy_to_user(argp, &header, sizeof(header)))
 		return -EFAULT;
-	/* cdinfo(CD_DO_IOCTL, "CDROMREADTOCHDR successful\n"); */
+	/* cd_dbg(CD_DO_IOCTL, "CDROMREADTOCHDR successful\n"); */
 	return 0;
 }
 
@@ -2626,7 +2627,7 @@ static int cdrom_ioctl_read_tocentry(struct cdrom_device_info *cdi,
 	u8 requested_format;
 	int ret;
 
-	/* cdinfo(CD_DO_IOCTL, "entering CDROMREADTOCENTRY\n"); */
+	/* cd_dbg(CD_DO_IOCTL, "entering CDROMREADTOCENTRY\n"); */
 
 	if (copy_from_user(&entry, argp, sizeof(entry)))
 		return -EFAULT;
@@ -2643,7 +2644,7 @@ static int cdrom_ioctl_read_tocentry(struct cdrom_device_info *cdi,
 
 	if (copy_to_user(argp, &entry, sizeof(entry)))
 		return -EFAULT;
-	/* cdinfo(CD_DO_IOCTL, "CDROMREADTOCENTRY successful\n"); */
+	/* cd_dbg(CD_DO_IOCTL, "CDROMREADTOCENTRY successful\n"); */
 	return 0;
 }
 
@@ -2652,7 +2653,7 @@ static int cdrom_ioctl_play_msf(struct cdrom_device_info *cdi,
 {
 	struct cdrom_msf msf;
 
-	cdinfo(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
 
 	if (!CDROM_CAN(CDC_PLAY_AUDIO))
 		return -ENOSYS;
@@ -2667,7 +2668,7 @@ static int cdrom_ioctl_play_trkind(struct cdrom_device_info *cdi,
 	struct cdrom_ti ti;
 	int ret;
 
-	cdinfo(CD_DO_IOCTL, "entering CDROMPLAYTRKIND\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYTRKIND\n");
 
 	if (!CDROM_CAN(CDC_PLAY_AUDIO))
 		return -ENOSYS;
@@ -2684,7 +2685,7 @@ static int cdrom_ioctl_volctrl(struct cdrom_device_info *cdi,
 {
 	struct cdrom_volctrl volume;
 
-	cdinfo(CD_DO_IOCTL, "entering CDROMVOLCTRL\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMVOLCTRL\n");
 
 	if (!CDROM_CAN(CDC_PLAY_AUDIO))
 		return -ENOSYS;
@@ -2699,7 +2700,7 @@ static int cdrom_ioctl_volread(struct cdrom_device_info *cdi,
 	struct cdrom_volctrl volume;
 	int ret;
 
-	cdinfo(CD_DO_IOCTL, "entering CDROMVOLREAD\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMVOLREAD\n");
 
 	if (!CDROM_CAN(CDC_PLAY_AUDIO))
 		return -ENOSYS;
@@ -2718,7 +2719,7 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi,
 {
 	int ret;
 
-	cdinfo(CD_DO_IOCTL, "doing audio ioctl (start/stop/pause/resume)\n");
+	cd_dbg(CD_DO_IOCTL, "doing audio ioctl (start/stop/pause/resume)\n");
 
 	if (!CDROM_CAN(CDC_PLAY_AUDIO))
 		return -ENOSYS;
@@ -2796,7 +2797,7 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
 	}
 
 	/*
-	 * Note: most of the cdinfo() calls are commented out here,
+	 * Note: most of the cd_dbg() calls are commented out here,
 	 * because they fill up the sys log when CD players poll
 	 * the drive.
 	 */
@@ -2955,7 +2956,7 @@ static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi,
 	sanitize_format(&q.cdsc_absaddr, &back, requested);
 	sanitize_format(&q.cdsc_reladdr, &q.cdsc_format, requested);
 	IOCTL_OUT(arg, struct cdrom_subchnl, q);
-	/* cdinfo(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */
+	/* cd_dbg(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */
 	return 0;
 }
 
@@ -2965,7 +2966,7 @@ static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi,
 {
 	struct cdrom_device_ops *cdo = cdi->ops;
 	struct cdrom_msf msf;
-	cdinfo(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
 	IOCTL_IN(arg, struct cdrom_msf, msf);
 	cgc->cmd[0] = GPCMD_PLAY_AUDIO_MSF;
 	cgc->cmd[3] = msf.cdmsf_min0;
@@ -2984,7 +2985,7 @@ static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi,
 {
 	struct cdrom_device_ops *cdo = cdi->ops;
 	struct cdrom_blk blk;
-	cdinfo(CD_DO_IOCTL, "entering CDROMPLAYBLK\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n");
 	IOCTL_IN(arg, struct cdrom_blk, blk);
 	cgc->cmd[0] = GPCMD_PLAY_AUDIO_10;
 	cgc->cmd[2] = (blk.from >> 24) & 0xff;
@@ -3008,7 +3009,7 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
 	unsigned short offset;
 	int ret;
 
-	cdinfo(CD_DO_IOCTL, "entering CDROMVOLUME\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMVOLUME\n");
 
 	IOCTL_IN(arg, struct cdrom_volctrl, volctrl);
 
@@ -3073,7 +3074,7 @@ static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi,
 					int cmd)
 {
 	struct cdrom_device_ops *cdo = cdi->ops;
-	cdinfo(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n");
 	cgc->cmd[0] = GPCMD_START_STOP_UNIT;
 	cgc->cmd[1] = 1;
 	cgc->cmd[4] = (cmd == CDROMSTART) ? 1 : 0;
@@ -3086,7 +3087,7 @@ static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi,
 					int cmd)
 {
 	struct cdrom_device_ops *cdo = cdi->ops;
-	cdinfo(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n");
 	cgc->cmd[0] = GPCMD_PAUSE_RESUME;
 	cgc->cmd[8] = (cmd == CDROMRESUME) ? 1 : 0;
 	cgc->data_direction = CGC_DATA_NONE;
@@ -3108,7 +3109,7 @@ static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi,
 	if (!s)
 		return -ENOMEM;
 
-	cdinfo(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n");
+	cd_dbg(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n");
 	if (copy_from_user(s, arg, size)) {
 		kfree(s);
 		return -EFAULT;
@@ -3132,7 +3133,7 @@ static noinline int mmc_ioctl_dvd_auth(struct cdrom_device_info *cdi,
 	dvd_authinfo ai;
 	if (!CDROM_CAN(CDC_DVD))
 		return -ENOSYS;
-	cdinfo(CD_DO_IOCTL, "entering DVD_AUTH\n");
+	cd_dbg(CD_DO_IOCTL, "entering DVD_AUTH\n");
 	IOCTL_IN(arg, dvd_authinfo, ai);
 	ret = dvd_do_auth(cdi, &ai);
 	if (ret)
@@ -3146,7 +3147,7 @@ static noinline int mmc_ioctl_cdrom_next_writable(struct cdrom_device_info *cdi,
 {
 	int ret;
 	long next = 0;
-	cdinfo(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n");
 	ret = cdrom_get_next_writable(cdi, &next);
 	if (ret)
 		return ret;
@@ -3159,7 +3160,7 @@ static noinline int mmc_ioctl_cdrom_last_written(struct cdrom_device_info *cdi,
 {
 	int ret;
 	long last = 0;
-	cdinfo(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n");
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n");
 	ret = cdrom_get_last_written(cdi, &last);
 	if (ret)
 		return ret;

From e3b6b9ef61f41d0c79f7a1341679ad4b20d7e788 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:03 -0700
Subject: [PATCH 33/54] cdrom: Remove unused CHECKAUDIO macro

It's unused, make it disappear.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 3828c9212d88..8eaba535d2b2 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -339,9 +339,6 @@ do {							\
    a lot of places. This macro makes the code more clear. */
 #define CDROM_CAN(type) (cdi->ops->capability & ~cdi->mask & (type))
 
-/* used in the audio ioctls */
-#define CHECKAUDIO if ((ret=check_for_audio_disc(cdi, cdo))) return ret
-
 /*
  * Another popular OS uses 7 seconds as the hard timeout for default
  * commands, so it is a good choice for us as well.

From a09c391df309c846b7c49771526acd311ccbc93e Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:04 -0700
Subject: [PATCH 34/54] cdrom: Remove obfuscating IOCTL_IN and IOCTL_OUT macros

Macros with hidden control flow aren't nice.
Just use copy_to/from_user directly instead.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 48 ++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 8eaba535d2b2..47dee5ed5cba 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -326,15 +326,6 @@ do {							\
 } while (0)
 #endif
 
-/* These are used to simplify getting data in from and back to user land */
-#define IOCTL_IN(arg, type, in)					\
-	if (copy_from_user(&(in), (type __user *) (arg), sizeof (in)))	\
-		return -EFAULT;
-
-#define IOCTL_OUT(arg, type, out) \
-	if (copy_to_user((type __user *) (arg), &(out), sizeof (out)))	\
-		return -EFAULT;
-
 /* The (cdo->capability & ~cdi->mask & CDC_XXX) construct was used in
    a lot of places. This macro makes the code more clear. */
 #define CDROM_CAN(type) (cdi->ops->capability & ~cdi->mask & (type))
@@ -2874,7 +2865,8 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
 		blocksize = CD_FRAMESIZE_RAW0;
 		break;
 	}
-	IOCTL_IN(arg, struct cdrom_msf, msf);
+	if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
+		return -EFAULT;
 	lba = msf_to_lba(msf.cdmsf_min0, msf.cdmsf_sec0, msf.cdmsf_frame0);
 	/* FIXME: we need upper bound checking, too!! */
 	if (lba < 0)
@@ -2916,7 +2908,9 @@ static noinline int mmc_ioctl_cdrom_read_audio(struct cdrom_device_info *cdi,
 	struct cdrom_read_audio ra;
 	int lba;
 
-	IOCTL_IN(arg, struct cdrom_read_audio, ra);
+	if (copy_from_user(&ra, (struct cdrom_read_audio __user *)arg,
+			   sizeof(ra)))
+		return -EFAULT;
 
 	if (ra.addr_format == CDROM_MSF)
 		lba = msf_to_lba(ra.addr.msf.minute,
@@ -2940,7 +2934,8 @@ static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi,
 	int ret;
 	struct cdrom_subchnl q;
 	u_char requested, back;
-	IOCTL_IN(arg, struct cdrom_subchnl, q);
+	if (copy_from_user(&q, (struct cdrom_subchnl __user *)arg, sizeof(q)))
+		return -EFAULT;
 	requested = q.cdsc_format;
 	if (!((requested == CDROM_MSF) ||
 	      (requested == CDROM_LBA)))
@@ -2952,7 +2947,8 @@ static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi,
 	back = q.cdsc_format; /* local copy */
 	sanitize_format(&q.cdsc_absaddr, &back, requested);
 	sanitize_format(&q.cdsc_reladdr, &q.cdsc_format, requested);
-	IOCTL_OUT(arg, struct cdrom_subchnl, q);
+	if (copy_to_user((struct cdrom_subchnl __user *)arg, &q, sizeof(q)))
+		return -EFAULT;
 	/* cd_dbg(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */
 	return 0;
 }
@@ -2964,7 +2960,8 @@ static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi,
 	struct cdrom_device_ops *cdo = cdi->ops;
 	struct cdrom_msf msf;
 	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
-	IOCTL_IN(arg, struct cdrom_msf, msf);
+	if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
+		return -EFAULT;
 	cgc->cmd[0] = GPCMD_PLAY_AUDIO_MSF;
 	cgc->cmd[3] = msf.cdmsf_min0;
 	cgc->cmd[4] = msf.cdmsf_sec0;
@@ -2983,7 +2980,8 @@ static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi,
 	struct cdrom_device_ops *cdo = cdi->ops;
 	struct cdrom_blk blk;
 	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n");
-	IOCTL_IN(arg, struct cdrom_blk, blk);
+	if (copy_from_user(&blk, (struct cdrom_blk __user *)arg, sizeof(blk)))
+		return -EFAULT;
 	cgc->cmd[0] = GPCMD_PLAY_AUDIO_10;
 	cgc->cmd[2] = (blk.from >> 24) & 0xff;
 	cgc->cmd[3] = (blk.from >> 16) & 0xff;
@@ -3008,7 +3006,9 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
 
 	cd_dbg(CD_DO_IOCTL, "entering CDROMVOLUME\n");
 
-	IOCTL_IN(arg, struct cdrom_volctrl, volctrl);
+	if (copy_from_user(&volctrl, (struct cdrom_volctrl __user *)arg,
+			   sizeof(volctrl)))
+		return -EFAULT;
 
 	cgc->buffer = buffer;
 	cgc->buflen = 24;
@@ -3045,7 +3045,9 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
 		volctrl.channel1 = buffer[offset+11];
 		volctrl.channel2 = buffer[offset+13];
 		volctrl.channel3 = buffer[offset+15];
-		IOCTL_OUT(arg, struct cdrom_volctrl, volctrl);
+		if (copy_to_user((struct cdrom_volctrl __user *)arg, &volctrl,
+				 sizeof(volctrl)))
+			return -EFAULT;
 		return 0;
 	}
 		
@@ -3131,11 +3133,13 @@ static noinline int mmc_ioctl_dvd_auth(struct cdrom_device_info *cdi,
 	if (!CDROM_CAN(CDC_DVD))
 		return -ENOSYS;
 	cd_dbg(CD_DO_IOCTL, "entering DVD_AUTH\n");
-	IOCTL_IN(arg, dvd_authinfo, ai);
+	if (copy_from_user(&ai, (dvd_authinfo __user *)arg, sizeof(ai)))
+		return -EFAULT;
 	ret = dvd_do_auth(cdi, &ai);
 	if (ret)
 		return ret;
-	IOCTL_OUT(arg, dvd_authinfo, ai);
+	if (copy_to_user((dvd_authinfo __user *)arg, &ai, sizeof(ai)))
+		return -EFAULT;
 	return 0;
 }
 
@@ -3148,7 +3152,8 @@ static noinline int mmc_ioctl_cdrom_next_writable(struct cdrom_device_info *cdi,
 	ret = cdrom_get_next_writable(cdi, &next);
 	if (ret)
 		return ret;
-	IOCTL_OUT(arg, long, next);
+	if (copy_to_user((long __user *)arg, &next, sizeof(next)))
+		return -EFAULT;
 	return 0;
 }
 
@@ -3161,7 +3166,8 @@ static noinline int mmc_ioctl_cdrom_last_written(struct cdrom_device_info *cdi,
 	ret = cdrom_get_last_written(cdi, &last);
 	if (ret)
 		return ret;
-	IOCTL_OUT(arg, long, last);
+	if (copy_to_user((long __user *)arg, &last, sizeof(last)))
+		return -EFAULT;
 	return 0;
 }
 

From 82b91540ba24a1295c82572b922d50cbcf48fcee Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:05 -0700
Subject: [PATCH 35/54] cdrom: Remove prototype for open_for_data

Move static function to the appropriate place to remove
the now unnecessary prototype.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 114 +++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 57 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 47dee5ed5cba..5a38b5681ca3 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -337,7 +337,6 @@ do {							\
 #define CDROM_DEF_TIMEOUT	(7 * HZ)
 
 /* Not-exported routines. */
-static int open_for_data(struct cdrom_device_info * cdi);
 static int check_for_audio_disc(struct cdrom_device_info * cdi,
 			 struct cdrom_device_ops * cdo);
 static void sanitize_format(union cdrom_addr *addr, 
@@ -957,63 +956,8 @@ static int cdrom_close_write(struct cdrom_device_info *cdi)
 #endif
 }
 
-/* We use the open-option O_NONBLOCK to indicate that the
- * purpose of opening is only for subsequent ioctl() calls; no device
- * integrity checks are performed.
- *
- * We hope that all cd-player programs will adopt this convention. It
- * is in their own interest: device control becomes a lot easier
- * this way.
- */
-int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode)
-{
-	int ret;
-
-	cd_dbg(CD_OPEN, "entering cdrom_open\n");
-
-	/* open is event synchronization point, check events first */
-	check_disk_change(bdev);
-
-	/* if this was a O_NONBLOCK open and we should honor the flags,
-	 * do a quick open without drive/disc integrity checks. */
-	cdi->use_count++;
-	if ((mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) {
-		ret = cdi->ops->open(cdi, 1);
-	} else {
-		ret = open_for_data(cdi);
-		if (ret)
-			goto err;
-		cdrom_mmc3_profile(cdi);
-		if (mode & FMODE_WRITE) {
-			ret = -EROFS;
-			if (cdrom_open_write(cdi))
-				goto err_release;
-			if (!CDROM_CAN(CDC_RAM))
-				goto err_release;
-			ret = 0;
-			cdi->media_written = 0;
-		}
-	}
-
-	if (ret)
-		goto err;
-
-	cd_dbg(CD_OPEN, "Use count for \"/dev/%s\" now %d\n",
-	       cdi->name, cdi->use_count);
-	return 0;
-err_release:
-	if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) {
-		cdi->ops->lock_door(cdi, 0);
-		cd_dbg(CD_OPEN, "door unlocked\n");
-	}
-	cdi->ops->release(cdi);
-err:
-	cdi->use_count--;
-	return ret;
-}
-
 static
-int open_for_data(struct cdrom_device_info * cdi)
+int open_for_data(struct cdrom_device_info *cdi)
 {
 	int ret;
 	struct cdrom_device_ops *cdo = cdi->ops;
@@ -1119,6 +1063,62 @@ clean_up_and_return:
 	return ret;
 }
 
+/* We use the open-option O_NONBLOCK to indicate that the
+ * purpose of opening is only for subsequent ioctl() calls; no device
+ * integrity checks are performed.
+ *
+ * We hope that all cd-player programs will adopt this convention. It
+ * is in their own interest: device control becomes a lot easier
+ * this way.
+ */
+int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev,
+	       fmode_t mode)
+{
+	int ret;
+
+	cd_dbg(CD_OPEN, "entering cdrom_open\n");
+
+	/* open is event synchronization point, check events first */
+	check_disk_change(bdev);
+
+	/* if this was a O_NONBLOCK open and we should honor the flags,
+	 * do a quick open without drive/disc integrity checks. */
+	cdi->use_count++;
+	if ((mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) {
+		ret = cdi->ops->open(cdi, 1);
+	} else {
+		ret = open_for_data(cdi);
+		if (ret)
+			goto err;
+		cdrom_mmc3_profile(cdi);
+		if (mode & FMODE_WRITE) {
+			ret = -EROFS;
+			if (cdrom_open_write(cdi))
+				goto err_release;
+			if (!CDROM_CAN(CDC_RAM))
+				goto err_release;
+			ret = 0;
+			cdi->media_written = 0;
+		}
+	}
+
+	if (ret)
+		goto err;
+
+	cd_dbg(CD_OPEN, "Use count for \"/dev/%s\" now %d\n",
+	       cdi->name, cdi->use_count);
+	return 0;
+err_release:
+	if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) {
+		cdi->ops->lock_door(cdi, 0);
+		cd_dbg(CD_OPEN, "door unlocked\n");
+	}
+	cdi->ops->release(cdi);
+err:
+	cdi->use_count--;
+	return ret;
+}
+
 /* This code is similar to that in open_for_data. The routine is called
    whenever an audio play operation is requested.
 */

From b62d6e82a73d5834ed08d04bd3e52d19aa62c100 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:06 -0700
Subject: [PATCH 36/54] cdrom: Remove unnecessary check_for_audio_disc
 prototype

The actual static is defined below it but not used until later.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 5a38b5681ca3..a570e5f66e48 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -337,8 +337,6 @@ do {							\
 #define CDROM_DEF_TIMEOUT	(7 * HZ)
 
 /* Not-exported routines. */
-static int check_for_audio_disc(struct cdrom_device_info * cdi,
-			 struct cdrom_device_ops * cdo);
 static void sanitize_format(union cdrom_addr *addr, 
 		u_char * curr, u_char requested);
 static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,

From 1237d1eecff27015617e5a43cc9528658a5fa8ef Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:07 -0700
Subject: [PATCH 37/54] cdrom: Remove unnecessary sanitize_format prototype

It's defined below without being called.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index a570e5f66e48..08abbae6bf68 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -337,8 +337,6 @@ do {							\
 #define CDROM_DEF_TIMEOUT	(7 * HZ)
 
 /* Not-exported routines. */
-static void sanitize_format(union cdrom_addr *addr, 
-		u_char * curr, u_char requested);
 static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
 		     unsigned long arg);
 

From 2e9aa08a4801d439daec4c7d1b953c27b212d423 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:08 -0700
Subject: [PATCH 38/54] cdrom: Move mmc_ioctls above cdrom_ioctl to remove
 unnecessary prototype

Neaten the spacing too.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 806 +++++++++++++++++++++---------------------
 1 file changed, 402 insertions(+), 404 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 08abbae6bf68..332c3aee1346 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -337,8 +337,6 @@ do {							\
 #define CDROM_DEF_TIMEOUT	(7 * HZ)
 
 /* Not-exported routines. */
-static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
-		     unsigned long arg);
 
 int cdrom_get_last_written(struct cdrom_device_info *, long *);
 static int cdrom_get_next_writable(struct cdrom_device_info *, long *);
@@ -2713,6 +2711,408 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi,
 	return cdi->ops->audio_ioctl(cdi, cmd, NULL);
 }
 
+/*
+ * Required when we need to use READ_10 to issue other than 2048 block
+ * reads
+ */
+static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size)
+{
+	struct cdrom_device_ops *cdo = cdi->ops;
+	struct packet_command cgc;
+	struct modesel_head mh;
+
+	memset(&mh, 0, sizeof(mh));
+	mh.block_desc_length = 0x08;
+	mh.block_length_med = (size >> 8) & 0xff;
+	mh.block_length_lo = size & 0xff;
+
+	memset(&cgc, 0, sizeof(cgc));
+	cgc.cmd[0] = 0x15;
+	cgc.cmd[1] = 1 << 4;
+	cgc.cmd[4] = 12;
+	cgc.buflen = sizeof(mh);
+	cgc.buffer = (char *) &mh;
+	cgc.data_direction = CGC_DATA_WRITE;
+	mh.block_desc_length = 0x08;
+	mh.block_length_med = (size >> 8) & 0xff;
+	mh.block_length_lo = size & 0xff;
+
+	return cdo->generic_packet(cdi, &cgc);
+}
+
+static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
+					      void __user *arg,
+					      struct packet_command *cgc,
+					      int cmd)
+{
+	struct request_sense sense;
+	struct cdrom_msf msf;
+	int blocksize = 0, format = 0, lba;
+	int ret;
+
+	switch (cmd) {
+	case CDROMREADRAW:
+		blocksize = CD_FRAMESIZE_RAW;
+		break;
+	case CDROMREADMODE1:
+		blocksize = CD_FRAMESIZE;
+		format = 2;
+		break;
+	case CDROMREADMODE2:
+		blocksize = CD_FRAMESIZE_RAW0;
+		break;
+	}
+	if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
+		return -EFAULT;
+	lba = msf_to_lba(msf.cdmsf_min0, msf.cdmsf_sec0, msf.cdmsf_frame0);
+	/* FIXME: we need upper bound checking, too!! */
+	if (lba < 0)
+		return -EINVAL;
+
+	cgc->buffer = kzalloc(blocksize, GFP_KERNEL);
+	if (cgc->buffer == NULL)
+		return -ENOMEM;
+
+	memset(&sense, 0, sizeof(sense));
+	cgc->sense = &sense;
+	cgc->data_direction = CGC_DATA_READ;
+	ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize);
+	if (ret && sense.sense_key == 0x05 &&
+	    sense.asc == 0x20 &&
+	    sense.ascq == 0x00) {
+		/*
+		 * SCSI-II devices are not required to support
+		 * READ_CD, so let's try switching block size
+		 */
+		/* FIXME: switch back again... */
+		ret = cdrom_switch_blocksize(cdi, blocksize);
+		if (ret)
+			goto out;
+		cgc->sense = NULL;
+		ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1);
+		ret |= cdrom_switch_blocksize(cdi, blocksize);
+	}
+	if (!ret && copy_to_user(arg, cgc->buffer, blocksize))
+		ret = -EFAULT;
+out:
+	kfree(cgc->buffer);
+	return ret;
+}
+
+static noinline int mmc_ioctl_cdrom_read_audio(struct cdrom_device_info *cdi,
+					       void __user *arg)
+{
+	struct cdrom_read_audio ra;
+	int lba;
+
+	if (copy_from_user(&ra, (struct cdrom_read_audio __user *)arg,
+			   sizeof(ra)))
+		return -EFAULT;
+
+	if (ra.addr_format == CDROM_MSF)
+		lba = msf_to_lba(ra.addr.msf.minute,
+				 ra.addr.msf.second,
+				 ra.addr.msf.frame);
+	else if (ra.addr_format == CDROM_LBA)
+		lba = ra.addr.lba;
+	else
+		return -EINVAL;
+
+	/* FIXME: we need upper bound checking, too!! */
+	if (lba < 0 || ra.nframes <= 0 || ra.nframes > CD_FRAMES)
+		return -EINVAL;
+
+	return cdrom_read_cdda(cdi, ra.buf, lba, ra.nframes);
+}
+
+static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi,
+					       void __user *arg)
+{
+	int ret;
+	struct cdrom_subchnl q;
+	u_char requested, back;
+	if (copy_from_user(&q, (struct cdrom_subchnl __user *)arg, sizeof(q)))
+		return -EFAULT;
+	requested = q.cdsc_format;
+	if (!((requested == CDROM_MSF) ||
+	      (requested == CDROM_LBA)))
+		return -EINVAL;
+	q.cdsc_format = CDROM_MSF;
+	ret = cdrom_read_subchannel(cdi, &q, 0);
+	if (ret)
+		return ret;
+	back = q.cdsc_format; /* local copy */
+	sanitize_format(&q.cdsc_absaddr, &back, requested);
+	sanitize_format(&q.cdsc_reladdr, &q.cdsc_format, requested);
+	if (copy_to_user((struct cdrom_subchnl __user *)arg, &q, sizeof(q)))
+		return -EFAULT;
+	/* cd_dbg(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */
+	return 0;
+}
+
+static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi,
+					     void __user *arg,
+					     struct packet_command *cgc)
+{
+	struct cdrom_device_ops *cdo = cdi->ops;
+	struct cdrom_msf msf;
+	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
+	if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
+		return -EFAULT;
+	cgc->cmd[0] = GPCMD_PLAY_AUDIO_MSF;
+	cgc->cmd[3] = msf.cdmsf_min0;
+	cgc->cmd[4] = msf.cdmsf_sec0;
+	cgc->cmd[5] = msf.cdmsf_frame0;
+	cgc->cmd[6] = msf.cdmsf_min1;
+	cgc->cmd[7] = msf.cdmsf_sec1;
+	cgc->cmd[8] = msf.cdmsf_frame1;
+	cgc->data_direction = CGC_DATA_NONE;
+	return cdo->generic_packet(cdi, cgc);
+}
+
+static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi,
+					     void __user *arg,
+					     struct packet_command *cgc)
+{
+	struct cdrom_device_ops *cdo = cdi->ops;
+	struct cdrom_blk blk;
+	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n");
+	if (copy_from_user(&blk, (struct cdrom_blk __user *)arg, sizeof(blk)))
+		return -EFAULT;
+	cgc->cmd[0] = GPCMD_PLAY_AUDIO_10;
+	cgc->cmd[2] = (blk.from >> 24) & 0xff;
+	cgc->cmd[3] = (blk.from >> 16) & 0xff;
+	cgc->cmd[4] = (blk.from >>  8) & 0xff;
+	cgc->cmd[5] = blk.from & 0xff;
+	cgc->cmd[7] = (blk.len >> 8) & 0xff;
+	cgc->cmd[8] = blk.len & 0xff;
+	cgc->data_direction = CGC_DATA_NONE;
+	return cdo->generic_packet(cdi, cgc);
+}
+
+static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
+					   void __user *arg,
+					   struct packet_command *cgc,
+					   unsigned int cmd)
+{
+	struct cdrom_volctrl volctrl;
+	unsigned char buffer[32];
+	char mask[sizeof(buffer)];
+	unsigned short offset;
+	int ret;
+
+	cd_dbg(CD_DO_IOCTL, "entering CDROMVOLUME\n");
+
+	if (copy_from_user(&volctrl, (struct cdrom_volctrl __user *)arg,
+			   sizeof(volctrl)))
+		return -EFAULT;
+
+	cgc->buffer = buffer;
+	cgc->buflen = 24;
+	ret = cdrom_mode_sense(cdi, cgc, GPMODE_AUDIO_CTL_PAGE, 0);
+	if (ret)
+		return ret;
+		
+	/* originally the code depended on buffer[1] to determine
+	   how much data is available for transfer. buffer[1] is
+	   unfortunately ambigious and the only reliable way seem
+	   to be to simply skip over the block descriptor... */
+	offset = 8 + be16_to_cpu(*(__be16 *)(buffer + 6));
+
+	if (offset + 16 > sizeof(buffer))
+		return -E2BIG;
+
+	if (offset + 16 > cgc->buflen) {
+		cgc->buflen = offset + 16;
+		ret = cdrom_mode_sense(cdi, cgc,
+				       GPMODE_AUDIO_CTL_PAGE, 0);
+		if (ret)
+			return ret;
+	}
+
+	/* sanity check */
+	if ((buffer[offset] & 0x3f) != GPMODE_AUDIO_CTL_PAGE ||
+	    buffer[offset + 1] < 14)
+		return -EINVAL;
+
+	/* now we have the current volume settings. if it was only
+	   a CDROMVOLREAD, return these values */
+	if (cmd == CDROMVOLREAD) {
+		volctrl.channel0 = buffer[offset+9];
+		volctrl.channel1 = buffer[offset+11];
+		volctrl.channel2 = buffer[offset+13];
+		volctrl.channel3 = buffer[offset+15];
+		if (copy_to_user((struct cdrom_volctrl __user *)arg, &volctrl,
+				 sizeof(volctrl)))
+			return -EFAULT;
+		return 0;
+	}
+		
+	/* get the volume mask */
+	cgc->buffer = mask;
+	ret = cdrom_mode_sense(cdi, cgc, GPMODE_AUDIO_CTL_PAGE, 1);
+	if (ret)
+		return ret;
+
+	buffer[offset + 9]  = volctrl.channel0 & mask[offset + 9];
+	buffer[offset + 11] = volctrl.channel1 & mask[offset + 11];
+	buffer[offset + 13] = volctrl.channel2 & mask[offset + 13];
+	buffer[offset + 15] = volctrl.channel3 & mask[offset + 15];
+
+	/* set volume */
+	cgc->buffer = buffer + offset - 8;
+	memset(cgc->buffer, 0, 8);
+	return cdrom_mode_select(cdi, cgc);
+}
+
+static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi,
+					       struct packet_command *cgc,
+					       int cmd)
+{
+	struct cdrom_device_ops *cdo = cdi->ops;
+	cd_dbg(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n");
+	cgc->cmd[0] = GPCMD_START_STOP_UNIT;
+	cgc->cmd[1] = 1;
+	cgc->cmd[4] = (cmd == CDROMSTART) ? 1 : 0;
+	cgc->data_direction = CGC_DATA_NONE;
+	return cdo->generic_packet(cdi, cgc);
+}
+
+static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi,
+						 struct packet_command *cgc,
+						 int cmd)
+{
+	struct cdrom_device_ops *cdo = cdi->ops;
+	cd_dbg(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n");
+	cgc->cmd[0] = GPCMD_PAUSE_RESUME;
+	cgc->cmd[8] = (cmd == CDROMRESUME) ? 1 : 0;
+	cgc->data_direction = CGC_DATA_NONE;
+	return cdo->generic_packet(cdi, cgc);
+}
+
+static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi,
+					      void __user *arg,
+					      struct packet_command *cgc)
+{
+	int ret;
+	dvd_struct *s;
+	int size = sizeof(dvd_struct);
+
+	if (!CDROM_CAN(CDC_DVD))
+		return -ENOSYS;
+
+	s = kmalloc(size, GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	cd_dbg(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n");
+	if (copy_from_user(s, arg, size)) {
+		kfree(s);
+		return -EFAULT;
+	}
+
+	ret = dvd_read_struct(cdi, s, cgc);
+	if (ret)
+		goto out;
+
+	if (copy_to_user(arg, s, size))
+		ret = -EFAULT;
+out:
+	kfree(s);
+	return ret;
+}
+
+static noinline int mmc_ioctl_dvd_auth(struct cdrom_device_info *cdi,
+				       void __user *arg)
+{
+	int ret;
+	dvd_authinfo ai;
+	if (!CDROM_CAN(CDC_DVD))
+		return -ENOSYS;
+	cd_dbg(CD_DO_IOCTL, "entering DVD_AUTH\n");
+	if (copy_from_user(&ai, (dvd_authinfo __user *)arg, sizeof(ai)))
+		return -EFAULT;
+	ret = dvd_do_auth(cdi, &ai);
+	if (ret)
+		return ret;
+	if (copy_to_user((dvd_authinfo __user *)arg, &ai, sizeof(ai)))
+		return -EFAULT;
+	return 0;
+}
+
+static noinline int mmc_ioctl_cdrom_next_writable(struct cdrom_device_info *cdi,
+						  void __user *arg)
+{
+	int ret;
+	long next = 0;
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n");
+	ret = cdrom_get_next_writable(cdi, &next);
+	if (ret)
+		return ret;
+	if (copy_to_user((long __user *)arg, &next, sizeof(next)))
+		return -EFAULT;
+	return 0;
+}
+
+static noinline int mmc_ioctl_cdrom_last_written(struct cdrom_device_info *cdi,
+						 void __user *arg)
+{
+	int ret;
+	long last = 0;
+	cd_dbg(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n");
+	ret = cdrom_get_last_written(cdi, &last);
+	if (ret)
+		return ret;
+	if (copy_to_user((long __user *)arg, &last, sizeof(last)))
+		return -EFAULT;
+	return 0;
+}
+
+static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
+		     unsigned long arg)
+{
+	struct packet_command cgc;
+	void __user *userptr = (void __user *)arg;
+
+	memset(&cgc, 0, sizeof(cgc));
+
+	/* build a unified command and queue it through
+	   cdo->generic_packet() */
+	switch (cmd) {
+	case CDROMREADRAW:
+	case CDROMREADMODE1:
+	case CDROMREADMODE2:
+		return mmc_ioctl_cdrom_read_data(cdi, userptr, &cgc, cmd);
+	case CDROMREADAUDIO:
+		return mmc_ioctl_cdrom_read_audio(cdi, userptr);
+	case CDROMSUBCHNL:
+		return mmc_ioctl_cdrom_subchannel(cdi, userptr);
+	case CDROMPLAYMSF:
+		return mmc_ioctl_cdrom_play_msf(cdi, userptr, &cgc);
+	case CDROMPLAYBLK:
+		return mmc_ioctl_cdrom_play_blk(cdi, userptr, &cgc);
+	case CDROMVOLCTRL:
+	case CDROMVOLREAD:
+		return mmc_ioctl_cdrom_volume(cdi, userptr, &cgc, cmd);
+	case CDROMSTART:
+	case CDROMSTOP:
+		return mmc_ioctl_cdrom_start_stop(cdi, &cgc, cmd);
+	case CDROMPAUSE:
+	case CDROMRESUME:
+		return mmc_ioctl_cdrom_pause_resume(cdi, &cgc, cmd);
+	case DVD_READ_STRUCT:
+		return mmc_ioctl_dvd_read_struct(cdi, userptr, &cgc);
+	case DVD_AUTH:
+		return mmc_ioctl_dvd_auth(cdi, userptr);
+	case CDROM_NEXT_WRITABLE:
+		return mmc_ioctl_cdrom_next_writable(cdi, userptr);
+	case CDROM_LAST_WRITTEN:
+		return mmc_ioctl_cdrom_last_written(cdi, userptr);
+	}
+
+	return -ENOTTY;
+}
+
 /*
  * Just about every imaginable ioctl is supported in the Uniform layer
  * these days.
@@ -2810,408 +3210,6 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
 	return -ENOSYS;
 }
 
-/*
- * Required when we need to use READ_10 to issue other than 2048 block
- * reads
- */
-static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size)
-{
-	struct cdrom_device_ops *cdo = cdi->ops;
-	struct packet_command cgc;
-	struct modesel_head mh;
-
-	memset(&mh, 0, sizeof(mh));
-	mh.block_desc_length = 0x08;
-	mh.block_length_med = (size >> 8) & 0xff;
-	mh.block_length_lo = size & 0xff;
-
-	memset(&cgc, 0, sizeof(cgc));
-	cgc.cmd[0] = 0x15;
-	cgc.cmd[1] = 1 << 4;
-	cgc.cmd[4] = 12;
-	cgc.buflen = sizeof(mh);
-	cgc.buffer = (char *) &mh;
-	cgc.data_direction = CGC_DATA_WRITE;
-	mh.block_desc_length = 0x08;
-	mh.block_length_med = (size >> 8) & 0xff;
-	mh.block_length_lo = size & 0xff;
-
-	return cdo->generic_packet(cdi, &cgc);
-}
-
-static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
-					void __user *arg,
-					struct packet_command *cgc,
-					int cmd)
-{
-	struct request_sense sense;
-	struct cdrom_msf msf;
-	int blocksize = 0, format = 0, lba;
-	int ret;
-
-	switch (cmd) {
-	case CDROMREADRAW:
-		blocksize = CD_FRAMESIZE_RAW;
-		break;
-	case CDROMREADMODE1:
-		blocksize = CD_FRAMESIZE;
-		format = 2;
-		break;
-	case CDROMREADMODE2:
-		blocksize = CD_FRAMESIZE_RAW0;
-		break;
-	}
-	if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
-		return -EFAULT;
-	lba = msf_to_lba(msf.cdmsf_min0, msf.cdmsf_sec0, msf.cdmsf_frame0);
-	/* FIXME: we need upper bound checking, too!! */
-	if (lba < 0)
-		return -EINVAL;
-
-	cgc->buffer = kzalloc(blocksize, GFP_KERNEL);
-	if (cgc->buffer == NULL)
-		return -ENOMEM;
-
-	memset(&sense, 0, sizeof(sense));
-	cgc->sense = &sense;
-	cgc->data_direction = CGC_DATA_READ;
-	ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize);
-	if (ret && sense.sense_key == 0x05 &&
-		   sense.asc == 0x20 &&
-		   sense.ascq == 0x00) {
-		/*
-		 * SCSI-II devices are not required to support
-		 * READ_CD, so let's try switching block size
-		 */
-		/* FIXME: switch back again... */
-		ret = cdrom_switch_blocksize(cdi, blocksize);
-		if (ret)
-			goto out;
-		cgc->sense = NULL;
-		ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1);
-		ret |= cdrom_switch_blocksize(cdi, blocksize);
-	}
-	if (!ret && copy_to_user(arg, cgc->buffer, blocksize))
-		ret = -EFAULT;
-out:
-	kfree(cgc->buffer);
-	return ret;
-}
-
-static noinline int mmc_ioctl_cdrom_read_audio(struct cdrom_device_info *cdi,
-					void __user *arg)
-{
-	struct cdrom_read_audio ra;
-	int lba;
-
-	if (copy_from_user(&ra, (struct cdrom_read_audio __user *)arg,
-			   sizeof(ra)))
-		return -EFAULT;
-
-	if (ra.addr_format == CDROM_MSF)
-		lba = msf_to_lba(ra.addr.msf.minute,
-				 ra.addr.msf.second,
-				 ra.addr.msf.frame);
-	else if (ra.addr_format == CDROM_LBA)
-		lba = ra.addr.lba;
-	else
-		return -EINVAL;
-
-	/* FIXME: we need upper bound checking, too!! */
-	if (lba < 0 || ra.nframes <= 0 || ra.nframes > CD_FRAMES)
-		return -EINVAL;
-
-	return cdrom_read_cdda(cdi, ra.buf, lba, ra.nframes);
-}
-
-static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi,
-					void __user *arg)
-{
-	int ret;
-	struct cdrom_subchnl q;
-	u_char requested, back;
-	if (copy_from_user(&q, (struct cdrom_subchnl __user *)arg, sizeof(q)))
-		return -EFAULT;
-	requested = q.cdsc_format;
-	if (!((requested == CDROM_MSF) ||
-	      (requested == CDROM_LBA)))
-		return -EINVAL;
-	q.cdsc_format = CDROM_MSF;
-	ret = cdrom_read_subchannel(cdi, &q, 0);
-	if (ret)
-		return ret;
-	back = q.cdsc_format; /* local copy */
-	sanitize_format(&q.cdsc_absaddr, &back, requested);
-	sanitize_format(&q.cdsc_reladdr, &q.cdsc_format, requested);
-	if (copy_to_user((struct cdrom_subchnl __user *)arg, &q, sizeof(q)))
-		return -EFAULT;
-	/* cd_dbg(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */
-	return 0;
-}
-
-static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi,
-					void __user *arg,
-					struct packet_command *cgc)
-{
-	struct cdrom_device_ops *cdo = cdi->ops;
-	struct cdrom_msf msf;
-	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
-	if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
-		return -EFAULT;
-	cgc->cmd[0] = GPCMD_PLAY_AUDIO_MSF;
-	cgc->cmd[3] = msf.cdmsf_min0;
-	cgc->cmd[4] = msf.cdmsf_sec0;
-	cgc->cmd[5] = msf.cdmsf_frame0;
-	cgc->cmd[6] = msf.cdmsf_min1;
-	cgc->cmd[7] = msf.cdmsf_sec1;
-	cgc->cmd[8] = msf.cdmsf_frame1;
-	cgc->data_direction = CGC_DATA_NONE;
-	return cdo->generic_packet(cdi, cgc);
-}
-
-static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi,
-					void __user *arg,
-					struct packet_command *cgc)
-{
-	struct cdrom_device_ops *cdo = cdi->ops;
-	struct cdrom_blk blk;
-	cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n");
-	if (copy_from_user(&blk, (struct cdrom_blk __user *)arg, sizeof(blk)))
-		return -EFAULT;
-	cgc->cmd[0] = GPCMD_PLAY_AUDIO_10;
-	cgc->cmd[2] = (blk.from >> 24) & 0xff;
-	cgc->cmd[3] = (blk.from >> 16) & 0xff;
-	cgc->cmd[4] = (blk.from >>  8) & 0xff;
-	cgc->cmd[5] = blk.from & 0xff;
-	cgc->cmd[7] = (blk.len >> 8) & 0xff;
-	cgc->cmd[8] = blk.len & 0xff;
-	cgc->data_direction = CGC_DATA_NONE;
-	return cdo->generic_packet(cdi, cgc);
-}
-
-static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
-					void __user *arg,
-					struct packet_command *cgc,
-					unsigned int cmd)
-{
-	struct cdrom_volctrl volctrl;
-	unsigned char buffer[32];
-	char mask[sizeof(buffer)];
-	unsigned short offset;
-	int ret;
-
-	cd_dbg(CD_DO_IOCTL, "entering CDROMVOLUME\n");
-
-	if (copy_from_user(&volctrl, (struct cdrom_volctrl __user *)arg,
-			   sizeof(volctrl)))
-		return -EFAULT;
-
-	cgc->buffer = buffer;
-	cgc->buflen = 24;
-	ret = cdrom_mode_sense(cdi, cgc, GPMODE_AUDIO_CTL_PAGE, 0);
-	if (ret)
-		return ret;
-		
-	/* originally the code depended on buffer[1] to determine
-	   how much data is available for transfer. buffer[1] is
-	   unfortunately ambigious and the only reliable way seem
-	   to be to simply skip over the block descriptor... */
-	offset = 8 + be16_to_cpu(*(__be16 *)(buffer + 6));
-
-	if (offset + 16 > sizeof(buffer))
-		return -E2BIG;
-
-	if (offset + 16 > cgc->buflen) {
-		cgc->buflen = offset + 16;
-		ret = cdrom_mode_sense(cdi, cgc,
-					GPMODE_AUDIO_CTL_PAGE, 0);
-		if (ret)
-			return ret;
-	}
-
-	/* sanity check */
-	if ((buffer[offset] & 0x3f) != GPMODE_AUDIO_CTL_PAGE ||
-			buffer[offset + 1] < 14)
-		return -EINVAL;
-
-	/* now we have the current volume settings. if it was only
-	   a CDROMVOLREAD, return these values */
-	if (cmd == CDROMVOLREAD) {
-		volctrl.channel0 = buffer[offset+9];
-		volctrl.channel1 = buffer[offset+11];
-		volctrl.channel2 = buffer[offset+13];
-		volctrl.channel3 = buffer[offset+15];
-		if (copy_to_user((struct cdrom_volctrl __user *)arg, &volctrl,
-				 sizeof(volctrl)))
-			return -EFAULT;
-		return 0;
-	}
-		
-	/* get the volume mask */
-	cgc->buffer = mask;
-	ret = cdrom_mode_sense(cdi, cgc, GPMODE_AUDIO_CTL_PAGE, 1);
-	if (ret)
-		return ret;
-
-	buffer[offset + 9]  = volctrl.channel0 & mask[offset + 9];
-	buffer[offset + 11] = volctrl.channel1 & mask[offset + 11];
-	buffer[offset + 13] = volctrl.channel2 & mask[offset + 13];
-	buffer[offset + 15] = volctrl.channel3 & mask[offset + 15];
-
-	/* set volume */
-	cgc->buffer = buffer + offset - 8;
-	memset(cgc->buffer, 0, 8);
-	return cdrom_mode_select(cdi, cgc);
-}
-
-static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi,
-					struct packet_command *cgc,
-					int cmd)
-{
-	struct cdrom_device_ops *cdo = cdi->ops;
-	cd_dbg(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n");
-	cgc->cmd[0] = GPCMD_START_STOP_UNIT;
-	cgc->cmd[1] = 1;
-	cgc->cmd[4] = (cmd == CDROMSTART) ? 1 : 0;
-	cgc->data_direction = CGC_DATA_NONE;
-	return cdo->generic_packet(cdi, cgc);
-}
-
-static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi,
-					struct packet_command *cgc,
-					int cmd)
-{
-	struct cdrom_device_ops *cdo = cdi->ops;
-	cd_dbg(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n");
-	cgc->cmd[0] = GPCMD_PAUSE_RESUME;
-	cgc->cmd[8] = (cmd == CDROMRESUME) ? 1 : 0;
-	cgc->data_direction = CGC_DATA_NONE;
-	return cdo->generic_packet(cdi, cgc);
-}
-
-static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi,
-						void __user *arg,
-						struct packet_command *cgc)
-{
-	int ret;
-	dvd_struct *s;
-	int size = sizeof(dvd_struct);
-
-	if (!CDROM_CAN(CDC_DVD))
-		return -ENOSYS;
-
-	s = kmalloc(size, GFP_KERNEL);
-	if (!s)
-		return -ENOMEM;
-
-	cd_dbg(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n");
-	if (copy_from_user(s, arg, size)) {
-		kfree(s);
-		return -EFAULT;
-	}
-
-	ret = dvd_read_struct(cdi, s, cgc);
-	if (ret)
-		goto out;
-
-	if (copy_to_user(arg, s, size))
-		ret = -EFAULT;
-out:
-	kfree(s);
-	return ret;
-}
-
-static noinline int mmc_ioctl_dvd_auth(struct cdrom_device_info *cdi,
-					void __user *arg)
-{
-	int ret;
-	dvd_authinfo ai;
-	if (!CDROM_CAN(CDC_DVD))
-		return -ENOSYS;
-	cd_dbg(CD_DO_IOCTL, "entering DVD_AUTH\n");
-	if (copy_from_user(&ai, (dvd_authinfo __user *)arg, sizeof(ai)))
-		return -EFAULT;
-	ret = dvd_do_auth(cdi, &ai);
-	if (ret)
-		return ret;
-	if (copy_to_user((dvd_authinfo __user *)arg, &ai, sizeof(ai)))
-		return -EFAULT;
-	return 0;
-}
-
-static noinline int mmc_ioctl_cdrom_next_writable(struct cdrom_device_info *cdi,
-						void __user *arg)
-{
-	int ret;
-	long next = 0;
-	cd_dbg(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n");
-	ret = cdrom_get_next_writable(cdi, &next);
-	if (ret)
-		return ret;
-	if (copy_to_user((long __user *)arg, &next, sizeof(next)))
-		return -EFAULT;
-	return 0;
-}
-
-static noinline int mmc_ioctl_cdrom_last_written(struct cdrom_device_info *cdi,
-						void __user *arg)
-{
-	int ret;
-	long last = 0;
-	cd_dbg(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n");
-	ret = cdrom_get_last_written(cdi, &last);
-	if (ret)
-		return ret;
-	if (copy_to_user((long __user *)arg, &last, sizeof(last)))
-		return -EFAULT;
-	return 0;
-}
-
-static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
-		     unsigned long arg)
-{
-	struct packet_command cgc;
-	void __user *userptr = (void __user *)arg;
-
-	memset(&cgc, 0, sizeof(cgc));
-
-	/* build a unified command and queue it through
-	   cdo->generic_packet() */
-	switch (cmd) {
-	case CDROMREADRAW:
-	case CDROMREADMODE1:
-	case CDROMREADMODE2:
-		return mmc_ioctl_cdrom_read_data(cdi, userptr, &cgc, cmd);
-	case CDROMREADAUDIO:
-		return mmc_ioctl_cdrom_read_audio(cdi, userptr);
-	case CDROMSUBCHNL:
-		return mmc_ioctl_cdrom_subchannel(cdi, userptr);
-	case CDROMPLAYMSF:
-		return mmc_ioctl_cdrom_play_msf(cdi, userptr, &cgc);
-	case CDROMPLAYBLK:
-		return mmc_ioctl_cdrom_play_blk(cdi, userptr, &cgc);
-	case CDROMVOLCTRL:
-	case CDROMVOLREAD:
-		return mmc_ioctl_cdrom_volume(cdi, userptr, &cgc, cmd);
-	case CDROMSTART:
-	case CDROMSTOP:
-		return mmc_ioctl_cdrom_start_stop(cdi, &cgc, cmd);
-	case CDROMPAUSE:
-	case CDROMRESUME:
-		return mmc_ioctl_cdrom_pause_resume(cdi, &cgc, cmd);
-	case DVD_READ_STRUCT:
-		return mmc_ioctl_dvd_read_struct(cdi, userptr, &cgc);
-	case DVD_AUTH:
-		return mmc_ioctl_dvd_auth(cdi, userptr);
-	case CDROM_NEXT_WRITABLE:
-		return mmc_ioctl_cdrom_next_writable(cdi, userptr);
-	case CDROM_LAST_WRITTEN:
-		return mmc_ioctl_cdrom_last_written(cdi, userptr);
-	}
-
-	return -ENOTTY;
-}
-
 static int cdrom_get_track_info(struct cdrom_device_info *cdi, __u16 track, __u8 type,
 			 track_information *ti)
 {

From e1e60fda48b0eb2afebdfe7cf7c382adc00de470 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:09 -0700
Subject: [PATCH 39/54] cdrom: Remove cdrom_get_last_written prototype

Move the function instead.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 193 +++++++++++++++++++++---------------------
 1 file changed, 97 insertions(+), 96 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 332c3aee1346..fac603a16a15 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -338,7 +338,6 @@ do {							\
 
 /* Not-exported routines. */
 
-int cdrom_get_last_written(struct cdrom_device_info *, long *);
 static int cdrom_get_next_writable(struct cdrom_device_info *, long *);
 static void cdrom_count_tracks(struct cdrom_device_info *, tracktype*);
 
@@ -2740,6 +2739,103 @@ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size)
 	return cdo->generic_packet(cdi, &cgc);
 }
 
+static int cdrom_get_track_info(struct cdrom_device_info *cdi,
+				__u16 track, __u8 type, track_information *ti)
+{
+	struct cdrom_device_ops *cdo = cdi->ops;
+	struct packet_command cgc;
+	int ret, buflen;
+
+	init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ);
+	cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO;
+	cgc.cmd[1] = type & 3;
+	cgc.cmd[4] = (track & 0xff00) >> 8;
+	cgc.cmd[5] = track & 0xff;
+	cgc.cmd[8] = 8;
+	cgc.quiet = 1;
+
+	ret = cdo->generic_packet(cdi, &cgc);
+	if (ret)
+		return ret;
+
+	buflen = be16_to_cpu(ti->track_information_length) +
+		sizeof(ti->track_information_length);
+
+	if (buflen > sizeof(track_information))
+		buflen = sizeof(track_information);
+
+	cgc.cmd[8] = cgc.buflen = buflen;
+	ret = cdo->generic_packet(cdi, &cgc);
+	if (ret)
+		return ret;
+
+	/* return actual fill size */
+	return buflen;
+}
+
+/* return the last written block on the CD-R media. this is for the udf
+   file system. */
+int cdrom_get_last_written(struct cdrom_device_info *cdi, long *last_written)
+{
+	struct cdrom_tocentry toc;
+	disc_information di;
+	track_information ti;
+	__u32 last_track;
+	int ret = -1, ti_size;
+
+	if (!CDROM_CAN(CDC_GENERIC_PACKET))
+		goto use_toc;
+
+	ret = cdrom_get_disc_info(cdi, &di);
+	if (ret < (int)(offsetof(typeof(di), last_track_lsb)
+			+ sizeof(di.last_track_lsb)))
+		goto use_toc;
+
+	/* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */
+	last_track = (di.last_track_msb << 8) | di.last_track_lsb;
+	ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
+	if (ti_size < (int)offsetof(typeof(ti), track_start))
+		goto use_toc;
+
+	/* if this track is blank, try the previous. */
+	if (ti.blank) {
+		if (last_track == 1)
+			goto use_toc;
+		last_track--;
+		ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
+	}
+
+	if (ti_size < (int)(offsetof(typeof(ti), track_size)
+				+ sizeof(ti.track_size)))
+		goto use_toc;
+
+	/* if last recorded field is valid, return it. */
+	if (ti.lra_v && ti_size >= (int)(offsetof(typeof(ti), last_rec_address)
+				+ sizeof(ti.last_rec_address))) {
+		*last_written = be32_to_cpu(ti.last_rec_address);
+	} else {
+		/* make it up instead */
+		*last_written = be32_to_cpu(ti.track_start) +
+				be32_to_cpu(ti.track_size);
+		if (ti.free_blocks)
+			*last_written -= (be32_to_cpu(ti.free_blocks) + 7);
+	}
+	return 0;
+
+	/* this is where we end up if the drive either can't do a
+	   GPCMD_READ_DISC_INFO or GPCMD_READ_TRACK_RZONE_INFO or if
+	   it doesn't give enough information or fails. then we return
+	   the toc contents. */
+use_toc:
+	toc.cdte_format = CDROM_MSF;
+	toc.cdte_track = CDROM_LEADOUT;
+	if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &toc)))
+		return ret;
+	sanitize_format(&toc.cdte_addr, &toc.cdte_format, CDROM_LBA);
+	*last_written = toc.cdte_addr.lba;
+	return 0;
+}
+
 static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
 					      void __user *arg,
 					      struct packet_command *cgc,
@@ -3210,38 +3306,6 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
 	return -ENOSYS;
 }
 
-static int cdrom_get_track_info(struct cdrom_device_info *cdi, __u16 track, __u8 type,
-			 track_information *ti)
-{
-	struct cdrom_device_ops *cdo = cdi->ops;
-	struct packet_command cgc;
-	int ret, buflen;
-
-	init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ);
-	cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO;
-	cgc.cmd[1] = type & 3;
-	cgc.cmd[4] = (track & 0xff00) >> 8;
-	cgc.cmd[5] = track & 0xff;
-	cgc.cmd[8] = 8;
-	cgc.quiet = 1;
-
-	if ((ret = cdo->generic_packet(cdi, &cgc)))
-		return ret;
-	
-	buflen = be16_to_cpu(ti->track_information_length) +
-		     sizeof(ti->track_information_length);
-
-	if (buflen > sizeof(track_information))
-		buflen = sizeof(track_information);
-
-	cgc.cmd[8] = cgc.buflen = buflen;
-	if ((ret = cdo->generic_packet(cdi, &cgc)))
-		return ret;
-
-	/* return actual fill size */
-	return buflen;
-}
-
 /* requires CD R/RW */
 static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di)
 {
@@ -3275,69 +3339,6 @@ static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *
 	return buflen;
 }
 
-/* return the last written block on the CD-R media. this is for the udf
-   file system. */
-int cdrom_get_last_written(struct cdrom_device_info *cdi, long *last_written)
-{
-	struct cdrom_tocentry toc;
-	disc_information di;
-	track_information ti;
-	__u32 last_track;
-	int ret = -1, ti_size;
-
-	if (!CDROM_CAN(CDC_GENERIC_PACKET))
-		goto use_toc;
-
-	ret = cdrom_get_disc_info(cdi, &di);
-	if (ret < (int)(offsetof(typeof(di), last_track_lsb)
-			+ sizeof(di.last_track_lsb)))
-		goto use_toc;
-
-	/* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */
-	last_track = (di.last_track_msb << 8) | di.last_track_lsb;
-	ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
-	if (ti_size < (int)offsetof(typeof(ti), track_start))
-		goto use_toc;
-
-	/* if this track is blank, try the previous. */
-	if (ti.blank) {
-		if (last_track==1)
-			goto use_toc;
-		last_track--;
-		ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
-	}
-
-	if (ti_size < (int)(offsetof(typeof(ti), track_size)
-				+ sizeof(ti.track_size)))
-		goto use_toc;
-
-	/* if last recorded field is valid, return it. */
-	if (ti.lra_v && ti_size >= (int)(offsetof(typeof(ti), last_rec_address)
-				+ sizeof(ti.last_rec_address))) {
-		*last_written = be32_to_cpu(ti.last_rec_address);
-	} else {
-		/* make it up instead */
-		*last_written = be32_to_cpu(ti.track_start) +
-				be32_to_cpu(ti.track_size);
-		if (ti.free_blocks)
-			*last_written -= (be32_to_cpu(ti.free_blocks) + 7);
-	}
-	return 0;
-
-	/* this is where we end up if the drive either can't do a
-	   GPCMD_READ_DISC_INFO or GPCMD_READ_TRACK_RZONE_INFO or if
-	   it doesn't give enough information or fails. then we return
-	   the toc contents. */
-use_toc:
-	toc.cdte_format = CDROM_MSF;
-	toc.cdte_track = CDROM_LEADOUT;
-	if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &toc)))
-		return ret;
-	sanitize_format(&toc.cdte_addr, &toc.cdte_format, CDROM_LBA);
-	*last_written = toc.cdte_addr.lba;
-	return 0;
-}
-
 /* return the next writable block. also for udf file system. */
 static int cdrom_get_next_writable(struct cdrom_device_info *cdi, long *next_writable)
 {

From dac1c5cf448c2bfdb8f6e1fe10a0eec616c34138 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:10 -0700
Subject: [PATCH 40/54] cdrom: Remove cdrom_get_next_writeable prototype

Move the function to the right spot instead.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 101 +++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 50 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index fac603a16a15..8888ed3a8d4f 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -338,7 +338,6 @@ do {							\
 
 /* Not-exported routines. */
 
-static int cdrom_get_next_writable(struct cdrom_device_info *, long *);
 static void cdrom_count_tracks(struct cdrom_device_info *, tracktype*);
 
 static int cdrom_mrw_exit(struct cdrom_device_info *cdi);
@@ -2836,6 +2835,57 @@ use_toc:
 	return 0;
 }
 
+/* return the next writable block. also for udf file system. */
+static int cdrom_get_next_writable(struct cdrom_device_info *cdi,
+				   long *next_writable)
+{
+	disc_information di;
+	track_information ti;
+	__u16 last_track;
+	int ret, ti_size;
+
+	if (!CDROM_CAN(CDC_GENERIC_PACKET))
+		goto use_last_written;
+
+	ret = cdrom_get_disc_info(cdi, &di);
+	if (ret < 0 || ret < offsetof(typeof(di), last_track_lsb)
+				+ sizeof(di.last_track_lsb))
+		goto use_last_written;
+
+	/* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */
+	last_track = (di.last_track_msb << 8) | di.last_track_lsb;
+	ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
+	if (ti_size < 0 || ti_size < offsetof(typeof(ti), track_start))
+		goto use_last_written;
+
+	/* if this track is blank, try the previous. */
+	if (ti.blank) {
+		if (last_track == 1)
+			goto use_last_written;
+		last_track--;
+		ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
+		if (ti_size < 0)
+			goto use_last_written;
+	}
+
+	/* if next recordable address field is valid, use it. */
+	if (ti.nwa_v && ti_size >= offsetof(typeof(ti), next_writable)
+				+ sizeof(ti.next_writable)) {
+		*next_writable = be32_to_cpu(ti.next_writable);
+		return 0;
+	}
+
+use_last_written:
+	ret = cdrom_get_last_written(cdi, next_writable);
+	if (ret) {
+		*next_writable = 0;
+		return ret;
+	} else {
+		*next_writable += 7;
+		return 0;
+	}
+}
+
 static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
 					      void __user *arg,
 					      struct packet_command *cgc,
@@ -3339,55 +3389,6 @@ static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *
 	return buflen;
 }
 
-/* return the next writable block. also for udf file system. */
-static int cdrom_get_next_writable(struct cdrom_device_info *cdi, long *next_writable)
-{
-	disc_information di;
-	track_information ti;
-	__u16 last_track;
-	int ret, ti_size;
-
-	if (!CDROM_CAN(CDC_GENERIC_PACKET))
-		goto use_last_written;
-
-	ret = cdrom_get_disc_info(cdi, &di);
-	if (ret < 0 || ret < offsetof(typeof(di), last_track_lsb)
-				+ sizeof(di.last_track_lsb))
-		goto use_last_written;
-
-	/* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */
-	last_track = (di.last_track_msb << 8) | di.last_track_lsb;
-	ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
-	if (ti_size < 0 || ti_size < offsetof(typeof(ti), track_start))
-		goto use_last_written;
-
-        /* if this track is blank, try the previous. */
-	if (ti.blank) {
-		if (last_track == 1)
-			goto use_last_written;
-		last_track--;
-		ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
-		if (ti_size < 0)
-			goto use_last_written;
-	}
-
-	/* if next recordable address field is valid, use it. */
-	if (ti.nwa_v && ti_size >= offsetof(typeof(ti), next_writable)
-				+ sizeof(ti.next_writable)) {
-		*next_writable = be32_to_cpu(ti.next_writable);
-		return 0;
-	}
-
-use_last_written:
-	if ((ret = cdrom_get_last_written(cdi, next_writable))) {
-		*next_writable = 0;
-		return ret;
-	} else {
-		*next_writable += 7;
-		return 0;
-	}
-}
-
 EXPORT_SYMBOL(cdrom_get_last_written);
 EXPORT_SYMBOL(register_cdrom);
 EXPORT_SYMBOL(unregister_cdrom);

From a803393bf8b2b0dd69d9aa87471e7afee91fbdc0 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:11 -0700
Subject: [PATCH 41/54] cdrom: Remove cdrom_count_tracks prototype

Move function to proper location instead.
Fix whitespace and embedded if too.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 94 +++++++++++++++++++++----------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 8888ed3a8d4f..f614847aad8f 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -338,8 +338,6 @@ do {							\
 
 /* Not-exported routines. */
 
-static void cdrom_count_tracks(struct cdrom_device_info *, tracktype*);
-
 static int cdrom_mrw_exit(struct cdrom_device_info *cdi);
 
 static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di);
@@ -948,6 +946,53 @@ static int cdrom_close_write(struct cdrom_device_info *cdi)
 #endif
 }
 
+/* badly broken, I know. Is due for a fixup anytime. */
+static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype *tracks)
+{
+	struct cdrom_tochdr header;
+	struct cdrom_tocentry entry;
+	int ret, i;
+	tracks->data = 0;
+	tracks->audio = 0;
+	tracks->cdi = 0;
+	tracks->xa = 0;
+	tracks->error = 0;
+	cd_dbg(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n");
+	/* Grab the TOC header so we can see how many tracks there are */
+	ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header);
+	if (ret) {
+		if (ret == -ENOMEDIUM)
+			tracks->error = CDS_NO_DISC;
+		else
+			tracks->error = CDS_NO_INFO;
+		return;
+	}
+	/* check what type of tracks are on this disc */
+	entry.cdte_format = CDROM_MSF;
+	for (i = header.cdth_trk0; i <= header.cdth_trk1; i++) {
+		entry.cdte_track = i;
+		if (cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &entry)) {
+			tracks->error = CDS_NO_INFO;
+			return;
+		}
+		if (entry.cdte_ctrl & CDROM_DATA_TRACK) {
+			if (entry.cdte_format == 0x10)
+				tracks->cdi++;
+			else if (entry.cdte_format == 0x20)
+				tracks->xa++;
+			else
+				tracks->data++;
+		} else {
+			tracks->audio++;
+		}
+		cd_dbg(CD_COUNT_TRACKS, "track %d: format=%d, ctrl=%d\n",
+		       i, entry.cdte_format, entry.cdte_ctrl);
+	}
+	cd_dbg(CD_COUNT_TRACKS, "disc has %d tracks: %d=audio %d=data %d=Cd-I %d=XA\n",
+	       header.cdth_trk1, tracks->audio, tracks->data,
+	       tracks->cdi, tracks->xa);
+}
+
 static
 int open_for_data(struct cdrom_device_info *cdi)
 {
@@ -1457,51 +1502,6 @@ int cdrom_media_changed(struct cdrom_device_info *cdi)
 	return media_changed(cdi, 0);
 }
 
-/* badly broken, I know. Is due for a fixup anytime. */
-static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype* tracks)
-{
-	struct cdrom_tochdr header;
-	struct cdrom_tocentry entry;
-	int ret, i;
-	tracks->data=0;
-	tracks->audio=0;
-	tracks->cdi=0;
-	tracks->xa=0;
-	tracks->error=0;
-	cd_dbg(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n");
-	/* Grab the TOC header so we can see how many tracks there are */
-	if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header))) {
-		if (ret == -ENOMEDIUM)
-			tracks->error = CDS_NO_DISC;
-		else
-			tracks->error = CDS_NO_INFO;
-		return;
-	}	
-	/* check what type of tracks are on this disc */
-	entry.cdte_format = CDROM_MSF;
-	for (i = header.cdth_trk0; i <= header.cdth_trk1; i++) {
-		entry.cdte_track  = i;
-		if (cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &entry)) {
-			tracks->error=CDS_NO_INFO;
-			return;
-		}	
-		if (entry.cdte_ctrl & CDROM_DATA_TRACK) {
-		    if (entry.cdte_format == 0x10)
-			tracks->cdi++;
-		    else if (entry.cdte_format == 0x20) 
-			tracks->xa++;
-		    else
-			tracks->data++;
-		} else
-		    tracks->audio++;
-		cd_dbg(CD_COUNT_TRACKS, "track %d: format=%d, ctrl=%d\n",
-		       i, entry.cdte_format, entry.cdte_ctrl);
-	}	
-	cd_dbg(CD_COUNT_TRACKS, "disc has %d tracks: %d=audio %d=data %d=Cd-I %d=XA\n",
-	       header.cdth_trk1, tracks->audio, tracks->data,
-	       tracks->cdi, tracks->xa);
-}	
-
 /* Requests to the low-level drivers will /always/ be done in the
    following format convention:
 

From 40569c6144242087c9663c5d0c93138882dd52e8 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:12 -0700
Subject: [PATCH 42/54] cdrom: Remove unnecessary prototype for cdrom_mrw_exit

Move the function to appropriate locations instead.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 240 +++++++++++++++++++++---------------------
 1 file changed, 122 insertions(+), 118 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index f614847aad8f..c8ca3426c5b0 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -338,8 +338,6 @@ do {							\
 
 /* Not-exported routines. */
 
-static int cdrom_mrw_exit(struct cdrom_device_info *cdi);
-
 static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di);
 
 static void cdrom_sysctl_register(void);
@@ -359,112 +357,28 @@ static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
 	return -EIO;
 }
 
+static int cdrom_flush_cache(struct cdrom_device_info *cdi)
+{
+	struct packet_command cgc;
+
+	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+	cgc.cmd[0] = GPCMD_FLUSH_CACHE;
+
+	cgc.timeout = 5 * 60 * HZ;
+
+	return cdi->ops->generic_packet(cdi, &cgc);
+}
+
 /* This macro makes sure we don't have to check on cdrom_device_ops
  * existence in the run-time routines below. Change_capability is a
  * hack to have the capability flags defined const, while we can still
  * change it here without gcc complaining at every line.
  */
-#define ENSURE(call, bits) if (cdo->call == NULL) *change_capability &= ~(bits)
-
-int register_cdrom(struct cdrom_device_info *cdi)
-{
-	static char banner_printed;
-        struct cdrom_device_ops *cdo = cdi->ops;
-        int *change_capability = (int *)&cdo->capability; /* hack */
-
-	cd_dbg(CD_OPEN, "entering register_cdrom\n");
-
-	if (cdo->open == NULL || cdo->release == NULL)
-		return -EINVAL;
-	if (!banner_printed) {
-		pr_info("Uniform CD-ROM driver " REVISION "\n");
-		banner_printed = 1;
-		cdrom_sysctl_register();
-	}
-
-	ENSURE(drive_status, CDC_DRIVE_STATUS );
-	if (cdo->check_events == NULL && cdo->media_changed == NULL)
-		*change_capability = ~(CDC_MEDIA_CHANGED | CDC_SELECT_DISC);
-	ENSURE(tray_move, CDC_CLOSE_TRAY | CDC_OPEN_TRAY);
-	ENSURE(lock_door, CDC_LOCK);
-	ENSURE(select_speed, CDC_SELECT_SPEED);
-	ENSURE(get_last_session, CDC_MULTI_SESSION);
-	ENSURE(get_mcn, CDC_MCN);
-	ENSURE(reset, CDC_RESET);
-	ENSURE(generic_packet, CDC_GENERIC_PACKET);
-	cdi->mc_flags = 0;
-	cdo->n_minors = 0;
-        cdi->options = CDO_USE_FFLAGS;
-	
-	if (autoclose==1 && CDROM_CAN(CDC_CLOSE_TRAY))
-		cdi->options |= (int) CDO_AUTO_CLOSE;
-	if (autoeject==1 && CDROM_CAN(CDC_OPEN_TRAY))
-		cdi->options |= (int) CDO_AUTO_EJECT;
-	if (lockdoor==1)
-		cdi->options |= (int) CDO_LOCK;
-	if (check_media_type==1)
-		cdi->options |= (int) CDO_CHECK_TYPE;
-
-	if (CDROM_CAN(CDC_MRW_W))
-		cdi->exit = cdrom_mrw_exit;
-
-	if (cdi->disk)
-		cdi->cdda_method = CDDA_BPC_FULL;
-	else
-		cdi->cdda_method = CDDA_OLD;
-
-	if (!cdo->generic_packet)
-		cdo->generic_packet = cdrom_dummy_generic_packet;
-
-	cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name);
-	mutex_lock(&cdrom_mutex);
-	list_add(&cdi->list, &cdrom_list);
-	mutex_unlock(&cdrom_mutex);
-	return 0;
-}
-#undef ENSURE
-
-void unregister_cdrom(struct cdrom_device_info *cdi)
-{
-	cd_dbg(CD_OPEN, "entering unregister_cdrom\n");
-
-	mutex_lock(&cdrom_mutex);
-	list_del(&cdi->list);
-	mutex_unlock(&cdrom_mutex);
-
-	if (cdi->exit)
-		cdi->exit(cdi);
-
-	cdi->ops->n_minors--;
-	cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name);
-}
-
-int cdrom_get_media_event(struct cdrom_device_info *cdi,
-			  struct media_event_desc *med)
-{
-	struct packet_command cgc;
-	unsigned char buffer[8];
-	struct event_header *eh = (struct event_header *) buffer;
-
-	init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ);
-	cgc.cmd[0] = GPCMD_GET_EVENT_STATUS_NOTIFICATION;
-	cgc.cmd[1] = 1;		/* IMMED */
-	cgc.cmd[4] = 1 << 4;	/* media event */
-	cgc.cmd[8] = sizeof(buffer);
-	cgc.quiet = 1;
-
-	if (cdi->ops->generic_packet(cdi, &cgc))
-		return 1;
-
-	if (be16_to_cpu(eh->data_len) < sizeof(*med))
-		return 1;
-
-	if (eh->nea || eh->notification_class != 0x4)
-		return 1;
-
-	memcpy(med, &buffer[sizeof(*eh)], sizeof(*med));
-	return 0;
-}
+#define ENSURE(call, bits)			\
+do {						\
+	if (cdo->call == NULL)			\
+		*change_capability &= ~(bits);	\
+} while (0)
 
 /*
  * the first prototypes used 0x2c as the page code for the mrw mode page,
@@ -582,18 +496,6 @@ static int cdrom_mrw_bgformat_susp(struct cdrom_device_info *cdi, int immed)
 	return cdi->ops->generic_packet(cdi, &cgc);
 }
 
-static int cdrom_flush_cache(struct cdrom_device_info *cdi)
-{
-	struct packet_command cgc;
-
-	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
-	cgc.cmd[0] = GPCMD_FLUSH_CACHE;
-
-	cgc.timeout = 5 * 60 * HZ;
-
-	return cdi->ops->generic_packet(cdi, &cgc);
-}
-
 static int cdrom_mrw_exit(struct cdrom_device_info *cdi)
 {
 	disc_information di;
@@ -627,17 +529,19 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space)
 	cgc.buffer = buffer;
 	cgc.buflen = sizeof(buffer);
 
-	if ((ret = cdrom_mode_sense(cdi, &cgc, cdi->mrw_mode_page, 0)))
+	ret = cdrom_mode_sense(cdi, &cgc, cdi->mrw_mode_page, 0);
+	if (ret)
 		return ret;
 
-	mph = (struct mode_page_header *) buffer;
+	mph = (struct mode_page_header *)buffer;
 	offset = be16_to_cpu(mph->desc_length);
 	size = be16_to_cpu(mph->mode_data_length) + 2;
 
 	buffer[offset + 3] = space;
 	cgc.buflen = size;
 
-	if ((ret = cdrom_mode_select(cdi, &cgc)))
+	ret = cdrom_mode_select(cdi, &cgc);
+	if (ret)
 		return ret;
 
 	pr_info("%s: mrw address space %s selected\n",
@@ -645,6 +549,106 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space)
 	return 0;
 }
 
+int register_cdrom(struct cdrom_device_info *cdi)
+{
+	static char banner_printed;
+	struct cdrom_device_ops *cdo = cdi->ops;
+	int *change_capability = (int *)&cdo->capability; /* hack */
+
+	cd_dbg(CD_OPEN, "entering register_cdrom\n");
+
+	if (cdo->open == NULL || cdo->release == NULL)
+		return -EINVAL;
+	if (!banner_printed) {
+		pr_info("Uniform CD-ROM driver " REVISION "\n");
+		banner_printed = 1;
+		cdrom_sysctl_register();
+	}
+
+	ENSURE(drive_status, CDC_DRIVE_STATUS);
+	if (cdo->check_events == NULL && cdo->media_changed == NULL)
+		*change_capability = ~(CDC_MEDIA_CHANGED | CDC_SELECT_DISC);
+	ENSURE(tray_move, CDC_CLOSE_TRAY | CDC_OPEN_TRAY);
+	ENSURE(lock_door, CDC_LOCK);
+	ENSURE(select_speed, CDC_SELECT_SPEED);
+	ENSURE(get_last_session, CDC_MULTI_SESSION);
+	ENSURE(get_mcn, CDC_MCN);
+	ENSURE(reset, CDC_RESET);
+	ENSURE(generic_packet, CDC_GENERIC_PACKET);
+	cdi->mc_flags = 0;
+	cdo->n_minors = 0;
+	cdi->options = CDO_USE_FFLAGS;
+
+	if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY))
+		cdi->options |= (int) CDO_AUTO_CLOSE;
+	if (autoeject == 1 && CDROM_CAN(CDC_OPEN_TRAY))
+		cdi->options |= (int) CDO_AUTO_EJECT;
+	if (lockdoor == 1)
+		cdi->options |= (int) CDO_LOCK;
+	if (check_media_type == 1)
+		cdi->options |= (int) CDO_CHECK_TYPE;
+
+	if (CDROM_CAN(CDC_MRW_W))
+		cdi->exit = cdrom_mrw_exit;
+
+	if (cdi->disk)
+		cdi->cdda_method = CDDA_BPC_FULL;
+	else
+		cdi->cdda_method = CDDA_OLD;
+
+	if (!cdo->generic_packet)
+		cdo->generic_packet = cdrom_dummy_generic_packet;
+
+	cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name);
+	mutex_lock(&cdrom_mutex);
+	list_add(&cdi->list, &cdrom_list);
+	mutex_unlock(&cdrom_mutex);
+	return 0;
+}
+#undef ENSURE
+
+void unregister_cdrom(struct cdrom_device_info *cdi)
+{
+	cd_dbg(CD_OPEN, "entering unregister_cdrom\n");
+
+	mutex_lock(&cdrom_mutex);
+	list_del(&cdi->list);
+	mutex_unlock(&cdrom_mutex);
+
+	if (cdi->exit)
+		cdi->exit(cdi);
+
+	cdi->ops->n_minors--;
+	cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name);
+}
+
+int cdrom_get_media_event(struct cdrom_device_info *cdi,
+			  struct media_event_desc *med)
+{
+	struct packet_command cgc;
+	unsigned char buffer[8];
+	struct event_header *eh = (struct event_header *)buffer;
+
+	init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ);
+	cgc.cmd[0] = GPCMD_GET_EVENT_STATUS_NOTIFICATION;
+	cgc.cmd[1] = 1;		/* IMMED */
+	cgc.cmd[4] = 1 << 4;	/* media event */
+	cgc.cmd[8] = sizeof(buffer);
+	cgc.quiet = 1;
+
+	if (cdi->ops->generic_packet(cdi, &cgc))
+		return 1;
+
+	if (be16_to_cpu(eh->data_len) < sizeof(*med))
+		return 1;
+
+	if (eh->nea || eh->notification_class != 0x4)
+		return 1;
+
+	memcpy(med, &buffer[sizeof(*eh)], sizeof(*med));
+	return 0;
+}
+
 static int cdrom_get_random_writable(struct cdrom_device_info *cdi,
 			      struct rwrt_feature_desc *rfd)
 {

From bd6f0bba1d2705748ec94e0aa23ae0c5bd6b2287 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 4 May 2014 17:05:13 -0700
Subject: [PATCH 43/54] cdrom: Remove unnecessary prototype for
 cdrom_get_disc_info

Move the function to the proper spot instead.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/cdrom/cdrom.c | 71 ++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index c8ca3426c5b0..49ac5662585b 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -338,8 +338,6 @@ do {							\
 
 /* Not-exported routines. */
 
-static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di);
-
 static void cdrom_sysctl_register(void);
 
 static LIST_HEAD(cdrom_list);
@@ -369,6 +367,42 @@ static int cdrom_flush_cache(struct cdrom_device_info *cdi)
 	return cdi->ops->generic_packet(cdi, &cgc);
 }
 
+/* requires CD R/RW */
+static int cdrom_get_disc_info(struct cdrom_device_info *cdi,
+			       disc_information *di)
+{
+	struct cdrom_device_ops *cdo = cdi->ops;
+	struct packet_command cgc;
+	int ret, buflen;
+
+	/* set up command and get the disc info */
+	init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ);
+	cgc.cmd[0] = GPCMD_READ_DISC_INFO;
+	cgc.cmd[8] = cgc.buflen = 2;
+	cgc.quiet = 1;
+
+	ret = cdo->generic_packet(cdi, &cgc);
+	if (ret)
+		return ret;
+
+	/* not all drives have the same disc_info length, so requeue
+	 * packet with the length the drive tells us it can supply
+	 */
+	buflen = be16_to_cpu(di->disc_information_length) +
+		sizeof(di->disc_information_length);
+
+	if (buflen > sizeof(disc_information))
+		buflen = sizeof(disc_information);
+
+	cgc.cmd[8] = cgc.buflen = buflen;
+	ret = cdo->generic_packet(cdi, &cgc);
+	if (ret)
+		return ret;
+
+	/* return actual fill size */
+	return buflen;
+}
+
 /* This macro makes sure we don't have to check on cdrom_device_ops
  * existence in the run-time routines below. Change_capability is a
  * hack to have the capability flags defined const, while we can still
@@ -3360,39 +3394,6 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
 	return -ENOSYS;
 }
 
-/* requires CD R/RW */
-static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di)
-{
-	struct cdrom_device_ops *cdo = cdi->ops;
-	struct packet_command cgc;
-	int ret, buflen;
-
-	/* set up command and get the disc info */
-	init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ);
-	cgc.cmd[0] = GPCMD_READ_DISC_INFO;
-	cgc.cmd[8] = cgc.buflen = 2;
-	cgc.quiet = 1;
-
-	if ((ret = cdo->generic_packet(cdi, &cgc)))
-		return ret;
-
-	/* not all drives have the same disc_info length, so requeue
-	 * packet with the length the drive tells us it can supply
-	 */
-	buflen = be16_to_cpu(di->disc_information_length) +
-		     sizeof(di->disc_information_length);
-
-	if (buflen > sizeof(disc_information))
-		buflen = sizeof(disc_information);
-
-	cgc.cmd[8] = cgc.buflen = buflen;
-	if ((ret = cdo->generic_packet(cdi, &cgc)))
-		return ret;
-
-	/* return actual fill size */
-	return buflen;
-}
-
 EXPORT_SYMBOL(cdrom_get_last_written);
 EXPORT_SYMBOL(register_cdrom);
 EXPORT_SYMBOL(unregister_cdrom);

From ffc771b3ca8b2c03e5e9faa6335b4862108f111f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 9 May 2014 09:42:02 -0600
Subject: [PATCH 44/54] mtip32xx: convert to use blk-mq

This rips out timeout handling, requeueing, etc in converting
it to use blk-mq instead.

Acked-by: Asai Thambi S P <asamymuthupa@micron.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 907 ++++++++++--------------------
 drivers/block/mtip32xx/mtip32xx.h |  24 +-
 2 files changed, 306 insertions(+), 625 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index fb624469d0ee..3a0882ee1642 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -31,6 +31,7 @@
 #include <linux/module.h>
 #include <linux/genhd.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/bio.h>
 #include <linux/dma-mapping.h>
 #include <linux/idr.h>
@@ -173,60 +174,36 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
 	return false; /* device present */
 }
 
-/*
- * Obtain an empty command slot.
- *
- * This function needs to be reentrant since it could be called
- * at the same time on multiple CPUs. The allocation of the
- * command slot must be atomic.
- *
- * @port Pointer to the port data structure.
- *
- * return value
- *	>= 0	Index of command slot obtained.
- *	-1	No command slots available.
- */
-static int get_slot(struct mtip_port *port)
+static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
 {
-	int slot, i;
-	unsigned int num_command_slots = port->dd->slot_groups * 32;
+	struct request *rq;
 
-	/*
-	 * Try 10 times, because there is a small race here.
-	 *  that's ok, because it's still cheaper than a lock.
-	 *
-	 * Race: Since this section is not protected by lock, same bit
-	 * could be chosen by different process contexts running in
-	 * different processor. So instead of costly lock, we are going
-	 * with loop.
-	 */
-	for (i = 0; i < 10; i++) {
-		slot = find_next_zero_bit(port->allocated,
-					 num_command_slots, 1);
-		if ((slot < num_command_slots) &&
-		    (!test_and_set_bit(slot, port->allocated)))
-			return slot;
-	}
-	dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n");
+	rq = blk_mq_alloc_reserved_request(dd->queue, 0, __GFP_WAIT);
+	return blk_mq_rq_to_pdu(rq);
+}
 
-	mtip_check_surprise_removal(port->dd->pdev);
-	return -1;
+static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd)
+{
+	blk_put_request(blk_mq_rq_from_pdu(cmd));
 }
 
 /*
- * Release a command slot.
- *
- * @port Pointer to the port data structure.
- * @tag  Tag of command to release
- *
- * return value
- *	None
+ * Once we add support for one hctx per mtip group, this will change a bit
  */
-static inline void release_slot(struct mtip_port *port, int tag)
+static struct request *mtip_rq_from_tag(struct driver_data *dd,
+					unsigned int tag)
 {
-	smp_mb__before_clear_bit();
-	clear_bit(tag, port->allocated);
-	smp_mb__after_clear_bit();
+	struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0];
+
+	return blk_mq_tag_to_rq(hctx->tags, tag);
+}
+
+static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
+					  unsigned int tag)
+{
+	struct request *rq = mtip_rq_from_tag(dd, tag);
+
+	return blk_mq_rq_to_pdu(rq);
 }
 
 /*
@@ -248,93 +225,28 @@ static inline void release_slot(struct mtip_port *port, int tag)
  *	None
  */
 static void mtip_async_complete(struct mtip_port *port,
-				int tag,
-				void *data,
-				int status)
+				int tag, struct mtip_cmd *cmd, int status)
 {
-	struct mtip_cmd *cmd;
-	struct driver_data *dd = data;
-	int unaligned, cb_status = status ? -EIO : 0;
-	void (*func)(void *, int);
+	struct driver_data *dd = port->dd;
+	struct request *rq;
 
 	if (unlikely(!dd) || unlikely(!port))
 		return;
 
-	cmd = &port->commands[tag];
-
 	if (unlikely(status == PORT_IRQ_TF_ERR)) {
 		dev_warn(&port->dd->pdev->dev,
 			"Command tag %d failed due to TFE\n", tag);
 	}
 
-	/* Clear the active flag */
-	atomic_set(&port->commands[tag].active, 0);
+	/* Unmap the DMA scatter list entries */
+	dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction);
 
-	/* Upper layer callback */
-	func = cmd->async_callback;
-	if (likely(func && cmpxchg(&cmd->async_callback, func, 0) == func)) {
+	rq = mtip_rq_from_tag(dd, tag);
 
-		/* Unmap the DMA scatter list entries */
-		dma_unmap_sg(&dd->pdev->dev,
-			cmd->sg,
-			cmd->scatter_ents,
-			cmd->direction);
+	if (unlikely(cmd->unaligned))
+		up(&port->cmd_slot_unal);
 
-		func(cmd->async_data, cb_status);
-		unaligned = cmd->unaligned;
-
-		/* Clear the allocated bit for the command */
-		release_slot(port, tag);
-
-		if (unlikely(unaligned))
-			up(&port->cmd_slot_unal);
-		else
-			up(&port->cmd_slot);
-	}
-}
-
-/*
- * This function is called for clean the pending command in the
- * command slot during the surprise removal of device and return
- * error to the upper layer.
- *
- * @dd Pointer to the DRIVER_DATA structure.
- *
- * return value
- *	None
- */
-static void mtip_command_cleanup(struct driver_data *dd)
-{
-	int tag = 0;
-	struct mtip_cmd *cmd;
-	struct mtip_port *port = dd->port;
-	unsigned int num_cmd_slots = dd->slot_groups * 32;
-
-	if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag))
-		return;
-
-	if (!port)
-		return;
-
-	cmd = &port->commands[MTIP_TAG_INTERNAL];
-	if (atomic_read(&cmd->active))
-		if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) &
-					(1 << MTIP_TAG_INTERNAL))
-			if (cmd->comp_func)
-				cmd->comp_func(port, MTIP_TAG_INTERNAL,
-					 cmd->comp_data, -ENODEV);
-
-	while (1) {
-		tag = find_next_bit(port->allocated, num_cmd_slots, tag);
-		if (tag >= num_cmd_slots)
-			break;
-
-		cmd = &port->commands[tag];
-		if (atomic_read(&cmd->active))
-			mtip_async_complete(port, tag, dd, -ENODEV);
-	}
-
-	set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
+	blk_mq_end_io(rq, status ? -EIO : 0);
 }
 
 /*
@@ -388,8 +300,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
 {
 	int group = tag >> 5;
 
-	atomic_set(&port->commands[tag].active, 1);
-
 	/* guard SACT and CI registers */
 	spin_lock(&port->cmd_issue_lock[group]);
 	writel((1 << MTIP_TAG_BIT(tag)),
@@ -397,10 +307,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
 	writel((1 << MTIP_TAG_BIT(tag)),
 			port->cmd_issue[MTIP_TAG_INDEX(tag)]);
 	spin_unlock(&port->cmd_issue_lock[group]);
-
-	/* Set the command's timeout value.*/
-	port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
-					MTIP_NCQ_COMMAND_TIMEOUT_MS);
 }
 
 /*
@@ -648,131 +554,12 @@ static void print_tags(struct driver_data *dd,
 
 	memset(tagmap, 0, sizeof(tagmap));
 	for (group = SLOTBITS_IN_LONGS; group > 0; group--)
-		tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ",
+		tagmap_len += sprintf(tagmap + tagmap_len, "%016lX ",
 						tagbits[group-1]);
 	dev_warn(&dd->pdev->dev,
 			"%d command(s) %s: tagmap [%s]", cnt, msg, tagmap);
 }
 
-/*
- * Called periodically to see if any read/write commands are
- * taking too long to complete.
- *
- * @data Pointer to the PORT data structure.
- *
- * return value
- *	None
- */
-static void mtip_timeout_function(unsigned long int data)
-{
-	struct mtip_port *port = (struct mtip_port *) data;
-	struct host_to_dev_fis *fis;
-	struct mtip_cmd *cmd;
-	int unaligned, tag, cmdto_cnt = 0;
-	unsigned int bit, group;
-	unsigned int num_command_slots;
-	unsigned long to, tagaccum[SLOTBITS_IN_LONGS];
-	void (*func)(void *, int);
-
-	if (unlikely(!port))
-		return;
-
-	if (unlikely(port->dd->sr))
-		return;
-
-	if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) {
-		mod_timer(&port->cmd_timer,
-			jiffies + msecs_to_jiffies(30000));
-		return;
-	}
-	/* clear the tag accumulator */
-	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
-	num_command_slots = port->dd->slot_groups * 32;
-
-	for (tag = 0; tag < num_command_slots; tag++) {
-		/*
-		 * Skip internal command slot as it has
-		 * its own timeout mechanism
-		 */
-		if (tag == MTIP_TAG_INTERNAL)
-			continue;
-
-		if (atomic_read(&port->commands[tag].active) &&
-		   (time_after(jiffies, port->commands[tag].comp_time))) {
-			group = tag >> 5;
-			bit = tag & 0x1F;
-
-			cmd = &port->commands[tag];
-			fis = (struct host_to_dev_fis *) cmd->command;
-
-			set_bit(tag, tagaccum);
-			cmdto_cnt++;
-			if (cmdto_cnt == 1)
-				set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
-
-			/*
-			 * Clear the completed bit. This should prevent
-			 *  any interrupt handlers from trying to retire
-			 *  the command.
-			 */
-			writel(1 << bit, port->completed[group]);
-
-			/* Clear the active flag for the command */
-			atomic_set(&port->commands[tag].active, 0);
-
-			func = cmd->async_callback;
-			if (func &&
-			    cmpxchg(&cmd->async_callback, func, 0) == func) {
-
-				/* Unmap the DMA scatter list entries */
-				dma_unmap_sg(&port->dd->pdev->dev,
-						cmd->sg,
-						cmd->scatter_ents,
-						cmd->direction);
-
-				func(cmd->async_data, -EIO);
-				unaligned = cmd->unaligned;
-
-				/* Clear the allocated bit for the command. */
-				release_slot(port, tag);
-
-				if (unaligned)
-					up(&port->cmd_slot_unal);
-				else
-					up(&port->cmd_slot);
-			}
-		}
-	}
-
-	if (cmdto_cnt) {
-		print_tags(port->dd, "timed out", tagaccum, cmdto_cnt);
-		if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
-			mtip_device_reset(port->dd);
-			wake_up_interruptible(&port->svc_wait);
-		}
-		clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
-	}
-
-	if (port->ic_pause_timer) {
-		to  = port->ic_pause_timer + msecs_to_jiffies(1000);
-		if (time_after(jiffies, to)) {
-			if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
-				port->ic_pause_timer = 0;
-				clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
-				clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
-				clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
-				wake_up_interruptible(&port->svc_wait);
-			}
-
-
-		}
-	}
-
-	/* Restart the timer */
-	mod_timer(&port->cmd_timer,
-		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
-}
-
 /*
  * Internal command completion callback function.
  *
@@ -789,28 +576,19 @@ static void mtip_timeout_function(unsigned long int data)
  *	None
  */
 static void mtip_completion(struct mtip_port *port,
-			    int tag,
-			    void *data,
-			    int status)
+			    int tag, struct mtip_cmd *command, int status)
 {
-	struct mtip_cmd *command = &port->commands[tag];
-	struct completion *waiting = data;
+	struct completion *waiting = command->comp_data;
 	if (unlikely(status == PORT_IRQ_TF_ERR))
 		dev_warn(&port->dd->pdev->dev,
 			"Internal command %d completed with TFE\n", tag);
 
-	command->async_callback = NULL;
-	command->comp_func = NULL;
-
 	complete(waiting);
 }
 
 static void mtip_null_completion(struct mtip_port *port,
-			    int tag,
-			    void *data,
-			    int status)
+			    int tag, struct mtip_cmd *command, int status)
 {
-	return;
 }
 
 static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
@@ -843,18 +621,16 @@ static void mtip_handle_tfe(struct driver_data *dd)
 	port = dd->port;
 
 	/* Stop the timer to prevent command timeouts. */
-	del_timer(&port->cmd_timer);
 	set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
 
 	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
 			test_bit(MTIP_TAG_INTERNAL, port->allocated)) {
-		cmd = &port->commands[MTIP_TAG_INTERNAL];
+		cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
 		dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n");
 
-		atomic_inc(&cmd->active); /* active > 1 indicates error */
 		if (cmd->comp_data && cmd->comp_func) {
 			cmd->comp_func(port, MTIP_TAG_INTERNAL,
-					cmd->comp_data, PORT_IRQ_TF_ERR);
+					cmd, PORT_IRQ_TF_ERR);
 		}
 		goto handle_tfe_exit;
 	}
@@ -866,6 +642,8 @@ static void mtip_handle_tfe(struct driver_data *dd)
 	for (group = 0; group < dd->slot_groups; group++) {
 		completed = readl(port->completed[group]);
 
+		dev_warn(&dd->pdev->dev, "g=%u, comp=%x\n", group, completed);
+
 		/* clear completed status register in the hardware.*/
 		writel(completed, port->completed[group]);
 
@@ -879,15 +657,11 @@ static void mtip_handle_tfe(struct driver_data *dd)
 			if (tag == MTIP_TAG_INTERNAL)
 				continue;
 
-			cmd = &port->commands[tag];
+			cmd = mtip_cmd_from_tag(dd, tag);
 			if (likely(cmd->comp_func)) {
 				set_bit(tag, tagaccum);
 				cmd_cnt++;
-				atomic_set(&cmd->active, 0);
-				cmd->comp_func(port,
-					 tag,
-					 cmd->comp_data,
-					 0);
+				cmd->comp_func(port, tag, cmd, 0);
 			} else {
 				dev_err(&port->dd->pdev->dev,
 					"Missing completion func for tag %d",
@@ -947,11 +721,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
 		for (bit = 0; bit < 32; bit++) {
 			reissue = 1;
 			tag = (group << 5) + bit;
-			cmd = &port->commands[tag];
-
-			/* If the active bit is set re-issue the command */
-			if (atomic_read(&cmd->active) == 0)
-				continue;
+			cmd = mtip_cmd_from_tag(dd, tag);
 
 			fis = (struct host_to_dev_fis *)cmd->command;
 
@@ -970,11 +740,9 @@ static void mtip_handle_tfe(struct driver_data *dd)
 					tag,
 					fail_reason != NULL ?
 						fail_reason : "unknown");
-					atomic_set(&cmd->active, 0);
 					if (cmd->comp_func) {
 						cmd->comp_func(port, tag,
-							cmd->comp_data,
-							-ENODATA);
+							cmd, -ENODATA);
 					}
 					continue;
 				}
@@ -997,14 +765,9 @@ static void mtip_handle_tfe(struct driver_data *dd)
 			/* Retire a command that will not be reissued */
 			dev_warn(&port->dd->pdev->dev,
 				"retiring tag %d\n", tag);
-			atomic_set(&cmd->active, 0);
 
 			if (cmd->comp_func)
-				cmd->comp_func(
-					port,
-					tag,
-					cmd->comp_data,
-					PORT_IRQ_TF_ERR);
+				cmd->comp_func(port, tag, cmd, PORT_IRQ_TF_ERR);
 			else
 				dev_warn(&port->dd->pdev->dev,
 					"Bad completion for tag %d\n",
@@ -1017,9 +780,6 @@ handle_tfe_exit:
 	/* clear eh_active */
 	clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
 	wake_up_interruptible(&port->svc_wait);
-
-	mod_timer(&port->cmd_timer,
-		 jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
 }
 
 /*
@@ -1048,15 +808,10 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
 			if (unlikely(tag == MTIP_TAG_INTERNAL))
 				continue;
 
-			command = &port->commands[tag];
-			/* make internal callback */
-			if (likely(command->comp_func)) {
-				command->comp_func(
-					port,
-					tag,
-					command->comp_data,
-					0);
-			} else {
+			command = mtip_cmd_from_tag(dd, tag);
+			if (likely(command->comp_func))
+				command->comp_func(port, tag, command, 0);
+			else {
 				dev_dbg(&dd->pdev->dev,
 					"Null completion for tag %d",
 					tag);
@@ -1081,16 +836,13 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
 static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
 {
 	struct mtip_port *port = dd->port;
-	struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL];
+	struct mtip_cmd *cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
 
 	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
 	    (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL])
 		& (1 << MTIP_TAG_INTERNAL))) {
 		if (cmd->comp_func) {
-			cmd->comp_func(port,
-				MTIP_TAG_INTERNAL,
-				cmd->comp_data,
-				0);
+			cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd, 0);
 			return;
 		}
 	}
@@ -1222,7 +974,6 @@ static irqreturn_t mtip_irq_handler(int irq, void *instance)
 
 static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
 {
-	atomic_set(&port->commands[tag].active, 1);
 	writel(1 << MTIP_TAG_BIT(tag),
 		port->cmd_issue[MTIP_TAG_INDEX(tag)]);
 }
@@ -1335,10 +1086,9 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 {
 	struct mtip_cmd_sg *command_sg;
 	DECLARE_COMPLETION_ONSTACK(wait);
-	int rv = 0, ready2go = 1;
-	struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL];
-	unsigned long to;
+	struct mtip_cmd *int_cmd;
 	struct driver_data *dd = port->dd;
+	int rv = 0;
 
 	/* Make sure the buffer is 8 byte aligned. This is asic specific. */
 	if (buffer & 0x00000007) {
@@ -1346,19 +1096,8 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 		return -EFAULT;
 	}
 
-	to = jiffies + msecs_to_jiffies(timeout);
-	do {
-		ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL,
-						port->allocated);
-		if (ready2go)
-			break;
-		mdelay(100);
-	} while (time_before(jiffies, to));
-	if (!ready2go) {
-		dev_warn(&dd->pdev->dev,
-			"Internal cmd active. new cmd [%02X]\n", fis->command);
-		return -EBUSY;
-	}
+	int_cmd = mtip_get_int_command(dd);
+
 	set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
 	port->ic_pause_timer = 0;
 
@@ -1371,7 +1110,7 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 			if (mtip_quiesce_io(port, 5000) < 0) {
 				dev_warn(&dd->pdev->dev,
 					"Failed to quiesce IO\n");
-				release_slot(port, MTIP_TAG_INTERNAL);
+				mtip_put_int_command(dd, int_cmd);
 				clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
 				wake_up_interruptible(&port->svc_wait);
 				return -EBUSY;
@@ -1497,8 +1236,7 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 	}
 exec_ic_exit:
 	/* Clear the allocated and active bits for the internal command. */
-	atomic_set(&int_cmd->active, 0);
-	release_slot(port, MTIP_TAG_INTERNAL);
+	mtip_put_int_command(dd, int_cmd);
 	if (rv >= 0 && mtip_pause_ncq(port, fis)) {
 		/* NCQ paused */
 		return rv;
@@ -2610,22 +2348,21 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
  * return value
  *	None
  */
-static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
-			      int nsect, int nents, int tag, void *callback,
-			      void *data, int dir, int unaligned)
+static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
+			      struct mtip_cmd *command, int nents,
+			      struct blk_mq_hw_ctx *hctx)
 {
 	struct host_to_dev_fis	*fis;
 	struct mtip_port *port = dd->port;
-	struct mtip_cmd *command = &port->commands[tag];
-	int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
-	u64 start = sector;
+	int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	u64 start = blk_rq_pos(rq);
+	unsigned int nsect = blk_rq_sectors(rq);
 
 	/* Map the scatter list for DMA access */
 	nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
 
 	command->scatter_ents = nents;
 
-	command->unaligned = unaligned;
 	/*
 	 * The number of retries for this command before it is
 	 * reported as a failure to the upper layers.
@@ -2636,8 +2373,10 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
 	fis = command->command;
 	fis->type        = 0x27;
 	fis->opts        = 1 << 7;
-	fis->command     =
-		(dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE);
+	if (rq_data_dir(rq) == READ)
+		fis->command = ATA_CMD_FPDMA_READ;
+	else
+		fis->command = ATA_CMD_FPDMA_WRITE;
 	fis->lba_low     = start & 0xFF;
 	fis->lba_mid     = (start >> 8) & 0xFF;
 	fis->lba_hi      = (start >> 16) & 0xFF;
@@ -2647,14 +2386,14 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
 	fis->device	 = 1 << 6;
 	fis->features    = nsect & 0xFF;
 	fis->features_ex = (nsect >> 8) & 0xFF;
-	fis->sect_count  = ((tag << 3) | (tag >> 5));
+	fis->sect_count  = ((rq->tag << 3) | (rq->tag >> 5));
 	fis->sect_cnt_ex = 0;
 	fis->control     = 0;
 	fis->res2        = 0;
 	fis->res3        = 0;
 	fill_command_sg(dd, command, nents);
 
-	if (unaligned)
+	if (command->unaligned)
 		fis->device |= 1 << 7;
 
 	/* Populate the command header */
@@ -2671,82 +2410,18 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
 	command->comp_func = mtip_async_complete;
 	command->direction = dma_dir;
 
-	/*
-	 * Set the completion function and data for the command passed
-	 * from the upper layer.
-	 */
-	command->async_data = data;
-	command->async_callback = callback;
-
 	/*
 	 * To prevent this command from being issued
 	 * if an internal command is in progress or error handling is active.
 	 */
 	if (port->flags & MTIP_PF_PAUSE_IO) {
-		set_bit(tag, port->cmds_to_issue);
+		set_bit(rq->tag, port->cmds_to_issue);
 		set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
 		return;
 	}
 
 	/* Issue the command to the hardware */
-	mtip_issue_ncq_command(port, tag);
-
-	return;
-}
-
-/*
- * Release a command slot.
- *
- * @dd  Pointer to the driver data structure.
- * @tag Slot tag
- *
- * return value
- *      None
- */
-static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag,
-								int unaligned)
-{
-	struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
-							&dd->port->cmd_slot;
-	release_slot(dd->port, tag);
-	up(sem);
-}
-
-/*
- * Obtain a command slot and return its associated scatter list.
- *
- * @dd  Pointer to the driver data structure.
- * @tag Pointer to an int that will receive the allocated command
- *            slot tag.
- *
- * return value
- *	Pointer to the scatter list for the allocated command slot
- *	or NULL if no command slots are available.
- */
-static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
-						   int *tag, int unaligned)
-{
-	struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
-							&dd->port->cmd_slot;
-
-	/*
-	 * It is possible that, even with this semaphore, a thread
-	 * may think that no command slots are available. Therefore, we
-	 * need to make an attempt to get_slot().
-	 */
-	down(sem);
-	*tag = get_slot(dd->port);
-
-	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
-		up(sem);
-		return NULL;
-	}
-	if (unlikely(*tag < 0)) {
-		up(sem);
-		return NULL;
-	}
-
-	return dd->port->commands[*tag].sg;
+	mtip_issue_ncq_command(port, rq->tag);
 }
 
 /*
@@ -3117,6 +2792,7 @@ static int mtip_free_orphan(struct driver_data *dd)
 		if (dd->queue) {
 			dd->queue->queuedata = NULL;
 			blk_cleanup_queue(dd->queue);
+			blk_mq_free_tag_set(&dd->tags);
 			dd->queue = NULL;
 		}
 	}
@@ -3369,7 +3045,6 @@ st_out:
  */
 static void mtip_dma_free(struct driver_data *dd)
 {
-	int i;
 	struct mtip_port *port = dd->port;
 
 	if (port->block1)
@@ -3380,13 +3055,6 @@ static void mtip_dma_free(struct driver_data *dd)
 		dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
 				port->command_list, port->command_list_dma);
 	}
-
-	for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
-		if (port->commands[i].command)
-			dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
-				port->commands[i].command,
-				port->commands[i].command_dma);
-	}
 }
 
 /*
@@ -3400,8 +3068,6 @@ static void mtip_dma_free(struct driver_data *dd)
 static int mtip_dma_alloc(struct driver_data *dd)
 {
 	struct mtip_port *port = dd->port;
-	int i, rv = 0;
-	u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
 
 	/* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
 	port->block1 =
@@ -3434,43 +3100,65 @@ static int mtip_dma_alloc(struct driver_data *dd)
 	port->smart_buf     = port->block1 + AHCI_SMARTBUF_OFFSET;
 	port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET;
 
-	/* Setup per command SGL DMA region */
-
-	/* Point the command headers at the command tables */
-	for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
-		port->commands[i].command =
-			dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
-				&port->commands[i].command_dma, GFP_KERNEL);
-		if (!port->commands[i].command) {
-			rv = -ENOMEM;
-			mtip_dma_free(dd);
-			return rv;
-		}
-		memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ);
-
-		port->commands[i].command_header = port->command_list +
-					(sizeof(struct mtip_cmd_hdr) * i);
-		port->commands[i].command_header_dma =
-					dd->port->command_list_dma +
-					(sizeof(struct mtip_cmd_hdr) * i);
-
-		if (host_cap_64)
-			port->commands[i].command_header->ctbau =
-				__force_bit2int cpu_to_le32(
-				(port->commands[i].command_dma >> 16) >> 16);
-
-		port->commands[i].command_header->ctba =
-				__force_bit2int cpu_to_le32(
-				port->commands[i].command_dma & 0xFFFFFFFF);
-
-		sg_init_table(port->commands[i].sg, MTIP_MAX_SG);
-
-		/* Mark command as currently inactive */
-		atomic_set(&dd->port->commands[i].active, 0);
-	}
 	return 0;
 }
 
+static int mtip_hw_get_identify(struct driver_data *dd)
+{
+	struct smart_attr attr242;
+	unsigned char *buf;
+	int rv;
+
+	if (mtip_get_identify(dd->port, NULL) < 0)
+		return -EFAULT;
+
+	if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+		MTIP_FTL_REBUILD_MAGIC) {
+		set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
+		return MTIP_FTL_REBUILD_MAGIC;
+	}
+	mtip_dump_identify(dd->port);
+
+	/* check write protect, over temp and rebuild statuses */
+	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+				dd->port->log_buf,
+				dd->port->log_buf_dma, 1);
+	if (rv) {
+		dev_warn(&dd->pdev->dev,
+			"Error in READ LOG EXT (10h) command\n");
+		/* non-critical error, don't fail the load */
+	} else {
+		buf = (unsigned char *)dd->port->log_buf;
+		if (buf[259] & 0x1) {
+			dev_info(&dd->pdev->dev,
+				"Write protect bit is set.\n");
+			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xF7) {
+			dev_info(&dd->pdev->dev,
+				"Exceeded Tmax, drive in thermal shutdown.\n");
+			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xBF) {
+			dev_info(&dd->pdev->dev,
+				"Drive indicates rebuild has failed.\n");
+			/* TODO */
+		}
+	}
+
+	/* get write protect progess */
+	memset(&attr242, 0, sizeof(struct smart_attr));
+	if (mtip_get_smart_attr(dd->port, 242, &attr242))
+		dev_warn(&dd->pdev->dev,
+				"Unable to check write protect progress\n");
+	else
+		dev_info(&dd->pdev->dev,
+				"Write protect progress: %u%% (%u blocks)\n",
+				attr242.cur, le32_to_cpu(attr242.data));
+
+	return rv;
+}
+
 /*
  * Called once for each card.
  *
@@ -3485,8 +3173,6 @@ static int mtip_hw_init(struct driver_data *dd)
 	int rv;
 	unsigned int num_command_slots;
 	unsigned long timeout, timetaken;
-	unsigned char *buf;
-	struct smart_attr attr242;
 
 	dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
 
@@ -3517,8 +3203,6 @@ static int mtip_hw_init(struct driver_data *dd)
 	else
 		dd->unal_qdepth = 0;
 
-	/* Counting semaphore to track command slot usage */
-	sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth);
 	sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth);
 
 	/* Spinlock to prevent concurrent issue */
@@ -3603,73 +3287,16 @@ static int mtip_hw_init(struct driver_data *dd)
 	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
 					dd->mmio + HOST_CTL);
 
-	init_timer(&dd->port->cmd_timer);
 	init_waitqueue_head(&dd->port->svc_wait);
 
-	dd->port->cmd_timer.data = (unsigned long int) dd->port;
-	dd->port->cmd_timer.function = mtip_timeout_function;
-	mod_timer(&dd->port->cmd_timer,
-		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
-
-
 	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
 		rv = -EFAULT;
 		goto out3;
 	}
 
-	if (mtip_get_identify(dd->port, NULL) < 0) {
-		rv = -EFAULT;
-		goto out3;
-	}
-	mtip_dump_identify(dd->port);
-
-	if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
-		MTIP_FTL_REBUILD_MAGIC) {
-		set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
-		return MTIP_FTL_REBUILD_MAGIC;
-	}
-
-	/* check write protect, over temp and rebuild statuses */
-	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
-				dd->port->log_buf,
-				dd->port->log_buf_dma, 1);
-	if (rv) {
-		dev_warn(&dd->pdev->dev,
-			"Error in READ LOG EXT (10h) command\n");
-		/* non-critical error, don't fail the load */
-	} else {
-		buf = (unsigned char *)dd->port->log_buf;
-		if (buf[259] & 0x1) {
-			dev_info(&dd->pdev->dev,
-				"Write protect bit is set.\n");
-			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
-		}
-		if (buf[288] == 0xF7) {
-			dev_info(&dd->pdev->dev,
-				"Exceeded Tmax, drive in thermal shutdown.\n");
-			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
-		}
-		if (buf[288] == 0xBF) {
-			dev_info(&dd->pdev->dev,
-				"Drive is in security locked state.\n");
-			set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
-		}
-	}
-
-	/* get write protect progess */
-	memset(&attr242, 0, sizeof(struct smart_attr));
-	if (mtip_get_smart_attr(dd->port, 242, &attr242))
-		dev_warn(&dd->pdev->dev,
-				"Unable to check write protect progress\n");
-	else
-		dev_info(&dd->pdev->dev,
-				"Write protect progress: %u%% (%u blocks)\n",
-				attr242.cur, le32_to_cpu(attr242.data));
 	return rv;
 
 out3:
-	del_timer_sync(&dd->port->cmd_timer);
-
 	/* Disable interrupts on the HBA. */
 	writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
 			dd->mmio + HOST_CTL);
@@ -3689,6 +3316,22 @@ out1:
 	return rv;
 }
 
+static void mtip_standby_drive(struct driver_data *dd)
+{
+	if (dd->sr)
+		return;
+
+	/*
+	 * Send standby immediate (E0h) to the drive so that it
+	 * saves its state.
+	 */
+	if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) &&
+	    !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
+		if (mtip_standby_immediate(dd->port))
+			dev_warn(&dd->pdev->dev,
+				"STANDBY IMMEDIATE failed\n");
+}
+
 /*
  * Called to deinitialize an interface.
  *
@@ -3704,12 +3347,6 @@ static int mtip_hw_exit(struct driver_data *dd)
 	 * saves its state.
 	 */
 	if (!dd->sr) {
-		if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) &&
-		    !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
-			if (mtip_standby_immediate(dd->port))
-				dev_warn(&dd->pdev->dev,
-					"STANDBY IMMEDIATE failed\n");
-
 		/* de-initialize the port. */
 		mtip_deinit_port(dd->port);
 
@@ -3718,8 +3355,6 @@ static int mtip_hw_exit(struct driver_data *dd)
 				dd->mmio + HOST_CTL);
 	}
 
-	del_timer_sync(&dd->port->cmd_timer);
-
 	/* Release the IRQ. */
 	irq_set_affinity_hint(dd->pdev->irq, NULL);
 	devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
@@ -4036,100 +3671,140 @@ static const struct block_device_operations mtip_block_ops = {
  *
  * @queue Pointer to the request queue. Unused other than to obtain
  *              the driver data structure.
- * @bio   Pointer to the BIO.
+ * @rq    Pointer to the request.
  *
  */
-static void mtip_make_request(struct request_queue *queue, struct bio *bio)
+static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-	struct driver_data *dd = queue->queuedata;
-	struct scatterlist *sg;
-	struct bio_vec bvec;
-	struct bvec_iter iter;
-	int nents = 0;
-	int tag = 0, unaligned = 0;
+	struct driver_data *dd = hctx->queue->queuedata;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	unsigned int nents;
 
 	if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
 		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
 							&dd->dd_flag))) {
-			bio_endio(bio, -ENXIO);
-			return;
+			return -ENXIO;
 		}
 		if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
-			bio_endio(bio, -ENODATA);
-			return;
+			return -ENODATA;
 		}
 		if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
 							&dd->dd_flag) &&
-				bio_data_dir(bio))) {
-			bio_endio(bio, -ENODATA);
-			return;
-		}
-		if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) {
-			bio_endio(bio, -ENODATA);
-			return;
-		}
-		if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
-			bio_endio(bio, -ENXIO);
-			return;
+				rq_data_dir(rq))) {
+			return -ENODATA;
 		}
+		if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)))
+			return -ENODATA;
+		if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
+			return -ENXIO;
 	}
 
-	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
-		bio_endio(bio, mtip_send_trim(dd, bio->bi_iter.bi_sector,
-						bio_sectors(bio)));
-		return;
+	if (rq->cmd_flags & REQ_DISCARD) {
+		int err;
+
+		err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
+		blk_mq_end_io(rq, err);
+		return 0;
 	}
 
-	if (unlikely(!bio_has_data(bio))) {
-		blk_queue_flush(queue, 0);
-		bio_endio(bio, 0);
-		return;
-	}
+	/* Create the scatter list for this request. */
+	nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg);
 
-	if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 &&
-							dd->unal_qdepth) {
-		if (bio->bi_iter.bi_sector % 8 != 0)
-			/* Unaligned on 4k boundaries */
-			unaligned = 1;
-		else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */
-			unaligned = 1;
-	}
-
-	sg = mtip_hw_get_scatterlist(dd, &tag, unaligned);
-	if (likely(sg != NULL)) {
-		blk_queue_bounce(queue, &bio);
-
-		if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) {
-			dev_warn(&dd->pdev->dev,
-				"Maximum number of SGL entries exceeded\n");
-			bio_io_error(bio);
-			mtip_hw_release_scatterlist(dd, tag, unaligned);
-			return;
-		}
-
-		/* Create the scatter list for this bio. */
-		bio_for_each_segment(bvec, bio, iter) {
-			sg_set_page(&sg[nents],
-					bvec.bv_page,
-					bvec.bv_len,
-					bvec.bv_offset);
-			nents++;
-		}
-
-		/* Issue the read/write. */
-		mtip_hw_submit_io(dd,
-				bio->bi_iter.bi_sector,
-				bio_sectors(bio),
-				nents,
-				tag,
-				bio_endio,
-				bio,
-				bio_data_dir(bio),
-				unaligned);
-	} else
-		bio_io_error(bio);
+	/* Issue the read/write. */
+	mtip_hw_submit_io(dd, rq, cmd, nents, hctx);
+	return 0;
 }
 
+static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
+				  struct request *rq)
+{
+	struct driver_data *dd = hctx->queue->queuedata;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	if (!dd->unal_qdepth || rq_data_dir(rq) == READ)
+		return false;
+
+	/*
+	 * If unaligned depth must be limited on this controller, mark it
+	 * as unaligned if the IO isn't on a 4k boundary (start of length).
+	 */
+	if (blk_rq_sectors(rq) <= 64) {
+		if ((blk_rq_pos(rq) & 7) || (blk_rq_sectors(rq) & 7))
+			cmd->unaligned = 1;
+	}
+
+	if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal))
+		return true;
+
+	return false;
+}
+
+static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	int ret;
+
+	if (mtip_check_unal_depth(hctx, rq))
+		return BLK_MQ_RQ_QUEUE_BUSY;
+
+	ret = mtip_submit_request(hctx, rq);
+	if (!ret)
+		return BLK_MQ_RQ_QUEUE_OK;
+
+	rq->errors = ret;
+	return BLK_MQ_RQ_QUEUE_ERROR;
+}
+
+static void mtip_free_cmd(void *data, struct request *rq,
+			  unsigned int hctx_idx, unsigned int request_idx)
+{
+	struct driver_data *dd = data;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	if (!cmd->command)
+		return;
+
+	dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+				cmd->command, cmd->command_dma);
+}
+
+static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
+			 unsigned int request_idx, unsigned int numa_node)
+{
+	struct driver_data *dd = data;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
+
+	cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+			&cmd->command_dma, GFP_KERNEL);
+	if (!cmd->command)
+		return -ENOMEM;
+
+	memset(cmd->command, 0, CMD_DMA_ALLOC_SZ);
+
+	/* Point the command headers at the command tables. */
+	cmd->command_header = dd->port->command_list +
+				(sizeof(struct mtip_cmd_hdr) * request_idx);
+	cmd->command_header_dma = dd->port->command_list_dma +
+				(sizeof(struct mtip_cmd_hdr) * request_idx);
+
+	if (host_cap_64)
+		cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
+
+	cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+
+	sg_init_table(cmd->sg, MTIP_MAX_SG);
+	return 0;
+}
+
+static struct blk_mq_ops mtip_mq_ops = {
+	.queue_rq	= mtip_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
+	.free_hctx	= blk_mq_free_single_hw_queue,
+	.init_request	= mtip_init_cmd,
+	.exit_request	= mtip_free_cmd,
+};
+
 /*
  * Block layer initialization function.
  *
@@ -4152,11 +3827,7 @@ static int mtip_block_initialize(struct driver_data *dd)
 	if (dd->disk)
 		goto skip_create_disk; /* hw init done, before rebuild */
 
-	/* Initialize the protocol layer. */
-	wait_for_rebuild = mtip_hw_init(dd);
-	if (wait_for_rebuild < 0) {
-		dev_err(&dd->pdev->dev,
-			"Protocol layer initialization failed\n");
+	if (mtip_hw_init(dd)) {
 		rv = -EINVAL;
 		goto protocol_init_error;
 	}
@@ -4198,16 +3869,27 @@ static int mtip_block_initialize(struct driver_data *dd)
 
 	mtip_hw_debugfs_init(dd);
 
-	/*
-	 * if rebuild pending, start the service thread, and delay the block
-	 * queue creation and add_disk()
-	 */
-	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
-		goto start_service_thread;
-
 skip_create_disk:
+	memset(&dd->tags, 0, sizeof(dd->tags));
+	dd->tags.ops = &mtip_mq_ops;
+	dd->tags.nr_hw_queues = 1;
+	dd->tags.queue_depth = MTIP_MAX_COMMAND_SLOTS;
+	dd->tags.reserved_tags = 1;
+	dd->tags.cmd_size = sizeof(struct mtip_cmd);
+	dd->tags.numa_node = dd->numa_node;
+	dd->tags.flags = BLK_MQ_F_SHOULD_MERGE;
+	dd->tags.driver_data = dd;
+
+	rv = blk_mq_alloc_tag_set(&dd->tags);
+	if (rv) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate request queue\n");
+		rv = -ENOMEM;
+		goto block_queue_alloc_init_error;
+	}
+
 	/* Allocate the request queue. */
-	dd->queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node);
+	dd->queue = blk_mq_init_queue(&dd->tags);
 	if (dd->queue == NULL) {
 		dev_err(&dd->pdev->dev,
 			"Unable to allocate request queue\n");
@@ -4215,12 +3897,25 @@ skip_create_disk:
 		goto block_queue_alloc_init_error;
 	}
 
-	/* Attach our request function to the request queue. */
-	blk_queue_make_request(dd->queue, mtip_make_request);
-
 	dd->disk->queue		= dd->queue;
 	dd->queue->queuedata	= dd;
 
+	/* Initialize the protocol layer. */
+	wait_for_rebuild = mtip_hw_get_identify(dd);
+	if (wait_for_rebuild < 0) {
+		dev_err(&dd->pdev->dev,
+			"Protocol layer initialization failed\n");
+		rv = -EINVAL;
+		goto init_hw_cmds_error;
+	}
+
+	/*
+	 * if rebuild pending, start the service thread, and delay the block
+	 * queue creation and add_disk()
+	 */
+	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+		goto start_service_thread;
+
 	/* Set device limits. */
 	set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
 	blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
@@ -4299,8 +3994,9 @@ kthread_run_error:
 	del_gendisk(dd->disk);
 
 read_capacity_error:
+init_hw_cmds_error:
 	blk_cleanup_queue(dd->queue);
-
+	blk_mq_free_tag_set(&dd->tags);
 block_queue_alloc_init_error:
 	mtip_hw_debugfs_exit(dd);
 disk_index_error:
@@ -4349,6 +4045,9 @@ static int mtip_block_remove(struct driver_data *dd)
 				kobject_put(kobj);
 			}
 		}
+
+		mtip_standby_drive(dd);
+
 		/*
 		 * Delete our gendisk structure. This also removes the device
 		 * from /dev
@@ -4361,6 +4060,7 @@ static int mtip_block_remove(struct driver_data *dd)
 			if (dd->disk->queue) {
 				del_gendisk(dd->disk);
 				blk_cleanup_queue(dd->queue);
+				blk_mq_free_tag_set(&dd->tags);
 				dd->queue = NULL;
 			} else
 				put_disk(dd->disk);
@@ -4395,6 +4095,8 @@ static int mtip_block_remove(struct driver_data *dd)
  */
 static int mtip_block_shutdown(struct driver_data *dd)
 {
+	mtip_hw_shutdown(dd);
+
 	/* Delete our gendisk structure, and cleanup the blk queue. */
 	if (dd->disk) {
 		dev_info(&dd->pdev->dev,
@@ -4403,6 +4105,7 @@ static int mtip_block_shutdown(struct driver_data *dd)
 		if (dd->disk->queue) {
 			del_gendisk(dd->disk);
 			blk_cleanup_queue(dd->queue);
+			blk_mq_free_tag_set(&dd->tags);
 		} else
 			put_disk(dd->disk);
 		dd->disk  = NULL;
@@ -4412,8 +4115,6 @@ static int mtip_block_shutdown(struct driver_data *dd)
 	spin_lock(&rssd_index_lock);
 	ida_remove(&rssd_index_ida, dd->index);
 	spin_unlock(&rssd_index_lock);
-
-	mtip_hw_shutdown(dd);
 	return 0;
 }
 
@@ -4767,8 +4468,6 @@ static void mtip_pci_remove(struct pci_dev *pdev)
 		dev_warn(&dd->pdev->dev,
 			"Completion workers still active!\n");
 	}
-	/* Cleanup the outstanding commands */
-	mtip_command_cleanup(dd);
 
 	/* Clean up the block layer. */
 	mtip_block_remove(dd);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index ffb955e7ccb9..982a88fe1ab2 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -331,12 +331,8 @@ struct mtip_cmd {
 	 */
 	void (*comp_func)(struct mtip_port *port,
 				int tag,
-				void *data,
+				struct mtip_cmd *cmd,
 				int status);
-	/* Additional callback function that may be called by comp_func() */
-	void (*async_callback)(void *data, int status);
-
-	void *async_data; /* Addl. data passed to async_callback() */
 
 	int scatter_ents; /* Number of scatter list entries used */
 
@@ -347,10 +343,6 @@ struct mtip_cmd {
 	int retries; /* The number of retries left for this command. */
 
 	int direction; /* Data transfer direction */
-
-	unsigned long comp_time; /* command completion time, in jiffies */
-
-	atomic_t active; /* declares if this command sent to the drive. */
 };
 
 /* Structure used to describe a port. */
@@ -436,12 +428,6 @@ struct mtip_port {
 	 * or error handling is active
 	 */
 	unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
-	/*
-	 * Array of command slots. Structure includes pointers to the
-	 * command header and command table, and completion function and data
-	 * pointers.
-	 */
-	struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS];
 	/* Used by mtip_service_thread to wait for an event */
 	wait_queue_head_t svc_wait;
 	/*
@@ -452,13 +438,7 @@ struct mtip_port {
 	/*
 	 * Timer used to complete commands that have been active for too long.
 	 */
-	struct timer_list cmd_timer;
 	unsigned long ic_pause_timer;
-	/*
-	 * Semaphore used to block threads if there are no
-	 * command slots available.
-	 */
-	struct semaphore cmd_slot;
 
 	/* Semaphore to control queue depth of unaligned IOs */
 	struct semaphore cmd_slot_unal;
@@ -485,6 +465,8 @@ struct driver_data {
 
 	struct request_queue *queue; /* Our request queue. */
 
+	struct blk_mq_tag_set tags; /* blk_mq tags */
+
 	struct mtip_port *port; /* Pointer to the port data structure. */
 
 	unsigned product_type; /* magic value declaring the product type */

From a8a642ccd2e86248ae23c5d762dac67eaea02423 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 14 May 2014 15:54:18 +0300
Subject: [PATCH 45/54] mtip32xx: blk_mq_init_queue() returns an ERR_PTR

We changed this from blk_alloc_queue_node() to blk_mq_init_queue() so
the check needs to be updated as well.

Fixes: ffc771b3ca8b2 ('mtip32xx: convert to use blk-mq')
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 3a0882ee1642..5979ab3afd81 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3890,7 +3890,7 @@ skip_create_disk:
 
 	/* Allocate the request queue. */
 	dd->queue = blk_mq_init_queue(&dd->tags);
-	if (dd->queue == NULL) {
+	if (IS_ERR(dd->queue)) {
 		dev_err(&dd->pdev->dev,
 			"Unable to allocate request queue\n");
 		rv = -ENOMEM;

From 9acf03cfb1fbecc058d3f223323e3ed97763f1e6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 14 May 2014 08:22:56 -0600
Subject: [PATCH 46/54] mtip32xx: stop block hardware queues before quiescing
 IO

We need to stop the block layer queues to prevent new "normal"
IO from entering the driver, while we wait for existing commands
to finish.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 5979ab3afd81..4efc676aa5f4 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -1031,6 +1031,8 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
 	unsigned int n;
 	unsigned int active = 1;
 
+	blk_mq_stop_hw_queues(port->dd->queue);
+
 	to = jiffies + msecs_to_jiffies(timeout);
 	do {
 		if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) &&
@@ -1039,7 +1041,7 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
 			continue; /* svc thd is actively issuing commands */
 		}
 		if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
-			return -EFAULT;
+			goto err_fault;
 		/*
 		 * Ignore s_active bit 0 of array element 0.
 		 * This bit will always be set
@@ -1054,7 +1056,11 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
 		msleep(20);
 	} while (time_before(jiffies, to));
 
+	blk_mq_start_stopped_hw_queues(port->dd->queue, true);
 	return active ? -EBUSY : 0;
+err_fault:
+	blk_mq_start_stopped_hw_queues(port->dd->queue, true);
+	return -EFAULT;
 }
 
 /*

From 0c29e93eae8a7f703e463c7b38ebc85d8718cae2 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Fri, 16 May 2014 23:31:21 +0800
Subject: [PATCH 47/54] virtio_blk: fix race between start and stop queue

When there isn't enough vring descriptor for adding to vq,
blk-mq will be put as stopped state until some of pending
descriptors are completed & freed.

Unfortunately, the vq's interrupt may come just before
blk-mq's BLK_MQ_S_STOPPED flag is set, so the blk-mq will
still be kept as stopped even though lots of descriptors
are completed and freed in the interrupt handler. The worst
case is that all pending descriptors are freed in the
interrupt handler, and the queue is kept as stopped forever.

This patch fixes the problem by starting/stopping blk-mq
with holding vq_lock.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/virtio_blk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 7a51f065edcd..9f340fafca5c 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -147,11 +147,11 @@ static void virtblk_done(struct virtqueue *vq)
 		if (unlikely(virtqueue_is_broken(vq)))
 			break;
 	} while (!virtqueue_enable_cb(vq));
-	spin_unlock_irqrestore(&vblk->vq_lock, flags);
 
 	/* In case queue is stopped waiting for more buffers. */
 	if (req_done)
 		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+	spin_unlock_irqrestore(&vblk->vq_lock, flags);
 }
 
 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
@@ -205,8 +205,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 	err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num);
 	if (err) {
 		virtqueue_kick(vblk->vq);
-		spin_unlock_irqrestore(&vblk->vq_lock, flags);
 		blk_mq_stop_hw_queue(hctx);
+		spin_unlock_irqrestore(&vblk->vq_lock, flags);
 		/* Out of mem doesn't actually happen, since we fall back
 		 * to direct descriptors */
 		if (err == -ENOMEM || err == -ENOSPC)

From 9b204fbf0987748ec6cc4a3cde0064ecf42accd0 Mon Sep 17 00:00:00 2001
From: Asai Thambi S P <asamymuthupa@micron.com>
Date: Tue, 20 May 2014 10:48:56 -0700
Subject: [PATCH 48/54] mtip32xx: move error handling to service thread

Move error handling to service thread, and use mtip_set_timeout()
to set timeouts for HDIO_DRIVE_TASK and HDIO_DRIVE_CMD IOCTL commands.

Signed-off-by: Selvan Mani <smani@micron.com>
Signed-off-by: Asai Thambi S P <asamymuthupa@micron.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 71 +++++++++++++++++++++----------
 drivers/block/mtip32xx/mtip32xx.h |  8 ++--
 2 files changed, 53 insertions(+), 26 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 4efc676aa5f4..ae331ab4a451 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -620,7 +620,6 @@ static void mtip_handle_tfe(struct driver_data *dd)
 
 	port = dd->port;
 
-	/* Stop the timer to prevent command timeouts. */
 	set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
 
 	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
@@ -855,8 +854,6 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
  */
 static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
 {
-	if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR)))
-		mtip_handle_tfe(dd);
 
 	if (unlikely(port_stat & PORT_IRQ_CONNECT)) {
 		dev_warn(&dd->pdev->dev,
@@ -874,6 +871,12 @@ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
 		dev_warn(&dd->pdev->dev,
 			"Port stat errors %x unhandled\n",
 			(port_stat & ~PORT_IRQ_HANDLED));
+		if (mtip_check_surprise_removal(dd->pdev))
+			return;
+	}
+	if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) {
+		set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags);
+		wake_up_interruptible(&dd->port->svc_wait);
 	}
 }
 
@@ -1040,8 +1043,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
 			msleep(20);
 			continue; /* svc thd is actively issuing commands */
 		}
+
+		msleep(100);
+		if (mtip_check_surprise_removal(port->dd->pdev))
+			goto err_fault;
 		if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
 			goto err_fault;
+
 		/*
 		 * Ignore s_active bit 0 of array element 0.
 		 * This bit will always be set
@@ -1052,8 +1060,6 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
 
 		if (!active)
 			break;
-
-		msleep(20);
 	} while (time_before(jiffies, to));
 
 	blk_mq_start_stopped_hw_queues(port->dd->queue, true);
@@ -1113,7 +1119,8 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 	if (atomic == GFP_KERNEL) {
 		if (fis->command != ATA_CMD_STANDBYNOW1) {
 			/* wait for io to complete if non atomic */
-			if (mtip_quiesce_io(port, 5000) < 0) {
+			if (mtip_quiesce_io(port,
+					MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) {
 				dev_warn(&dd->pdev->dev,
 					"Failed to quiesce IO\n");
 				mtip_put_int_command(dd, int_cmd);
@@ -1161,9 +1168,9 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 
 	if (atomic == GFP_KERNEL) {
 		/* Wait for the command to complete or timeout. */
-		if (wait_for_completion_interruptible_timeout(
+		if ((rv = wait_for_completion_interruptible_timeout(
 				&wait,
-				msecs_to_jiffies(timeout)) <= 0) {
+				msecs_to_jiffies(timeout))) <= 0) {
 			if (rv == -ERESTARTSYS) { /* interrupted */
 				dev_err(&dd->pdev->dev,
 					"Internal command [%02X] was interrupted after %lu ms\n",
@@ -1299,7 +1306,7 @@ static void mtip_set_timeout(struct driver_data *dd,
 		*timeout = 15000;  /* 15 seconds */
 		break;
 	default:
-		*timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+		*timeout = MTIP_IOCTL_CMD_TIMEOUT_MS;
 		break;
 	}
 }
@@ -1351,7 +1358,7 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
 				sizeof(u16) * ATA_ID_WORDS,
 				0,
 				GFP_KERNEL,
-				MTIP_INTERNAL_COMMAND_TIMEOUT_MS)
+				MTIP_INT_CMD_TIMEOUT_MS)
 				< 0) {
 		rv = -1;
 		goto out;
@@ -1483,7 +1490,7 @@ static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
 					sectors * ATA_SECT_SIZE,
 					0,
 					GFP_ATOMIC,
-					MTIP_INTERNAL_COMMAND_TIMEOUT_MS);
+					MTIP_INT_CMD_TIMEOUT_MS);
 }
 
 /*
@@ -1776,6 +1783,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command)
 {
 	struct host_to_dev_fis	fis;
 	struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+	unsigned int to;
 
 	/* Build the FIS. */
 	memset(&fis, 0, sizeof(struct host_to_dev_fis));
@@ -1789,6 +1797,8 @@ static int exec_drive_task(struct mtip_port *port, u8 *command)
 	fis.cyl_hi	= command[5];
 	fis.device	= command[6] & ~0x10; /* Clear the dev bit*/
 
+	mtip_set_timeout(port->dd, &fis, &to, 0);
+
 	dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n",
 		__func__,
 		command[0],
@@ -1807,7 +1817,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command)
 				 0,
 				 0,
 				 GFP_KERNEL,
-				 MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) {
+				 to) < 0) {
 		return -1;
 	}
 
@@ -1847,6 +1857,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
 	u8 *buf = NULL;
 	dma_addr_t dma_addr = 0;
 	int rv = 0, xfer_sz = command[3];
+	unsigned int to;
 
 	if (xfer_sz) {
 		if (!user_buffer)
@@ -1878,6 +1889,8 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
 		fis.cyl_hi	= 0xC2;
 	}
 
+	mtip_set_timeout(port->dd, &fis, &to, 0);
+
 	if (xfer_sz)
 		reply = (port->rxfis + RX_FIS_PIO_SETUP);
 	else
@@ -1900,7 +1913,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
 				 (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0),
 				 0,
 				 GFP_KERNEL,
-				 MTIP_IOCTL_COMMAND_TIMEOUT_MS)
+				 to)
 				 < 0) {
 		rv = -EFAULT;
 		goto exit_drive_command;
@@ -2956,6 +2969,11 @@ static int mtip_service_thread(void *data)
 	int ret;
 
 	while (1) {
+		if (kthread_should_stop() ||
+			test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+			goto st_out;
+		clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
 		/*
 		 * the condition is to check neither an internal command is
 		 * is in progress nor error handling is active
@@ -2963,11 +2981,12 @@ static int mtip_service_thread(void *data)
 		wait_event_interruptible(port->svc_wait, (port->flags) &&
 			!(port->flags & MTIP_PF_PAUSE_IO));
 
-		if (kthread_should_stop())
-			goto st_out;
-
 		set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
 
+		if (kthread_should_stop() ||
+			test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+			goto st_out;
+
 		/* If I am an orphan, start self cleanup */
 		if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags))
 			break;
@@ -2976,6 +2995,16 @@ static int mtip_service_thread(void *data)
 				&dd->dd_flag)))
 			goto st_out;
 
+restart_eh:
+		/* Demux bits: start with error handling */
+		if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) {
+			mtip_handle_tfe(dd);
+			clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+		}
+
+		if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags))
+			goto restart_eh;
+
 		if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
 			slot = 1;
 			/* used to restrict the loop to one iteration */
@@ -3005,16 +3034,14 @@ static int mtip_service_thread(void *data)
 			}
 
 			clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
-		} else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
+		}
+
+		if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
 			if (mtip_ftl_rebuild_poll(dd) < 0)
 				set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
 							&dd->dd_flag);
 			clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
 		}
-		clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
-
-		if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
-			goto st_out;
 	}
 
 	/* wait for pci remove to exit */
@@ -4499,8 +4526,6 @@ static void mtip_pci_remove(struct pci_dev *pdev)
 
 	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
 	pci_set_drvdata(pdev, NULL);
-	pci_dev_put(pdev);
-
 }
 
 /*
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 982a88fe1ab2..4b9b554234bc 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -40,9 +40,11 @@
 #define MTIP_MAX_RETRIES	2
 
 /* Various timeout values in ms */
-#define MTIP_NCQ_COMMAND_TIMEOUT_MS       5000
-#define MTIP_IOCTL_COMMAND_TIMEOUT_MS     5000
-#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS  5000
+#define MTIP_NCQ_CMD_TIMEOUT_MS      15000
+#define MTIP_IOCTL_CMD_TIMEOUT_MS    5000
+#define MTIP_INT_CMD_TIMEOUT_MS      5000
+#define MTIP_QUIESCE_IO_TIMEOUT_MS   (MTIP_NCQ_CMD_TIMEOUT_MS * \
+				     (MTIP_MAX_RETRIES + 1))
 
 /* check for timeouts every 500ms */
 #define MTIP_TIMEOUT_CHECK_PERIOD	500

From 6314a108ec19aefa5160535b2bfe1ca9c38efe37 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Wed, 28 May 2014 11:55:23 +0200
Subject: [PATCH 49/54] floppy: do not corrupt bio.bi_flags when reading block
 0

Commit 41a55b4de39 ("floppy: silence warning during disk test") caused
bio.bi_flags being overwritten, and its initialization to BIO_UPTODATE
in bio_init() to be lost.

This was unnoticed until 7b7b68bba5 ("floppy: bail out in open() if
drive is not responding to block0 read"), because the error value wasn't
checked for in the bio completion callback.

Now we are actually looking at the error, and the loss of BIO_UPTODATE
causes EIO to be wrongly passed to the callback, which confuses the
FD_OPEN_SHOULD_FAIL_BIT logic.

Fix this by not destroying previous value of bi_flags when setting
BIO_QUIET.

Cc: Stephen Hemminger <shemminger@vyatta.com>
Reported-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/block/floppy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 5f69c910c3ac..8e767bb7995e 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3809,7 +3809,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive)
 	bio.bi_iter.bi_size = size;
 	bio.bi_bdev = bdev;
 	bio.bi_iter.bi_sector = 0;
-	bio.bi_flags = (1 << BIO_QUIET);
+	bio.bi_flags |= (1 << BIO_QUIET);
 	bio.bi_private = &cbdata;
 	bio.bi_end_io = floppy_rb0_cb;
 

From 1c8cad6c1bbcacc07332539f2d7dee22521f9ae0 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Wed, 21 May 2014 16:32:40 +0200
Subject: [PATCH 50/54] xen-blkfront: remove type check from
 blkfront_setup_discard

In its initial implementation a check for "type" was added, but only phy
and file are handled. This breaks advertised discard support for other
type values such as qdisk.

Fix and simplify this function: If the backend advertises discard
support it is supposed to implement it properly, so enable
feature_discard unconditionally. If the backend advertises the need for
a certain granularity and alignment then propagate both properties to
the blocklayer. The discard-secure property is a boolean, update the code
to reflect that.

Signed-off-by: Olaf Hering <olaf@aepfle.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 40 +++++++++++++-----------------------
 1 file changed, 14 insertions(+), 26 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index efe1b4761735..25c11ad34184 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1635,36 +1635,24 @@ blkfront_closing(struct blkfront_info *info)
 static void blkfront_setup_discard(struct blkfront_info *info)
 {
 	int err;
-	char *type;
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
 	unsigned int discard_secure;
 
-	type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
-	if (IS_ERR(type))
-		return;
-
-	info->feature_secdiscard = 0;
-	if (strncmp(type, "phy", 3) == 0) {
-		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			"discard-granularity", "%u", &discard_granularity,
-			"discard-alignment", "%u", &discard_alignment,
-			NULL);
-		if (!err) {
-			info->feature_discard = 1;
-			info->discard_granularity = discard_granularity;
-			info->discard_alignment = discard_alignment;
-		}
-		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			    "discard-secure", "%d", &discard_secure,
-			    NULL);
-		if (!err)
-			info->feature_secdiscard = discard_secure;
-
-	} else if (strncmp(type, "file", 4) == 0)
-		info->feature_discard = 1;
-
-	kfree(type);
+	info->feature_discard = 1;
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+		"discard-granularity", "%u", &discard_granularity,
+		"discard-alignment", "%u", &discard_alignment,
+		NULL);
+	if (!err) {
+		info->discard_granularity = discard_granularity;
+		info->discard_alignment = discard_alignment;
+	}
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+		    "discard-secure", "%d", &discard_secure,
+		    NULL);
+	if (!err)
+		info->feature_secdiscard = !!discard_secure;
 }
 
 static int blkfront_setup_indirect(struct blkfront_info *info)

From c926b701fede80ecd6f68887d5485bb620758721 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Wed, 21 May 2014 16:32:42 +0200
Subject: [PATCH 51/54] xen/blkback: disable discard feature if requested by
 toolstack

Newer toolstacks may provide a boolean property "discard-enable" in the
backend node. Its purpose is to disable discard for file backed storage
to avoid fragmentation. Recognize this setting also for physical
storage.  If that property exists and is false, do not advertise
"feature-discard" to the frontend.

Signed-off-by: Olaf Hering <olaf@aepfle.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/xenbus.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 9a547e6b6ebf..a71ecf5f4283 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -481,10 +481,15 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info
 	struct xenbus_device *dev = be->dev;
 	struct xen_blkif *blkif = be->blkif;
 	int err;
-	int state = 0;
+	int state = 0, discard_enable;
 	struct block_device *bdev = be->blkif->vbd.bdev;
 	struct request_queue *q = bdev_get_queue(bdev);
 
+	err = xenbus_scanf(XBT_NIL, dev->nodename, "discard-enable", "%d",
+			   &discard_enable);
+	if (err == 1 && !discard_enable)
+		return;
+
 	if (blk_queue_discard(q)) {
 		err = xenbus_printf(xbt, dev->nodename,
 			"discard-granularity", "%u",

From 1c339ef74b431bab18bc11fb69e46db294070dd3 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Wed, 21 May 2014 16:32:41 +0200
Subject: [PATCH 52/54] xen blkif.h: fix comment typo in discard-alignment

Add the missing 'n' to discard-alignment

Signed-off-by: Olaf Hering <olaf@aepfle.de>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 include/xen/interface/io/blkif.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index 32ec05a6572f..c33e1c489eb2 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -86,7 +86,7 @@ typedef uint64_t blkif_sector_t;
  *     Interface%20manuals/100293068c.pdf
  * The backend can optionally provide three extra XenBus attributes to
  * further optimize the discard functionality:
- * 'discard-aligment' - Devices that support discard functionality may
+ * 'discard-alignment' - Devices that support discard functionality may
  * internally allocate space in units that are bigger than the exported
  * logical block size. The discard-alignment parameter indicates how many bytes
  * the beginning of the partition is offset from the internal allocation unit's

From 814d04e7dfc4a9cf7e36656afe2da5c0c08dde2b Mon Sep 17 00:00:00 2001
From: Valentin Priescu <priescuv@amazon.com>
Date: Tue, 20 May 2014 22:28:50 +0200
Subject: [PATCH 53/54] xen-blkback: defer freeing blkif to avoid blocking
 xenwatch

Currently xenwatch blocks in VBD disconnect, waiting for all pending I/O
requests to finish. If the VBD is attached to a hot-swappable disk, then
xenwatch can hang for a long period of time, stalling other watches.

 INFO: task xenwatch:39 blocked for more than 120 seconds.
 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 ffff880057f01bd0 0000000000000246 ffff880057f01ac0 ffffffff810b0782
 ffff880057f01ad0 00000000000131c0 0000000000000004 ffff880057edb040
 ffff8800344c6080 0000000000000000 ffff880058c00ba0 ffff880057edb040
 Call Trace:
 [<ffffffff810b0782>] ? irq_to_desc+0x12/0x20
 [<ffffffff8128f761>] ? list_del+0x11/0x40
 [<ffffffff8147a080>] ? wait_for_common+0x60/0x160
 [<ffffffff8147bcef>] ? _raw_spin_lock_irqsave+0x2f/0x50
 [<ffffffff8147bd49>] ? _raw_spin_unlock_irqrestore+0x19/0x20
 [<ffffffff8147a26a>] schedule+0x3a/0x60
 [<ffffffffa018fe6a>] xen_blkif_disconnect+0x8a/0x100 [xen_blkback]
 [<ffffffff81079f70>] ? wake_up_bit+0x40/0x40
 [<ffffffffa018ffce>] xen_blkbk_remove+0xae/0x1e0 [xen_blkback]
 [<ffffffff8130b254>] xenbus_dev_remove+0x44/0x90
 [<ffffffff81345cb7>] __device_release_driver+0x77/0xd0
 [<ffffffff81346488>] device_release_driver+0x28/0x40
 [<ffffffff813456e8>] bus_remove_device+0x78/0xe0
 [<ffffffff81342c9f>] device_del+0x12f/0x1a0
 [<ffffffff81342d2d>] device_unregister+0x1d/0x60
 [<ffffffffa0190826>] frontend_changed+0xa6/0x4d0 [xen_blkback]
 [<ffffffffa019c252>] ? frontend_changed+0x192/0x650 [xen_netback]
 [<ffffffff8130ae50>] ? cmp_dev+0x60/0x60
 [<ffffffff81344fe4>] ? bus_for_each_dev+0x94/0xa0
 [<ffffffff8130b06e>] xenbus_otherend_changed+0xbe/0x120
 [<ffffffff8130b4cb>] frontend_changed+0xb/0x10
 [<ffffffff81309c82>] xenwatch_thread+0xf2/0x130
 [<ffffffff81079f70>] ? wake_up_bit+0x40/0x40
 [<ffffffff81309b90>] ? xenbus_directory+0x80/0x80
 [<ffffffff810799d6>] kthread+0x96/0xa0
 [<ffffffff81485934>] kernel_thread_helper+0x4/0x10
 [<ffffffff814839f3>] ? int_ret_from_sys_call+0x7/0x1b
 [<ffffffff8147c17c>] ? retint_restore_args+0x5/0x6
 [<ffffffff81485930>] ? gs_change+0x13/0x13

With this patch, when there is still pending I/O, the actual disconnect
is done by the last reference holder (last pending I/O request). In this
case, xenwatch doesn't block indefinitely.

Signed-off-by: Valentin Priescu <priescuv@amazon.com>
Reviewed-by: Steven Kady <stevkady@amazon.com>
Reviewed-by: Steven Noonan <snoonan@amazon.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkback/common.h |  4 +--
 drivers/block/xen-blkback/xenbus.c | 46 ++++++++++++++++++++++--------
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index be052773ad03..f65b807e3236 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -314,7 +314,7 @@ struct xen_blkif {
 	unsigned long long			st_rd_sect;
 	unsigned long long			st_wr_sect;
 
-	wait_queue_head_t	waiting_to_free;
+	struct work_struct	free_work;
 	/* Thread shutdown wait queue. */
 	wait_queue_head_t	shutdown_wq;
 };
@@ -361,7 +361,7 @@ struct pending_req {
 #define xen_blkif_put(_b)				\
 	do {						\
 		if (atomic_dec_and_test(&(_b)->refcnt))	\
-			wake_up(&(_b)->waiting_to_free);\
+			schedule_work(&(_b)->free_work);\
 	} while (0)
 
 struct phys_req {
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index a71ecf5f4283..3a8b810b4980 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -35,12 +35,26 @@ static void connect(struct backend_info *);
 static int connect_ring(struct backend_info *);
 static void backend_changed(struct xenbus_watch *, const char **,
 			    unsigned int);
+static void xen_blkif_free(struct xen_blkif *blkif);
+static void xen_vbd_free(struct xen_vbd *vbd);
 
 struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be)
 {
 	return be->dev;
 }
 
+/*
+ * The last request could free the device from softirq context and
+ * xen_blkif_free() can sleep.
+ */
+static void xen_blkif_deferred_free(struct work_struct *work)
+{
+	struct xen_blkif *blkif;
+
+	blkif = container_of(work, struct xen_blkif, free_work);
+	xen_blkif_free(blkif);
+}
+
 static int blkback_name(struct xen_blkif *blkif, char *buf)
 {
 	char *devpath, *devname;
@@ -121,7 +135,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	init_completion(&blkif->drain_complete);
 	atomic_set(&blkif->drain, 0);
 	blkif->st_print = jiffies;
-	init_waitqueue_head(&blkif->waiting_to_free);
 	blkif->persistent_gnts.rb_node = NULL;
 	spin_lock_init(&blkif->free_pages_lock);
 	INIT_LIST_HEAD(&blkif->free_pages);
@@ -132,6 +145,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
 
 	INIT_LIST_HEAD(&blkif->pending_free);
+	INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
 
 	for (i = 0; i < XEN_BLKIF_REQS; i++) {
 		req = kzalloc(sizeof(*req), GFP_KERNEL);
@@ -231,7 +245,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
 	return 0;
 }
 
-static void xen_blkif_disconnect(struct xen_blkif *blkif)
+static int xen_blkif_disconnect(struct xen_blkif *blkif)
 {
 	if (blkif->xenblkd) {
 		kthread_stop(blkif->xenblkd);
@@ -239,9 +253,12 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
 		blkif->xenblkd = NULL;
 	}
 
-	atomic_dec(&blkif->refcnt);
-	wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
-	atomic_inc(&blkif->refcnt);
+	/* The above kthread_stop() guarantees that at this point we
+	 * don't have any discard_io or other_io requests. So, checking
+	 * for inflight IO is enough.
+	 */
+	if (atomic_read(&blkif->inflight) > 0)
+		return -EBUSY;
 
 	if (blkif->irq) {
 		unbind_from_irqhandler(blkif->irq, blkif);
@@ -252,6 +269,8 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
 		xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
 		blkif->blk_rings.common.sring = NULL;
 	}
+
+	return 0;
 }
 
 static void xen_blkif_free(struct xen_blkif *blkif)
@@ -259,8 +278,8 @@ static void xen_blkif_free(struct xen_blkif *blkif)
 	struct pending_req *req, *n;
 	int i = 0, j;
 
-	if (!atomic_dec_and_test(&blkif->refcnt))
-		BUG();
+	xen_blkif_disconnect(blkif);
+	xen_vbd_free(&blkif->vbd);
 
 	/* Remove all persistent grants and the cache of ballooned pages. */
 	xen_blkbk_free_caches(blkif);
@@ -449,16 +468,15 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
 		be->backend_watch.node = NULL;
 	}
 
+	dev_set_drvdata(&dev->dev, NULL);
+
 	if (be->blkif) {
 		xen_blkif_disconnect(be->blkif);
-		xen_vbd_free(&be->blkif->vbd);
-		xen_blkif_free(be->blkif);
-		be->blkif = NULL;
+		xen_blkif_put(be->blkif);
 	}
 
 	kfree(be->mode);
 	kfree(be);
-	dev_set_drvdata(&dev->dev, NULL);
 	return 0;
 }
 
@@ -705,7 +723,11 @@ static void frontend_changed(struct xenbus_device *dev,
 		 * Enforce precondition before potential leak point.
 		 * xen_blkif_disconnect() is idempotent.
 		 */
-		xen_blkif_disconnect(be->blkif);
+		err = xen_blkif_disconnect(be->blkif);
+		if (err) {
+			xenbus_dev_fatal(dev, err, "pending I/O");
+			break;
+		}
 
 		err = connect_ring(be);
 		if (err)

From e8edca6f7f92234202d6dd163c118ef495244d7c Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Fri, 30 May 2014 10:49:29 +0800
Subject: [PATCH 54/54] block: virtio_blk: don't hold spin lock during world
 switch

Firstly, it isn't necessary to hold lock of vblk->vq_lock
when notifying hypervisor about queued I/O.

Secondly, virtqueue_notify() will cause world switch and
it may take long time on some hypervisors(such as, qemu-arm),
so it isn't good to hold the lock and block other vCPUs.

On arm64 quad core VM(qemu-kvm), the patch can increase I/O
performance a lot with VIRTIO_RING_F_EVENT_IDX enabled:
	- without the patch: 14K IOPS
	- with the patch: 34K IOPS

fio script:
	[global]
	direct=1
	bsrange=4k-4k
	timeout=10
	numjobs=4
	ioengine=libaio
	iodepth=64

	filename=/dev/vdc
	group_reporting=1

	[f1]
	rw=randread

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: virtualization@lists.linux-foundation.org
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: stable@kernel.org # 3.13+
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/virtio_blk.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c8f286e8d80f..f63d358f3d93 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -162,6 +162,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 	unsigned int num;
 	const bool last = (req->cmd_flags & REQ_END) != 0;
 	int err;
+	bool notify = false;
 
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
@@ -214,10 +215,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 		return BLK_MQ_RQ_QUEUE_ERROR;
 	}
 
-	if (last)
-		virtqueue_kick(vblk->vq);
-
+	if (last && virtqueue_kick_prepare(vblk->vq))
+		notify = true;
 	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+
+	if (notify)
+		virtqueue_notify(vblk->vq);
 	return BLK_MQ_RQ_QUEUE_OK;
 }