From 7b427a59538a98161321aa46c13f4ea81b43f4eb Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Mon, 27 Jun 2016 16:33:24 +0800
Subject: [PATCH 1/3] xen-blkfront: save uncompleted reqs in blkfront_resume()

Uncompleted reqs used to be 'saved and resubmitted' in blkfront_recover() during
migration, but that's too late after multi-queue was introduced.

After a migrate to another host (which may not have multiqueue support), the
number of rings (block hardware queues) may be changed and the ring and shadow
structure will also be reallocated.

The blkfront_recover() then can't 'save and resubmit' the real
uncompleted reqs because shadow structure have been reallocated.

This patch fixes this issue by moving the 'save' logic out of
blkfront_recover() to earlier place in blkfront_resume().

The 'resubmit' is not changed and still in blkfront_recover().

Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: stable@vger.kernel.org
---
 drivers/block/xen-blkfront.c | 91 ++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 51 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2e6d1e9c3345..fcc5b4e0aef2 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -207,6 +207,9 @@ struct blkfront_info
 	struct blk_mq_tag_set tag_set;
 	struct blkfront_ring_info *rinfo;
 	unsigned int nr_rings;
+	/* Save uncomplete reqs and bios for migration. */
+	struct list_head requests;
+	struct bio_list bio_list;
 };
 
 static unsigned int nr_minors;
@@ -2002,69 +2005,22 @@ static int blkif_recover(struct blkfront_info *info)
 {
 	unsigned int i, r_index;
 	struct request *req, *n;
-	struct blk_shadow *copy;
 	int rc;
 	struct bio *bio, *cloned_bio;
-	struct bio_list bio_list, merge_bio;
 	unsigned int segs, offset;
 	int pending, size;
 	struct split_bio *split_bio;
-	struct list_head requests;
 
 	blkfront_gather_backend_features(info);
 	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
 	blk_queue_max_segments(info->rq, segs);
-	bio_list_init(&bio_list);
-	INIT_LIST_HEAD(&requests);
 
 	for (r_index = 0; r_index < info->nr_rings; r_index++) {
-		struct blkfront_ring_info *rinfo;
-
-		rinfo = &info->rinfo[r_index];
-		/* Stage 1: Make a safe copy of the shadow state. */
-		copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
-			       GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
-		if (!copy)
-			return -ENOMEM;
-
-		/* Stage 2: Set up free list. */
-		memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
-		for (i = 0; i < BLK_RING_SIZE(info); i++)
-			rinfo->shadow[i].req.u.rw.id = i+1;
-		rinfo->shadow_free = rinfo->ring.req_prod_pvt;
-		rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+		struct blkfront_ring_info *rinfo = &info->rinfo[r_index];
 
 		rc = blkfront_setup_indirect(rinfo);
-		if (rc) {
-			kfree(copy);
+		if (rc)
 			return rc;
-		}
-
-		for (i = 0; i < BLK_RING_SIZE(info); i++) {
-			/* Not in use? */
-			if (!copy[i].request)
-				continue;
-
-			/*
-			 * Get the bios in the request so we can re-queue them.
-			 */
-			if (copy[i].request->cmd_flags &
-			    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
-				/*
-				 * Flush operations don't contain bios, so
-				 * we need to requeue the whole request
-				 */
-				list_add(&copy[i].request->queuelist, &requests);
-				continue;
-			}
-			merge_bio.head = copy[i].request->bio;
-			merge_bio.tail = copy[i].request->biotail;
-			bio_list_merge(&bio_list, &merge_bio);
-			copy[i].request->bio = NULL;
-			blk_end_request_all(copy[i].request, 0);
-		}
-
-		kfree(copy);
 	}
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
@@ -2079,7 +2035,7 @@ static int blkif_recover(struct blkfront_info *info)
 		kick_pending_request_queues(rinfo);
 	}
 
-	list_for_each_entry_safe(req, n, &requests, queuelist) {
+	list_for_each_entry_safe(req, n, &info->requests, queuelist) {
 		/* Requeue pending requests (flush or discard) */
 		list_del_init(&req->queuelist);
 		BUG_ON(req->nr_phys_segments > segs);
@@ -2087,7 +2043,7 @@ static int blkif_recover(struct blkfront_info *info)
 	}
 	blk_mq_kick_requeue_list(info->rq);
 
-	while ((bio = bio_list_pop(&bio_list)) != NULL) {
+	while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
 		/* Traverse the list of pending bios and re-queue them */
 		if (bio_segments(bio) > segs) {
 			/*
@@ -2133,9 +2089,42 @@ static int blkfront_resume(struct xenbus_device *dev)
 {
 	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
 	int err = 0;
+	unsigned int i, j;
 
 	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
 
+	bio_list_init(&info->bio_list);
+	INIT_LIST_HEAD(&info->requests);
+	for (i = 0; i < info->nr_rings; i++) {
+		struct blkfront_ring_info *rinfo = &info->rinfo[i];
+		struct bio_list merge_bio;
+		struct blk_shadow *shadow = rinfo->shadow;
+
+		for (j = 0; j < BLK_RING_SIZE(info); j++) {
+			/* Not in use? */
+			if (!shadow[j].request)
+				continue;
+
+			/*
+			 * Get the bios in the request so we can re-queue them.
+			 */
+			if (shadow[j].request->cmd_flags &
+					(REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+				/*
+				 * Flush operations don't contain bios, so
+				 * we need to requeue the whole request
+				 */
+				list_add(&shadow[j].request->queuelist, &info->requests);
+				continue;
+			}
+			merge_bio.head = shadow[j].request->bio;
+			merge_bio.tail = shadow[j].request->biotail;
+			bio_list_merge(&info->bio_list, &merge_bio);
+			shadow[j].request->bio = NULL;
+			blk_mq_end_request(shadow[j].request, 0);
+		}
+	}
+
 	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
 
 	err = negotiate_mq(info);

From 74524955556096a0b2a821a49b4d0abebad3ee16 Mon Sep 17 00:00:00 2001
From: Tahsin Erdogan <tahsin@google.com>
Date: Thu, 16 Jun 2016 05:15:33 -0700
Subject: [PATCH 2/3] writeback: inode cgroup wb switch should not call ihold()

Asynchronous wb switching of inodes takes an additional ref count on an
inode to make sure inode remains valid until switchover is completed.

However, anyone calling ihold() must already have a ref count on inode,
but in this case inode->i_count may already be zero:

------------[ cut here ]------------
WARNING: CPU: 1 PID: 917 at fs/inode.c:397 ihold+0x2b/0x30
CPU: 1 PID: 917 Comm: kworker/u4:5 Not tainted 4.7.0-rc2+ #49
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
01/01/2011
Workqueue: writeback wb_workfn (flush-8:16)
 0000000000000000 ffff88007ca0fb58 ffffffff805990af 0000000000000000
 0000000000000000 ffff88007ca0fb98 ffffffff80268702 0000018d000004e2
 ffff88007cef40e8 ffff88007c9b89a8 ffff880079e3a740 0000000000000003
Call Trace:
 [<ffffffff805990af>] dump_stack+0x4d/0x6e
 [<ffffffff80268702>] __warn+0xc2/0xe0
 [<ffffffff802687d8>] warn_slowpath_null+0x18/0x20
 [<ffffffff8035b4ab>] ihold+0x2b/0x30
 [<ffffffff80367ecc>] inode_switch_wbs+0x11c/0x180
 [<ffffffff80369110>] wbc_detach_inode+0x170/0x1a0
 [<ffffffff80369abc>] writeback_sb_inodes+0x21c/0x530
 [<ffffffff80369f7e>] wb_writeback+0xee/0x1e0
 [<ffffffff8036a147>] wb_workfn+0xd7/0x280
 [<ffffffff80287531>] ? try_to_wake_up+0x1b1/0x2b0
 [<ffffffff8027bb09>] process_one_work+0x129/0x300
 [<ffffffff8027be06>] worker_thread+0x126/0x480
 [<ffffffff8098cde7>] ? __schedule+0x1c7/0x561
 [<ffffffff8027bce0>] ? process_one_work+0x300/0x300
 [<ffffffff80280ff4>] kthread+0xc4/0xe0
 [<ffffffff80335578>] ? kfree+0xc8/0x100
 [<ffffffff809903cf>] ret_from_fork+0x1f/0x40
 [<ffffffff80280f30>] ? __kthread_parkme+0x70/0x70
---[ end trace aaefd2fd9f306bc4 ]---

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 989a2cef6b76..fe7e83a45eff 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -483,9 +483,9 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 		goto out_free;
 	}
 	inode->i_state |= I_WB_SWITCH;
+	__iget(inode);
 	spin_unlock(&inode->i_lock);
 
-	ihold(inode);
 	isw->inode = inode;
 
 	atomic_inc(&isw_nr_in_flight);

From 8ba8682107ee2ca3347354e018865d8e1967c5f4 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 1 Jul 2016 00:39:35 -0700
Subject: [PATCH 3/3] block: fix use-after-free in sys_ioprio_get()

get_task_ioprio() accesses the task->io_context without holding the task
lock and thus can race with exit_io_context(), leading to a
use-after-free. The reproducer below hits this within a few seconds on
my 4-core QEMU VM:

#define _GNU_SOURCE
#include <assert.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
	pid_t pid, child;
	long nproc, i;

	/* ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); */
	syscall(SYS_ioprio_set, 1, 0, 0x6000);

	nproc = sysconf(_SC_NPROCESSORS_ONLN);

	for (i = 0; i < nproc; i++) {
		pid = fork();
		assert(pid != -1);
		if (pid == 0) {
			for (;;) {
				pid = fork();
				assert(pid != -1);
				if (pid == 0) {
					_exit(0);
				} else {
					child = wait(NULL);
					assert(child == pid);
				}
			}
		}

		pid = fork();
		assert(pid != -1);
		if (pid == 0) {
			for (;;) {
				/* ioprio_get(IOPRIO_WHO_PGRP, 0); */
				syscall(SYS_ioprio_get, 2, 0);
			}
		}
	}

	for (;;) {
		/* ioprio_get(IOPRIO_WHO_PGRP, 0); */
		syscall(SYS_ioprio_get, 2, 0);
	}

	return 0;
}

This gets us KASAN dumps like this:

[   35.526914] ==================================================================
[   35.530009] BUG: KASAN: out-of-bounds in get_task_ioprio+0x7b/0x90 at addr ffff880066f34e6c
[   35.530009] Read of size 2 by task ioprio-gpf/363
[   35.530009] =============================================================================
[   35.530009] BUG blkdev_ioc (Not tainted): kasan: bad access detected
[   35.530009] -----------------------------------------------------------------------------

[   35.530009] Disabling lock debugging due to kernel taint
[   35.530009] INFO: Allocated in create_task_io_context+0x2b/0x370 age=0 cpu=0 pid=360
[   35.530009] 	___slab_alloc+0x55d/0x5a0
[   35.530009] 	__slab_alloc.isra.20+0x2b/0x40
[   35.530009] 	kmem_cache_alloc_node+0x84/0x200
[   35.530009] 	create_task_io_context+0x2b/0x370
[   35.530009] 	get_task_io_context+0x92/0xb0
[   35.530009] 	copy_process.part.8+0x5029/0x5660
[   35.530009] 	_do_fork+0x155/0x7e0
[   35.530009] 	SyS_clone+0x19/0x20
[   35.530009] 	do_syscall_64+0x195/0x3a0
[   35.530009] 	return_from_SYSCALL_64+0x0/0x6a
[   35.530009] INFO: Freed in put_io_context+0xe7/0x120 age=0 cpu=0 pid=1060
[   35.530009] 	__slab_free+0x27b/0x3d0
[   35.530009] 	kmem_cache_free+0x1fb/0x220
[   35.530009] 	put_io_context+0xe7/0x120
[   35.530009] 	put_io_context_active+0x238/0x380
[   35.530009] 	exit_io_context+0x66/0x80
[   35.530009] 	do_exit+0x158e/0x2b90
[   35.530009] 	do_group_exit+0xe5/0x2b0
[   35.530009] 	SyS_exit_group+0x1d/0x20
[   35.530009] 	entry_SYSCALL_64_fastpath+0x1a/0xa4
[   35.530009] INFO: Slab 0xffffea00019bcd00 objects=20 used=4 fp=0xffff880066f34ff0 flags=0x1fffe0000004080
[   35.530009] INFO: Object 0xffff880066f34e58 @offset=3672 fp=0x0000000000000001
[   35.530009] ==================================================================

Fix it by grabbing the task lock while we poke at the io_context.

Cc: stable@vger.kernel.org
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/ioprio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/ioprio.c b/block/ioprio.c
index cc7800e9eb44..01b8116298a1 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -150,8 +150,10 @@ static int get_task_ioprio(struct task_struct *p)
 	if (ret)
 		goto out;
 	ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
+	task_lock(p);
 	if (p->io_context)
 		ret = p->io_context->ioprio;
+	task_unlock(p);
 out:
 	return ret;
 }