From 5b0bbee4732cbd58aa98213d4c11a366356bba3d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 27 Apr 2020 10:41:22 -0600
Subject: [PATCH 1/8] io_uring: statx must grab the file table for valid fd

Clay reports that OP_STATX fails for a test case with a valid fd
and empty path:

 -- Test 0: statx:fd 3: SUCCEED, file mode 100755
 -- Test 1: statx:path ./uring_statx: SUCCEED, file mode 100755
 -- Test 2: io_uring_statx:fd 3: FAIL, errno 9: Bad file descriptor
 -- Test 3: io_uring_statx:path ./uring_statx: SUCCEED, file mode 100755

This is due to statx not grabbing the process file table, hence we can't
lookup the fd in async context. If the fd is valid, ensure that we grab
the file table so we can grab the file from async context.

Cc: stable@vger.kernel.org # v5.6
Reported-by: Clay Harris <bugs@claycon.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c687f57fb651..084dfade5cda 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -524,6 +524,7 @@ enum {
 	REQ_F_OVERFLOW_BIT,
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
+	REQ_F_NO_FILE_TABLE_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -577,6 +578,8 @@ enum {
 	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
 	/* buffer already selected */
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
+	/* doesn't need file table for this request */
+	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
 };
 
 struct async_poll {
@@ -799,6 +802,7 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_file		= 1,
 		.fd_non_neg		= 1,
 		.needs_fs		= 1,
+		.file_table		= 1,
 	},
 	[IORING_OP_READ] = {
 		.needs_mm		= 1,
@@ -3355,8 +3359,12 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
 	struct kstat stat;
 	int ret;
 
-	if (force_nonblock)
+	if (force_nonblock) {
+		/* only need file table for an actual valid fd */
+		if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
+			req->flags |= REQ_F_NO_FILE_TABLE;
 		return -EAGAIN;
+	}
 
 	if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
 		return -EINVAL;
@@ -5429,7 +5437,7 @@ static int io_grab_files(struct io_kiocb *req)
 	int ret = -EBADF;
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (req->work.files)
+	if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
 		return 0;
 	if (!ctx->ring_file)
 		return -EBADF;

From af197f50ac53fff1241598c73ca606754a3bb808 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 28 Apr 2020 13:15:06 -0600
Subject: [PATCH 2/8] io_uring: enable poll retry for any file with ->read_iter
 / ->write_iter

We can have files like eventfd where it's perfectly fine to do poll
based retry on them, right now io_file_supports_async() doesn't take
that into account.

Pass in data direction and check the f_op instead of just always needing
an async worker.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 084dfade5cda..516a59db73ca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2038,7 +2038,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
  * any file. For now, just ensure that anything potentially problematic is done
  * inline.
  */
-static bool io_file_supports_async(struct file *file)
+static bool io_file_supports_async(struct file *file, int rw)
 {
 	umode_t mode = file_inode(file)->i_mode;
 
@@ -2047,7 +2047,13 @@ static bool io_file_supports_async(struct file *file)
 	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
 		return true;
 
-	return false;
+	if (!(file->f_mode & FMODE_NOWAIT))
+		return false;
+
+	if (rw == READ)
+		return file->f_op->read_iter != NULL;
+
+	return file->f_op->write_iter != NULL;
 }
 
 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
@@ -2575,7 +2581,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
 	 * we know to async punt it even if it was opened O_NONBLOCK
 	 */
-	if (force_nonblock && !io_file_supports_async(req->file))
+	if (force_nonblock && !io_file_supports_async(req->file, READ))
 		goto copy_iov;
 
 	iov_count = iov_iter_count(&iter);
@@ -2666,7 +2672,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
 	 * we know to async punt it even if it was opened O_NONBLOCK
 	 */
-	if (force_nonblock && !io_file_supports_async(req->file))
+	if (force_nonblock && !io_file_supports_async(req->file, WRITE))
 		goto copy_iov;
 
 	/* file path doesn't support NOWAIT for non-direct_IO */
@@ -2760,11 +2766,11 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static bool io_splice_punt(struct file *file)
+static bool io_splice_punt(struct file *file, int rw)
 {
 	if (get_pipe_info(file))
 		return false;
-	if (!io_file_supports_async(file))
+	if (!io_file_supports_async(file, rw))
 		return true;
 	return !(file->f_flags & O_NONBLOCK);
 }
@@ -2779,7 +2785,7 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
 	long ret;
 
 	if (force_nonblock) {
-		if (io_splice_punt(in) || io_splice_punt(out))
+		if (io_splice_punt(in, READ) || io_splice_punt(out, WRITE))
 			return -EAGAIN;
 		flags |= SPLICE_F_NONBLOCK;
 	}

From 490e89676a523c688343d6cb8ca5f5dc476414df Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 28 Apr 2020 13:16:53 -0600
Subject: [PATCH 3/8] io_uring: only force async punt if poll based retry can't
 handle it

We do blocking retry from our poll handler, if the file supports polled
notifications. Only mark the request as needing an async worker if we
can't poll for it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 516a59db73ca..b536c34c6c36 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2604,7 +2604,8 @@ copy_iov:
 			if (ret)
 				goto out_free;
 			/* any defer here is final, must blocking retry */
-			if (!(req->flags & REQ_F_NOWAIT))
+			if (!(req->flags & REQ_F_NOWAIT) &&
+			    !file_can_poll(req->file))
 				req->flags |= REQ_F_MUST_PUNT;
 			return -EAGAIN;
 		}
@@ -2726,7 +2727,8 @@ copy_iov:
 			if (ret)
 				goto out_free;
 			/* any defer here is final, must blocking retry */
-			req->flags |= REQ_F_MUST_PUNT;
+			if (!file_can_poll(req->file))
+				req->flags |= REQ_F_MUST_PUNT;
 			return -EAGAIN;
 		}
 	}

From dd461af65946de060bff2dab08a63676d2731afe Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Wed, 29 Apr 2020 17:47:50 -0700
Subject: [PATCH 4/8] io_uring: use proper references for fallback_req locking

Use ctx->fallback_req address for test_and_set_bit_lock() and
clear_bit_unlock().

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b536c34c6c36..3da2a02531e6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1295,7 +1295,7 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
 	struct io_kiocb *req;
 
 	req = ctx->fallback_req;
-	if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
+	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
 		return req;
 
 	return NULL;
@@ -1382,7 +1382,7 @@ static void __io_free_req(struct io_kiocb *req)
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
 	else
-		clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
+		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
 }
 
 struct req_batch {

From 3fd44c86711f71156b586c22b0495c58f69358bb Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Date: Fri, 1 May 2020 08:52:56 +0800
Subject: [PATCH 5/8] io_uring: use cond_resched() in
 io_ring_ctx_wait_and_kill()

While working on to make io_uring sqpoll mode support syscalls that need
struct files_struct, I got cpu soft lockup in io_ring_ctx_wait_and_kill(),

    while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
        cpu_relax();

above loop never has an chance to exit, it's because preempt isn't enabled
in the kernel, and the context calling io_ring_ctx_wait_and_kill() and
io_sq_thread() run in the same cpu, if io_sq_thread calls a cond_resched()
yield cpu and another context enters above loop, then io_sq_thread() will
always in runqueue and never exit.

Use cond_resched() can fix this issue.

 Reported-by: syzbot+66243bb7126c410cefe6@syzkaller.appspotmail.com
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3da2a02531e6..5ca2da6648d0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7343,7 +7343,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	 * it could cause shutdown to hang.
 	 */
 	while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait))
-		cpu_relax();
+		cond_resched();
 
 	io_kill_timeouts(ctx);
 	io_poll_remove_all(ctx);

From 7759a0bfadceef3910d0e50f86d63b6ed58b4e70 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 1 May 2020 17:09:36 +0300
Subject: [PATCH 6/8] io_uring: fix extra put in sync_file_range()

[   40.179474] refcount_t: underflow; use-after-free.
[   40.179499] WARNING: CPU: 6 PID: 1848 at lib/refcount.c:28 refcount_warn_saturate+0xae/0xf0
...
[   40.179612] RIP: 0010:refcount_warn_saturate+0xae/0xf0
[   40.179617] Code: 28 44 0a 01 01 e8 d7 01 c2 ff 0f 0b 5d c3 80 3d 15 44 0a 01 00 75 91 48 c7 c7 b8 f5 75 be c6 05 05 44 0a 01 01 e8 b7 01 c2 ff <0f> 0b 5d c3 80 3d f3 43 0a 01 00 0f 85 6d ff ff ff 48 c7 c7 10 f6
[   40.179619] RSP: 0018:ffffb252423ebe18 EFLAGS: 00010286
[   40.179623] RAX: 0000000000000000 RBX: ffff98d65e929400 RCX: 0000000000000000
[   40.179625] RDX: 0000000000000001 RSI: 0000000000000086 RDI: 00000000ffffffff
[   40.179627] RBP: ffffb252423ebe18 R08: 0000000000000001 R09: 000000000000055d
[   40.179629] R10: 0000000000000c8c R11: 0000000000000001 R12: 0000000000000000
[   40.179631] R13: ffff98d68c434400 R14: ffff98d6a9cbaa20 R15: ffff98d6a609ccb8
[   40.179634] FS:  0000000000000000(0000) GS:ffff98d6af580000(0000) knlGS:0000000000000000
[   40.179636] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   40.179638] CR2: 00000000033e3194 CR3: 000000006480a003 CR4: 00000000003606e0
[   40.179641] Call Trace:
[   40.179652]  io_put_req+0x36/0x40
[   40.179657]  io_free_work+0x15/0x20
[   40.179661]  io_worker_handle_work+0x2f5/0x480
[   40.179667]  io_wqe_worker+0x2a9/0x360
[   40.179674]  ? _raw_spin_unlock_irqrestore+0x24/0x40
[   40.179681]  kthread+0x12c/0x170
[   40.179685]  ? io_worker_handle_work+0x480/0x480
[   40.179690]  ? kthread_park+0x90/0x90
[   40.179695]  ret_from_fork+0x35/0x40
[   40.179702] ---[ end trace 85027405f00110aa ]---

Opcode handler must never put submission ref, but that's what
io_sync_file_range_finish() do. use io_steal_work() there.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5ca2da6648d0..7f10af02c3d1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3518,7 +3518,7 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr)
 	if (io_req_cancelled(req))
 		return;
 	__io_sync_file_range(req);
-	io_put_req(req); /* put submission ref */
+	io_steal_work(req, workptr);
 }
 
 static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)

From 4ee3631451c9a62e6b6bc7ee51fb9a5b34e33509 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 1 May 2020 17:09:37 +0300
Subject: [PATCH 7/8] io_uring: check non-sync defer_list carefully

io_req_defer() do double-checked locking. Use proper helpers for that,
i.e. list_empty_careful().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7f10af02c3d1..91ddc27b5173 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5031,7 +5031,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	int ret;
 
 	/* Still need defer if there is pending req in defer list. */
-	if (!req_need_defer(req) && list_empty(&ctx->defer_list))
+	if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
 		return 0;
 
 	if (!req->io && io_alloc_async_ctx(req))

From 2fb3e82284fca40afbde5351907f0a5b3be717f9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 1 May 2020 17:09:38 +0300
Subject: [PATCH 8/8] io_uring: punt splice async because of inode mutex

Nonblocking do_splice() still may wait for some time on an inode mutex.
Let's play safe and always punt it async.

Reported-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 91ddc27b5173..0b91b0631173 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2768,15 +2768,6 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static bool io_splice_punt(struct file *file, int rw)
-{
-	if (get_pipe_info(file))
-		return false;
-	if (!io_file_supports_async(file, rw))
-		return true;
-	return !(file->f_flags & O_NONBLOCK);
-}
-
 static int io_splice(struct io_kiocb *req, bool force_nonblock)
 {
 	struct io_splice *sp = &req->splice;
@@ -2786,11 +2777,8 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
 	loff_t *poff_in, *poff_out;
 	long ret;
 
-	if (force_nonblock) {
-		if (io_splice_punt(in, READ) || io_splice_punt(out, WRITE))
-			return -EAGAIN;
-		flags |= SPLICE_F_NONBLOCK;
-	}
+	if (force_nonblock)
+		return -EAGAIN;
 
 	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
 	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;