From a719370be52d1152a5f6e19c4af3b73280e25475 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 21:57:48 +1000
Subject: [PATCH 1/8] xfs: factor DIO write mapping from get_blocks

Clarify and separate the buffer mapping logic so that the direct IO mapping is
not tangled up in propagating the extent status to the mapping buffer. This
makes it easier to extend the direct IO mapping to use an ioend in future.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 40 +++++++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a9b7a1b8704..489ed200bbbb 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1233,6 +1233,22 @@ xfs_vm_releasepage(
 	return try_to_free_buffers(page);
 }
 
+/*
+ * do all the direct IO specific mapping buffer manipulation here.
+ */
+static void
+xfs_map_direct(
+	struct inode		*inode,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	if (ISUNWRITTEN(imap)) {
+		bh_result->b_private = inode;
+		set_buffer_defer_completion(bh_result);
+	}
+}
+
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
@@ -1331,21 +1347,19 @@ __xfs_get_blocks(
 		goto out_unlock;
 	}
 
+	/*
+	 * For unwritten extents do not report a disk address in the buffered
+	 * read case (treat as if we're reading into a hole).
+	 */
 	if (imap.br_startblock != HOLESTARTBLOCK &&
-	    imap.br_startblock != DELAYSTARTBLOCK) {
-		/*
-		 * For unwritten extents do not report a disk address on
-		 * the read case (treat as if we're reading into a hole).
-		 */
-		if (create || !ISUNWRITTEN(&imap))
-			xfs_map_buffer(inode, bh_result, &imap, offset);
-		if (create && ISUNWRITTEN(&imap)) {
-			if (direct) {
-				bh_result->b_private = inode;
-				set_buffer_defer_completion(bh_result);
-			}
+	    imap.br_startblock != DELAYSTARTBLOCK &&
+	    (create || !ISUNWRITTEN(&imap))) {
+		xfs_map_buffer(inode, bh_result, &imap, offset);
+		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
-		}
+		/* direct IO needs special help */
+		if (create && direct)
+			xfs_map_direct(inode, bh_result, &imap, offset);
 	}
 
 	/*

From 1fdca9c21198b2c2869086ac3629612492476f28 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 21:58:21 +1000
Subject: [PATCH 2/8] xfs: move DIO mapping size calculation

The mapping size calculation is done last in __xfs_get_blocks(), but
we are going to need the actual mapping size we will use to map the
direct IO correctly in xfs_map_direct(). Factor out the calculation
for code clarity, and move the call to be the first operation in
mapping the extent to the returned buffer.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c | 79 +++++++++++++++++++++++++++--------------------
 1 file changed, 46 insertions(+), 33 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 489ed200bbbb..4a29399ed549 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1249,6 +1249,47 @@ xfs_map_direct(
 	}
 }
 
+
+/*
+ * If this is O_DIRECT or the mpage code calling tell them how large the mapping
+ * is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the mapping
+ * for blocks beyond EOF must be marked new so that sub block regions can be
+ * correctly zeroed. We can't do this for mappings within EOF unless the mapping
+ * was just allocated or is unwritten, otherwise the callers would overwrite
+ * existing data with zeros. Hence we have to split the mapping into a range up
+ * to and including EOF, and a second mapping for beyond EOF.
+ */
+static void
+xfs_map_trim_size(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset,
+	ssize_t			size)
+{
+	xfs_off_t		mapping_size;
+
+	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
+	mapping_size <<= inode->i_blkbits;
+
+	ASSERT(mapping_size > 0);
+	if (mapping_size > size)
+		mapping_size = size;
+	if (offset < i_size_read(inode) &&
+	    offset + mapping_size >= i_size_read(inode)) {
+		/* limit mapping to block that spans EOF */
+		mapping_size = roundup_64(i_size_read(inode) - offset,
+					  1 << inode->i_blkbits);
+	}
+	if (mapping_size > LONG_MAX)
+		mapping_size = LONG_MAX;
+
+	bh_result->b_size = mapping_size;
+}
+
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
@@ -1347,6 +1388,11 @@ __xfs_get_blocks(
 		goto out_unlock;
 	}
 
+	/* trim mapping down to size requested */
+	if (direct || size > (1 << inode->i_blkbits))
+		xfs_map_trim_size(inode, iblock, bh_result,
+				  &imap, offset, size);
+
 	/*
 	 * For unwritten extents do not report a disk address in the buffered
 	 * read case (treat as if we're reading into a hole).
@@ -1392,39 +1438,6 @@ __xfs_get_blocks(
 		}
 	}
 
-	/*
-	 * If this is O_DIRECT or the mpage code calling tell them how large
-	 * the mapping is, so that we can avoid repeated get_blocks calls.
-	 *
-	 * If the mapping spans EOF, then we have to break the mapping up as the
-	 * mapping for blocks beyond EOF must be marked new so that sub block
-	 * regions can be correctly zeroed. We can't do this for mappings within
-	 * EOF unless the mapping was just allocated or is unwritten, otherwise
-	 * the callers would overwrite existing data with zeros. Hence we have
-	 * to split the mapping into a range up to and including EOF, and a
-	 * second mapping for beyond EOF.
-	 */
-	if (direct || size > (1 << inode->i_blkbits)) {
-		xfs_off_t		mapping_size;
-
-		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
-		mapping_size <<= inode->i_blkbits;
-
-		ASSERT(mapping_size > 0);
-		if (mapping_size > size)
-			mapping_size = size;
-		if (offset < i_size_read(inode) &&
-		    offset + mapping_size >= i_size_read(inode)) {
-			/* limit mapping to block that spans EOF */
-			mapping_size = roundup_64(i_size_read(inode) - offset,
-						  1 << inode->i_blkbits);
-		}
-		if (mapping_size > LONG_MAX)
-			mapping_size = LONG_MAX;
-
-		bh_result->b_size = mapping_size;
-	}
-
 	return 0;
 
 out_unlock:

From d5cc2e3f968ff60f247fdef15b04fac788ef46d2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 21:59:07 +1000
Subject: [PATCH 3/8] xfs: DIO needs an ioend for writes

Currently we can only tell DIO completion that an IO requires
unwritten extent completion. This is done by a hacky non-null
private pointer passed to IO completion, but the private pointer
does not actually contain any information that is used.

We also need to pass to IO completion the fact that the IO may be
beyond EOF and so a size update transaction needs to be done. This
is currently determined by checks in the io completion, but we need
to determine if this is necessary at block mapping time as we need
to defer the size update transactions to a completion workqueue,
just like unwritten extent conversion.

To do this, first we need to allocate and pass an ioend to IO
completion. Add this for unwritten extent conversion; we'll do the
EOF updates in the next commit.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c  | 92 +++++++++++++++++++++++++++++++++++++++++-----
 fs/xfs/xfs_trace.h |  3 ++
 2 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4a29399ed549..60d6466d72f6 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1234,7 +1234,23 @@ xfs_vm_releasepage(
 }
 
 /*
- * do all the direct IO specific mapping buffer manipulation here.
+ * When we map a DIO buffer, we need to attach an ioend that describes the type
+ * of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform.
+ *
+ * If we get multiple mappings in a single IO, we might be mapping different
+ * types. But because the direct IO can only have a single private pointer, we
+ * need to ensure that:
+ *
+ * a) the ioend spans the entire region of the IO; and
+ * b) if it contains unwritten extents, it is *permanently* marked as such
+ *
+ * We could do this by chaining ioends like buffered IO does, but we only
+ * actually get one IO completion callback from the direct IO, and that spans
+ * the entire IO regardless of how many mappings and IOs are needed to complete
+ * the DIO. There is only going to be one reference to the ioend and its life
+ * cycle is constrained by the DIO completion code. hence we don't need
+ * reference counting here.
  */
 static void
 xfs_map_direct(
@@ -1243,10 +1259,42 @@ xfs_map_direct(
 	struct xfs_bmbt_irec	*imap,
 	xfs_off_t		offset)
 {
-	if (ISUNWRITTEN(imap)) {
-		bh_result->b_private = inode;
-		set_buffer_defer_completion(bh_result);
+	struct xfs_ioend	*ioend;
+	xfs_off_t		size = bh_result->b_size;
+	int			type;
+
+	if (ISUNWRITTEN(imap))
+		type = XFS_IO_UNWRITTEN;
+	else
+		type = XFS_IO_OVERWRITE;
+
+	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+
+	if (bh_result->b_private) {
+		ioend = bh_result->b_private;
+		ASSERT(ioend->io_size > 0);
+		ASSERT(offset >= ioend->io_offset);
+		if (offset + size > ioend->io_offset + ioend->io_size)
+			ioend->io_size = offset - ioend->io_offset + size;
+
+		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
+			ioend->io_type = XFS_IO_UNWRITTEN;
+
+		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
+					      ioend->io_size, ioend->io_type,
+					      imap);
+	} else {
+		ioend = xfs_alloc_ioend(inode, type);
+		ioend->io_offset = offset;
+		ioend->io_size = size;
+		bh_result->b_private = ioend;
+
+		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
+					   imap);
 	}
+
+	if (ioend->io_type == XFS_IO_UNWRITTEN)
+		set_buffer_defer_completion(bh_result);
 }
 
 
@@ -1378,10 +1426,13 @@ __xfs_get_blocks(
 
 			xfs_iunlock(ip, lockmode);
 		}
-
-		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_alloc(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_DELALLOC, &imap);
 	} else if (nimaps) {
-		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+		trace_xfs_get_blocks_found(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_OVERWRITE, &imap);
 		xfs_iunlock(ip, lockmode);
 	} else {
 		trace_xfs_get_blocks_notfound(ip, offset, size);
@@ -1482,9 +1533,28 @@ xfs_end_io_direct_write(
 	struct inode		*inode = file_inode(iocb->ki_filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ioend	*ioend = private;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return;
+		goto out_destroy_ioend;
+
+	/*
+	 * dio completion end_io functions are only called on writes if more
+	 * than 0 bytes was written.
+	 */
+	ASSERT(size > 0);
+
+	/*
+	 * The ioend only maps whole blocks, while the IO may be sector aligned.
+	 * Hence the ioend offset/size may not match the IO offset/size exactly,
+	 * but should span it completely. Write the IO sizes into the ioend so
+	 * that completion processing does the right thing.
+	 */
+	ASSERT(size <= ioend->io_size);
+	ASSERT(offset >= ioend->io_offset);
+	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
+	ioend->io_size = size;
+	ioend->io_offset = offset;
 
 	/*
 	 * While the generic direct I/O code updates the inode size, it does
@@ -1504,7 +1574,7 @@ xfs_end_io_direct_write(
 	 * we can pass the ioend to the direct IO allocation callbacks and
 	 * avoid nesting that way.
 	 */
-	if (private && size > 0) {
+	if (ioend->io_type == XFS_IO_UNWRITTEN) {
 		xfs_iomap_write_unwritten(ip, offset, size);
 	} else if (offset + size > ip->i_d.di_size) {
 		struct xfs_trans	*tp;
@@ -1514,11 +1584,13 @@ xfs_end_io_direct_write(
 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
 		if (error) {
 			xfs_trans_cancel(tp, 0);
-			return;
+			goto out_destroy_ioend;
 		}
 
 		xfs_setfilesize(ip, tp, offset, size);
 	}
+out_destroy_ioend:
+	xfs_destroy_ioend(ioend);
 }
 
 STATIC ssize_t
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 51372e34d988..2de8556ffac2 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1217,6 +1217,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),

From 6dfa1b67e3b3a9bf536e2fb9ed99001c219822a5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 21:59:34 +1000
Subject: [PATCH 4/8] xfs: handle DIO overwrite EOF update completion correctly

Currently a DIO overwrite that extends the EOF (e.g. sub-block IO or
write into allocated blocks beyond EOF) requires a transaction for
the EOF update. This is done in IO completion context, but we aren't
explicitly handling this situation properly and so it can run in
interrupt context. Ensure that we defer IO that spans EOF correctly
to the DIO completion workqueue, and now that we have an ioend in IO
completion we can use the common ioend completion path to do all the
work.

Note: we do not preallocate the append transaction as we can have
multiple mapping and allocation calls per direct IO. Hence
preallocating can still leave us with nested transactions by
attempting to map and allocate more blocks after we've preallocated
an append transaction.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c  | 61 +++++++++++++++++++++++-----------------------
 fs/xfs/xfs_trace.h |  1 +
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 60d6466d72f6..a59443db1de9 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1293,7 +1293,7 @@ xfs_map_direct(
 					   imap);
 	}
 
-	if (ioend->io_type == XFS_IO_UNWRITTEN)
+	if (ioend->io_type == XFS_IO_UNWRITTEN || xfs_ioend_is_append(ioend))
 		set_buffer_defer_completion(bh_result);
 }
 
@@ -1535,8 +1535,10 @@ xfs_end_io_direct_write(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_ioend	*ioend = private;
 
+	trace_xfs_gbmap_direct_endio(ip, offset, size, ioend->io_type, NULL);
+
 	if (XFS_FORCED_SHUTDOWN(mp))
-		goto out_destroy_ioend;
+		goto out_end_io;
 
 	/*
 	 * dio completion end_io functions are only called on writes if more
@@ -1557,40 +1559,37 @@ xfs_end_io_direct_write(
 	ioend->io_offset = offset;
 
 	/*
-	 * While the generic direct I/O code updates the inode size, it does
-	 * so only after the end_io handler is called, which means our
-	 * end_io handler thinks the on-disk size is outside the in-core
-	 * size.  To prevent this just update it a little bit earlier here.
+	 * The ioend tells us whether we are doing unwritten extent conversion
+	 * or an append transaction that updates the on-disk file size. These
+	 * cases are the only cases where we should *potentially* be needing
+	 * to update the VFS inode size. When the ioend indicates this, we
+	 * are *guaranteed* to be running in non-interrupt context.
+	 *
+	 * We need to update the in-core inode size here so that we don't end up
+	 * with the on-disk inode size being outside the in-core inode size.
+	 * While we can do this in the process context after the IO has
+	 * completed, this does not work for AIO and hence we always update
+	 * the in-core inode size here if necessary.
 	 */
-	if (offset + size > i_size_read(inode))
-		i_size_write(inode, offset + size);
+	if (ioend->io_type == XFS_IO_UNWRITTEN || xfs_ioend_is_append(ioend)) {
+		if (offset + size > i_size_read(inode))
+			i_size_write(inode, offset + size);
+	} else
+		ASSERT(offset + size <= i_size_read(inode));
 
 	/*
-	 * For direct I/O we do not know if we need to allocate blocks or not,
-	 * so we can't preallocate an append transaction, as that results in
-	 * nested reservations and log space deadlocks. Hence allocate the
-	 * transaction here. While this is sub-optimal and can block IO
-	 * completion for some time, we're stuck with doing it this way until
-	 * we can pass the ioend to the direct IO allocation callbacks and
-	 * avoid nesting that way.
+	 * If we are doing an append IO that needs to update the EOF on disk,
+	 * do the transaction reserve now so we can use common end io
+	 * processing. Stashing the error (if there is one) in the ioend will
+	 * result in the ioend processing passing on the error if it is
+	 * possible as we can't return it from here.
 	 */
-	if (ioend->io_type == XFS_IO_UNWRITTEN) {
-		xfs_iomap_write_unwritten(ip, offset, size);
-	} else if (offset + size > ip->i_d.di_size) {
-		struct xfs_trans	*tp;
-		int			error;
+	if (ioend->io_type == XFS_IO_OVERWRITE && xfs_ioend_is_append(ioend))
+		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
 
-		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-		if (error) {
-			xfs_trans_cancel(tp, 0);
-			goto out_destroy_ioend;
-		}
-
-		xfs_setfilesize(ip, tp, offset, size);
-	}
-out_destroy_ioend:
-	xfs_destroy_ioend(ioend);
+out_end_io:
+	xfs_end_io(&ioend->io_work);
+	return;
 }
 
 STATIC ssize_t
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2de8556ffac2..0ae50e9847bb 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1220,6 +1220,7 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
 DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
 DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),

From a06c277a13c3620c8ee9304891758f2fcff9c4a4 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 22:00:00 +1000
Subject: [PATCH 5/8] xfs: DIO writes within EOF don't need an ioend

DIO writes that lie entirely within EOF have nothing to do in IO
completion. In this case, we don't need no steekin' ioend, and so we
can avoid allocating an ioend until we have a mapping that spans
EOF.

This means that IO completion has two contexts - deferred completion
to the dio workqueue that uses an ioend, and interrupt completion
that does nothing because there is nothing that can be done in this
context.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c  | 69 ++++++++++++++++++++++++++--------------------
 fs/xfs/xfs_trace.h |  1 +
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a59443db1de9..c02a47453137 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1234,15 +1234,19 @@ xfs_vm_releasepage(
 }
 
 /*
- * When we map a DIO buffer, we need to attach an ioend that describes the type
- * of write IO we are doing. This passes to the completion function the
- * operations it needs to perform.
+ * When we map a DIO buffer, we may need to attach an ioend that describes the
+ * type of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform. If the mapping is for an overwrite wholly
+ * within the EOF then we don't need an ioend and so we don't allocate one.
+ * This avoids the unnecessary overhead of allocating and freeing ioends for
+ * workloads that don't require transactions on IO completion.
  *
  * If we get multiple mappings in a single IO, we might be mapping different
  * types. But because the direct IO can only have a single private pointer, we
  * need to ensure that:
  *
- * a) the ioend spans the entire region of the IO; and
+ * a) i) the ioend spans the entire region of unwritten mappings; or
+ *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
  * b) if it contains unwritten extents, it is *permanently* marked as such
  *
  * We could do this by chaining ioends like buffered IO does, but we only
@@ -1283,21 +1287,23 @@ xfs_map_direct(
 		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
 					      ioend->io_size, ioend->io_type,
 					      imap);
-	} else {
+	} else if (type == XFS_IO_UNWRITTEN ||
+		   offset + size > i_size_read(inode)) {
 		ioend = xfs_alloc_ioend(inode, type);
 		ioend->io_offset = offset;
 		ioend->io_size = size;
+
 		bh_result->b_private = ioend;
+		set_buffer_defer_completion(bh_result);
 
 		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
 					   imap);
+	} else {
+		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+					    imap);
 	}
-
-	if (ioend->io_type == XFS_IO_UNWRITTEN || xfs_ioend_is_append(ioend))
-		set_buffer_defer_completion(bh_result);
 }
 
-
 /*
  * If this is O_DIRECT or the mpage code calling tell them how large the mapping
  * is, so that we can avoid repeated get_blocks calls.
@@ -1519,9 +1525,11 @@ xfs_get_blocks_direct(
 /*
  * Complete a direct I/O write request.
  *
- * If the private argument is non-NULL __xfs_get_blocks signals us that we
- * need to issue a transaction to convert the range from unwritten to written
- * extents.
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
  */
 STATIC void
 xfs_end_io_direct_write(
@@ -1535,7 +1543,13 @@ xfs_end_io_direct_write(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_ioend	*ioend = private;
 
-	trace_xfs_gbmap_direct_endio(ip, offset, size, ioend->io_type, NULL);
+	trace_xfs_gbmap_direct_endio(ip, offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
+		return;
+	}
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		goto out_end_io;
@@ -1548,12 +1562,12 @@ xfs_end_io_direct_write(
 
 	/*
 	 * The ioend only maps whole blocks, while the IO may be sector aligned.
-	 * Hence the ioend offset/size may not match the IO offset/size exactly,
-	 * but should span it completely. Write the IO sizes into the ioend so
-	 * that completion processing does the right thing.
+	 * Hence the ioend offset/size may not match the IO offset/size exactly.
+	 * Because we don't map overwrites within EOF into the ioend, the offset
+	 * may not match, but only if the endio spans EOF.  Either way, write
+	 * the IO sizes into the ioend so that completion processing does the
+	 * right thing.
 	 */
-	ASSERT(size <= ioend->io_size);
-	ASSERT(offset >= ioend->io_offset);
 	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
 	ioend->io_size = size;
 	ioend->io_offset = offset;
@@ -1562,20 +1576,15 @@ xfs_end_io_direct_write(
 	 * The ioend tells us whether we are doing unwritten extent conversion
 	 * or an append transaction that updates the on-disk file size. These
 	 * cases are the only cases where we should *potentially* be needing
-	 * to update the VFS inode size. When the ioend indicates this, we
-	 * are *guaranteed* to be running in non-interrupt context.
+	 * to update the VFS inode size.
 	 *
 	 * We need to update the in-core inode size here so that we don't end up
-	 * with the on-disk inode size being outside the in-core inode size.
-	 * While we can do this in the process context after the IO has
-	 * completed, this does not work for AIO and hence we always update
-	 * the in-core inode size here if necessary.
+	 * with the on-disk inode size being outside the in-core inode size. We
+	 * have no other method of updating EOF for AIO, so always do it here
+	 * if necessary.
 	 */
-	if (ioend->io_type == XFS_IO_UNWRITTEN || xfs_ioend_is_append(ioend)) {
-		if (offset + size > i_size_read(inode))
-			i_size_write(inode, offset + size);
-	} else
-		ASSERT(offset + size <= i_size_read(inode));
+	if (offset + size > i_size_read(inode))
+		i_size_write(inode, offset + size);
 
 	/*
 	 * If we are doing an append IO that needs to update the EOF on disk,
@@ -1584,7 +1593,7 @@ xfs_end_io_direct_write(
 	 * result in the ioend processing passing on the error if it is
 	 * possible as we can't return it from here.
 	 */
-	if (ioend->io_type == XFS_IO_OVERWRITE && xfs_ioend_is_append(ioend))
+	if (ioend->io_type == XFS_IO_OVERWRITE)
 		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
 
 out_end_io:
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0ae50e9847bb..4e0a5773eee4 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1220,6 +1220,7 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
 DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
 DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
 DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,

From b9d59846f73713d77f0f3fb784c7f84249fc2b93 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 22:03:07 +1000
Subject: [PATCH 6/8] xfs: DIO write completion size updates race

xfs_end_io_direct_write() can race with other IO completions when
updating the in-core inode size. The IO completion processing is not
serialised for direct IO - it is done either under the
IOLOCK_SHARED for non-AIO DIO, or without any IOLOCK held at all
during AIO DIO completion. Hence the non-atomic test-and-set update
of the in-core inode size is racy and can result in the in-core
inode size going backwards if the race is hit just right.

If the inode size goes backwards, this can trigger the EOF zeroing
code to run incorrectly on the next IO, which then will zero data
that has successfully been written to disk by a previous DIO.

To fix this bug, we need to serialise the test/set updates of the
in-core inode size. This first patch introduces locking around the
relevant updates and checks in the DIO path. Because we now have an
ioend in xfs_end_io_direct_write(), we know exactly when we are
doing an IO that requires an in-core EOF update, and we know that
they are not running in interrupt context. As such, we do not need to
use irqsave() spinlock variants to protect against interrupts while
the lock is held.

Hence we can use an existing spinlock in the inode to do this
serialisation and so not need to grow the struct xfs_inode just to
work around this problem.

This patch does not address the test/set EOF update in
generic_file_direct_write() for various reasons - that will be done
as a followup with separate explanation.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_aops.c |  7 +++++++
 fs/xfs/xfs_file.c | 13 ++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c02a47453137..598b259fda04 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1582,9 +1582,16 @@ xfs_end_io_direct_write(
 	 * with the on-disk inode size being outside the in-core inode size. We
 	 * have no other method of updating EOF for AIO, so always do it here
 	 * if necessary.
+	 *
+	 * We need to lock the test/set EOF update as we can be racing with
+	 * other IO completions here to update the EOF. Failing to serialise
+	 * here can result in EOF moving backwards and Bad Things Happen when
+	 * that occurs.
 	 */
+	spin_lock(&ip->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
+	spin_unlock(&ip->i_flags_lock);
 
 	/*
 	 * If we are doing an append IO that needs to update the EOF on disk,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ce615d12fb44..2323b8b63183 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -565,8 +565,18 @@ restart:
 	 * write.  If zeroing is needed and we are currently holding the
 	 * iolock shared, we need to update it to exclusive which implies
 	 * having to redo all checks before.
+	 *
+	 * We need to serialise against EOF updates that occur in IO
+	 * completions here. We want to make sure that nobody is changing the
+	 * size while we do this check until we have placed an IO barrier (i.e.
+	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+	 * The spinlock effectively forms a memory barrier once we have the
+	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+	 * and hence be able to correctly determine if we need to run zeroing.
 	 */
+	spin_lock(&ip->i_flags_lock);
 	if (*pos > i_size_read(inode)) {
+		spin_unlock(&ip->i_flags_lock);
 		if (*iolock == XFS_IOLOCK_SHARED) {
 			xfs_rw_iunlock(ip, *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
@@ -576,7 +586,8 @@ restart:
 		error = xfs_zero_eof(ip, *pos, i_size_read(inode));
 		if (error)
 			return error;
-	}
+	} else
+		spin_unlock(&ip->i_flags_lock);
 
 	/*
 	 * Updating the timestamps will grab the ilock again from

From 40c63fbc55a968383b8bb5cacad81585e80cd323 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 22:03:17 +1000
Subject: [PATCH 7/8] xfs: direct IO EOF zeroing needs to drain AIO

When we are doing AIO DIO writes, the IOLOCK only provides an IO
submission barrier. When we need to do EOF zeroing, we need to ensure
that no other IO is in progress and all pending in-core EOF updates
have been completed. This requires us to wait for all outstanding
AIO DIO writes to the inode to complete and, if necessary, run their
EOF updates.

Once all the EOF updates are complete, we can then restart
xfs_file_aio_write_checks() while holding the IOLOCK_EXCL, knowing
that EOF is up to date and we have exclusive IO access to the file
so we can run EOF block zeroing if we need to without interference.
This gives EOF zeroing the same exclusivity against other IO as we
provide truncate operations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2323b8b63183..f6f0e9671919 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -581,6 +581,16 @@ restart:
 			xfs_rw_iunlock(ip, *iolock);
 			*iolock = XFS_IOLOCK_EXCL;
 			xfs_rw_ilock(ip, *iolock);
+
+			/*
+			 * We now have an IO submission barrier in place, but
+			 * AIO can do EOF updates during IO completion and hence
+			 * we now need to wait for all of them to drain. Non-AIO
+			 * DIO will have drained before we are given the
+			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+			 * no-op.
+			 */
+			inode_dio_wait(inode);
 			goto restart;
 		}
 		error = xfs_zero_eof(ip, *pos, i_size_read(inode));

From 0cefb29e6a63727bc7606c47fc538467594ef112 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 16 Apr 2015 22:03:27 +1000
Subject: [PATCH 8/8] xfs: using generic_file_direct_write() is unnecessary

generic_file_direct_write() does all sorts of things to make DIO
work "sorta ok" with mixed buffered IO workloads. We already do
most of this work in xfs_file_aio_dio_write() because of the locking
requirements, so there's only a couple of things it does for us.

The first thing is that it does a page cache invalidation after the
->direct_IO callout. This can easily be added to the XFS code.

The second thing it does is that if data was written, it updates the
iov_iter structure to reflect the data written, and then does EOF
size updates if necessary. For XFS, these EOF size updates are now
not necessary, as we do them safely and race-free in IO completion
context. That leaves just the iov_iter update, and that's also moved
to the XFS code.

Therefore we don't need to call generic_file_direct_write() and in
doing so remove redundant buffered writeback and page cache
invalidation calls from the DIO submission path. We also remove a
racy EOF size update, and make the DIO submission code in XFS much
easier to follow. Wins all round, really.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_file.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f6f0e9671919..79ffb3e74f49 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -659,6 +659,8 @@ xfs_file_dio_aio_write(
 	int			iolock;
 	size_t			count = iov_iter_count(from);
 	loff_t			pos = iocb->ki_pos;
+	loff_t			end;
+	struct iov_iter		data;
 	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
 					mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -698,10 +700,11 @@ xfs_file_dio_aio_write(
 	if (ret)
 		goto out;
 	iov_iter_truncate(from, count);
+	end = pos + count - 1;
 
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						    pos, pos + count - 1);
+						   pos, end);
 		if (ret)
 			goto out;
 		/*
@@ -711,7 +714,7 @@ xfs_file_dio_aio_write(
 		 */
 		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
 					pos >> PAGE_CACHE_SHIFT,
-					(pos + count - 1) >> PAGE_CACHE_SHIFT);
+					end >> PAGE_CACHE_SHIFT);
 		WARN_ON_ONCE(ret);
 		ret = 0;
 	}
@@ -728,8 +731,22 @@ xfs_file_dio_aio_write(
 	}
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-	ret = generic_file_direct_write(iocb, from, pos);
 
+	data = *from;
+	ret = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+
+	/* see generic_file_direct_write() for why this is necessary */
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT,
+					      end >> PAGE_CACHE_SHIFT);
+	}
+
+	if (ret > 0) {
+		pos += ret;
+		iov_iter_advance(from, ret);
+		iocb->ki_pos = pos;
+	}
 out:
 	xfs_rw_iunlock(ip, iolock);