From afca6c5b2595fc44383919fba740c194b0b76aff Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 17 Apr 2018 17:17:34 -0700
Subject: [PATCH 001/121] xfs: validate cached inodes are free when allocated

A recent fuzzed filesystem image cached random dcache corruption
when the reproducer was run. This often showed up as panics in
lookup_slow() on a null inode->i_ops pointer when doing pathwalks.

BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
....
Call Trace:
 lookup_slow+0x44/0x60
 walk_component+0x3dd/0x9f0
 link_path_walk+0x4a7/0x830
 path_lookupat+0xc1/0x470
 filename_lookup+0x129/0x270
 user_path_at_empty+0x36/0x40
 path_listxattr+0x98/0x110
 SyS_listxattr+0x13/0x20
 do_syscall_64+0xf5/0x280
 entry_SYSCALL_64_after_hwframe+0x42/0xb7

but had many different failure modes including deadlocks trying to
lock the inode that was just allocated or KASAN reports of
use-after-free violations.

The cause of the problem was a corrupt INOBT on a v4 fs where the
root inode was marked as free in the inobt record. Hence when we
allocated an inode, it chose the root inode to allocate, found it in
the cache and re-initialised it.

We recently fixed a similar inode allocation issue caused by inobt
record corruption problem in xfs_iget_cache_miss() in commit
ee457001ed6c ("xfs: catch inode allocation state mismatch
corruption"). This change adds similar checks to the cache-hit path
to catch it, and turns the reproducer into a corruption shutdown
situation.

Reported-by: Wen Xu <wen.xu@gatech.edu>
Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
[darrick: fix typos in comment]
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_icache.c | 73 +++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9a18f69f6e96..817899961f48 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -308,6 +308,46 @@ xfs_reinit_inode(
 	return error;
 }
 
+/*
+ * If we are allocating a new inode, then check what was returned is
+ * actually a free, empty inode. If we are not allocating an inode,
+ * then check we didn't find a free inode.
+ *
+ * Returns:
+ *	0		if the inode free state matches the lookup context
+ *	-ENOENT		if the inode is free and we are not allocating
+ *	-EFSCORRUPTED	if there is any state mismatch at all
+ */
+static int
+xfs_iget_check_free_state(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	if (flags & XFS_IGET_CREATE) {
+		/* should be a free inode */
+		if (VFS_I(ip)->i_mode != 0) {
+			xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
+				ip->i_ino, VFS_I(ip)->i_mode);
+			return -EFSCORRUPTED;
+		}
+
+		if (ip->i_d.di_nblocks != 0) {
+			xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx has blocks allocated!",
+				ip->i_ino);
+			return -EFSCORRUPTED;
+		}
+		return 0;
+	}
+
+	/* should be an allocated inode */
+	if (VFS_I(ip)->i_mode == 0)
+		return -ENOENT;
+
+	return 0;
+}
+
 /*
  * Check the validity of the inode we just found it the cache
  */
@@ -357,12 +397,12 @@ xfs_iget_cache_hit(
 	}
 
 	/*
-	 * If lookup is racing with unlink return an error immediately.
+	 * Check the inode free state is valid. This also detects lookup
+	 * racing with unlinks.
 	 */
-	if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-		error = -ENOENT;
+	error = xfs_iget_check_free_state(ip, flags);
+	if (error)
 		goto out_error;
-	}
 
 	/*
 	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
@@ -485,29 +525,12 @@ xfs_iget_cache_miss(
 
 
 	/*
-	 * If we are allocating a new inode, then check what was returned is
-	 * actually a free, empty inode. If we are not allocating an inode,
-	 * the check we didn't find a free inode.
+	 * Check the inode free state is valid. This also detects lookup
+	 * racing with unlinks.
 	 */
-	if (flags & XFS_IGET_CREATE) {
-		if (VFS_I(ip)->i_mode != 0) {
-			xfs_warn(mp,
-"Corruption detected! Free inode 0x%llx not marked free on disk",
-				ino);
-			error = -EFSCORRUPTED;
-			goto out_destroy;
-		}
-		if (ip->i_d.di_nblocks != 0) {
-			xfs_warn(mp,
-"Corruption detected! Free inode 0x%llx has blocks allocated!",
-				ino);
-			error = -EFSCORRUPTED;
-			goto out_destroy;
-		}
-	} else if (VFS_I(ip)->i_mode == 0) {
-		error = -ENOENT;
+	error = xfs_iget_check_free_state(ip, flags);
+	if (error)
 		goto out_destroy;
-	}
 
 	/*
 	 * Preload the radix tree so we can insert safely under the

From 8b26984dbd813b1f63267baa258c9932e7f6c835 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 17 Apr 2018 17:17:35 -0700
Subject: [PATCH 002/121] xfs: validate allocated inode number

When we have corrupted free inode btrees, we can attempt to
allocate inodes that we know are already allocated. Catch allocation
of these inodes and report corruption as early as possible to
prevent corruption propagation or deadlocks.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2b70c8b4cee2..3cdd4fa37947 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -790,6 +790,18 @@ xfs_ialloc(
 	}
 	ASSERT(*ialloc_context == NULL);
 
+	/*
+	 * Protect against obviously corrupt allocation btree records. Later
+	 * xfs_iget checks will catch re-allocation of other active in-memory
+	 * and on-disk inodes. If we don't catch reallocating the parent inode
+	 * here we will deadlock in xfs_iget() so we have to do these checks
+	 * first.
+	 */
+	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
+		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+		return -EFSCORRUPTED;
+	}
+
 	/*
 	 * Get the in-core inode with the lock held exclusively.
 	 * This is because we're setting fields here we need

From e443523d191041df942750efddc971efb803f827 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 17 Apr 2018 17:17:35 -0700
Subject: [PATCH 003/121] xfs: trace ATTR flags in xattr tracepoints

This will trace i.e. the ATTR_SECURE/ATTR_CREATE/ATTR_REPLACE
flags as well as the OP_FLAGS.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_trace.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8955254b900e..21d494f30ea7 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1750,6 +1750,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
 		__field(int, namelen)
 		__field(int, valuelen)
 		__field(xfs_dahash_t, hashval)
+		__field(int, flags)
 		__field(int, op_flags)
 	),
 	TP_fast_assign(
@@ -1760,10 +1761,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
 		__entry->namelen = args->namelen;
 		__entry->valuelen = args->valuelen;
 		__entry->hashval = args->hashval;
+		__entry->flags = args->flags;
 		__entry->op_flags = args->op_flags;
 	),
 	TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
-		  "hashval 0x%x op_flags %s",
+		  "hashval 0x%x flags %s op_flags %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->namelen,
@@ -1771,6 +1773,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
 		  __entry->namelen,
 		  __entry->valuelen,
 		  __entry->hashval,
+		  __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
 		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
 )
 

From 8925a3dc4771004b3e697e7159fa87be2aa5dd43 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 Apr 2018 08:25:20 -0700
Subject: [PATCH 004/121] xfs: make xfs_buf_incore out of line

Move xfs_buf_incore out of line and make it the only way to look up
a buffer in the buffer cache from outside the buffer cache. Convert
the external users of _xfs_buf_find() to xfs_buf_incore() and make
_xfs_buf_find() static.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
[darrick: actually rename xfs_incore -> xfs_buf_incore]
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_attr_remote.c |  2 +-
 fs/xfs/xfs_buf.c                | 21 ++++++++++++++++-----
 fs/xfs/xfs_buf.h                | 17 +++--------------
 fs/xfs/xfs_qm.c                 |  5 ++---
 4 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 21be186067a2..83a6d3c7f872 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -620,7 +620,7 @@ xfs_attr_rmtval_remove(
 		/*
 		 * If the "remote" value is in the cache, remove it.
 		 */
-		bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+		bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
 		if (bp) {
 			xfs_buf_stale(bp);
 			xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 55661cbdb51b..349d42fde1d7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -549,17 +549,17 @@ xfs_buf_hash_destroy(
 }
 
 /*
- *	Look up, and creates if absent, a lockable buffer for
- *	a given range of an inode.  The buffer is returned
- *	locked.	No I/O is implied by this call.
+ * Look up (and insert if absent), a lockable buffer for a given
+ * range of an inode.  The buffer is returned locked. No I/O is
+ * implied by this call.
  */
-xfs_buf_t *
+static struct xfs_buf *
 _xfs_buf_find(
 	struct xfs_buftarg	*btp,
 	struct xfs_buf_map	*map,
 	int			nmaps,
 	xfs_buf_flags_t		flags,
-	xfs_buf_t		*new_bp)
+	struct xfs_buf		*new_bp)
 {
 	struct xfs_perag	*pag;
 	xfs_buf_t		*bp;
@@ -649,6 +649,17 @@ found:
 	return bp;
 }
 
+struct xfs_buf *
+xfs_buf_incore(
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	xfs_buf_flags_t		flags)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return _xfs_buf_find(target, &map, 1, flags, NULL);
+}
+
 /*
  * Assembles a buffer covering the specified range. The code is optimised for
  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index edced162a674..830e2f6c064a 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -218,20 +218,9 @@ typedef struct xfs_buf {
 } xfs_buf_t;
 
 /* Finding and Reading Buffers */
-struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
-			      struct xfs_buf_map *map, int nmaps,
-			      xfs_buf_flags_t flags, struct xfs_buf *new_bp);
-
-static inline struct xfs_buf *
-xfs_incore(
-	struct xfs_buftarg	*target,
-	xfs_daddr_t		blkno,
-	size_t			numblks,
-	xfs_buf_flags_t		flags)
-{
-	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-	return _xfs_buf_find(target, &map, 1, flags, NULL);
-}
+struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
+			   xfs_daddr_t blkno, size_t numblks,
+			   xfs_buf_flags_t flags);
 
 struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target,
 			       struct xfs_buf_map *map, int nmaps,
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index ec39ae274c78..afb991283b12 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1247,9 +1247,8 @@ xfs_qm_flush_one(
 	 */
 	if (!xfs_dqflock_nowait(dqp)) {
 		/* buf is pinned in-core by delwri list */
-		DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno,
-				      mp->m_quotainfo->qi_dqchunklen);
-		bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
+		bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
+				mp->m_quotainfo->qi_dqchunklen, 0);
 		if (!bp) {
 			error = -EINVAL;
 			goto out_unlock;

From b027d4c97b9675c2ad75dec94be4e46dceb3ec74 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 18 Apr 2018 08:25:21 -0700
Subject: [PATCH 005/121] xfs: don't retry xfs_buf_find on XBF_TRYLOCK failure

When looking at an event trace recently, I noticed that non-blocking
buffer lookup attempts would fail on cached locked buffers and then
run the slow cache-miss path. This means we are doing an xfs_buf
allocation, lookup and free unnecessarily every time we avoid
blocking on a locked buffer.

Fix this by changing _xfs_buf_find() to return an error status to
the caller to indicate that we failed the lock attempt rather than
just returning a NULL. This allows the higher level code to
discriminate between a cache miss and an cache hit that we failed to
lock.

This also allows us to return a -EFSCORRUPTED state if we are asked
to look up a block number outside the range of the filesystem in
_xfs_buf_find(), which moves us one step closer to being able to
handle such errors in a more graceful manner at the higher levels.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_buf.c | 93 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 28 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 349d42fde1d7..5179ab9e3d6a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -549,17 +549,31 @@ xfs_buf_hash_destroy(
 }
 
 /*
- * Look up (and insert if absent), a lockable buffer for a given
- * range of an inode.  The buffer is returned locked. No I/O is
- * implied by this call.
+ * Look up a buffer in the buffer cache and return it referenced and locked
+ * in @found_bp.
+ *
+ * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
+ * cache.
+ *
+ * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
+ * -EAGAIN if we fail to lock it.
+ *
+ * Return values are:
+ *	-EFSCORRUPTED if have been supplied with an invalid address
+ *	-EAGAIN on trylock failure
+ *	-ENOENT if we fail to find a match and @new_bp was NULL
+ *	0, with @found_bp:
+ *		- @new_bp if we inserted it into the cache
+ *		- the buffer we found and locked.
  */
-static struct xfs_buf *
-_xfs_buf_find(
+static int
+xfs_buf_find(
 	struct xfs_buftarg	*btp,
 	struct xfs_buf_map	*map,
 	int			nmaps,
 	xfs_buf_flags_t		flags,
-	struct xfs_buf		*new_bp)
+	struct xfs_buf		*new_bp,
+	struct xfs_buf		**found_bp)
 {
 	struct xfs_perag	*pag;
 	xfs_buf_t		*bp;
@@ -567,6 +581,8 @@ _xfs_buf_find(
 	xfs_daddr_t		eofs;
 	int			i;
 
+	*found_bp = NULL;
+
 	for (i = 0; i < nmaps; i++)
 		cmap.bm_len += map[i].bm_len;
 
@@ -580,16 +596,11 @@ _xfs_buf_find(
 	 */
 	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
 	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
-		/*
-		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
-		 * but none of the higher level infrastructure supports
-		 * returning a specific error on buffer lookup failures.
-		 */
 		xfs_alert(btp->bt_mount,
 			  "%s: daddr 0x%llx out of range, EOFS 0x%llx",
 			  __func__, cmap.bm_bn, eofs);
 		WARN_ON(1);
-		return NULL;
+		return -EFSCORRUPTED;
 	}
 
 	pag = xfs_perag_get(btp->bt_mount,
@@ -604,19 +615,20 @@ _xfs_buf_find(
 	}
 
 	/* No match found */
-	if (new_bp) {
-		/* the buffer keeps the perag reference until it is freed */
-		new_bp->b_pag = pag;
-		rhashtable_insert_fast(&pag->pag_buf_hash,
-				       &new_bp->b_rhash_head,
-				       xfs_buf_hash_params);
-		spin_unlock(&pag->pag_buf_lock);
-	} else {
+	if (!new_bp) {
 		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
 		spin_unlock(&pag->pag_buf_lock);
 		xfs_perag_put(pag);
+		return -ENOENT;
 	}
-	return new_bp;
+
+	/* the buffer keeps the perag reference until it is freed */
+	new_bp->b_pag = pag;
+	rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
+			       xfs_buf_hash_params);
+	spin_unlock(&pag->pag_buf_lock);
+	*found_bp = new_bp;
+	return 0;
 
 found:
 	spin_unlock(&pag->pag_buf_lock);
@@ -626,7 +638,7 @@ found:
 		if (flags & XBF_TRYLOCK) {
 			xfs_buf_rele(bp);
 			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
-			return NULL;
+			return -EAGAIN;
 		}
 		xfs_buf_lock(bp);
 		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
@@ -646,7 +658,8 @@ found:
 
 	trace_xfs_buf_find(bp, flags, _RET_IP_);
 	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
-	return bp;
+	*found_bp = bp;
+	return 0;
 }
 
 struct xfs_buf *
@@ -656,8 +669,14 @@ xfs_buf_incore(
 	size_t			numblks,
 	xfs_buf_flags_t		flags)
 {
+	struct xfs_buf		*bp;
+	int			error;
 	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-	return _xfs_buf_find(target, &map, 1, flags, NULL);
+
+	error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
+	if (error)
+		return NULL;
+	return bp;
 }
 
 /*
@@ -676,9 +695,27 @@ xfs_buf_get_map(
 	struct xfs_buf		*new_bp;
 	int			error = 0;
 
-	bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
-	if (likely(bp))
+	error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
+
+	switch (error) {
+	case 0:
+		/* cache hit */
 		goto found;
+	case -EAGAIN:
+		/* cache hit, trylock failure, caller handles failure */
+		ASSERT(flags & XBF_TRYLOCK);
+		return NULL;
+	case -ENOENT:
+		/* cache miss, go for insert */
+		break;
+	case -EFSCORRUPTED:
+	default:
+		/*
+		 * None of the higher layers understand failure types
+		 * yet, so return NULL to signal a fatal lookup error.
+		 */
+		return NULL;
+	}
 
 	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
 	if (unlikely(!new_bp))
@@ -690,8 +727,8 @@ xfs_buf_get_map(
 		return NULL;
 	}
 
-	bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
-	if (!bp) {
+	error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
+	if (error) {
 		xfs_buf_free(new_bp);
 		return NULL;
 	}

From ed5c3e66a32883e2b3d119d358d23fd5990dc9c2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 2 May 2018 12:54:52 -0700
Subject: [PATCH 006/121] xfs: move generic_write_sync calls inwards

To prepare for iomap iinfrastructure based DSYNC optimisations.

While moving the code araound, move the XFS write bytes metric
update for direct IO into xfs_dio_write_end_io callback so that we
always capture the amount of data written via AIO+DIO. This fixes
the problem where queued AIO+DIO writes are not accounted to this
metric.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_file.c | 48 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e70fb8ccecea..e03e084b6819 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -414,6 +414,12 @@ xfs_dio_write_end_io(
 	if (size <= 0)
 		return size;
 
+	/*
+	 * Capture amount written on completion as we can't reliably account
+	 * for it on submission.
+	 */
+	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
+
 	if (flags & IOMAP_DIO_COW) {
 		error = xfs_reflink_end_cow(ip, offset, size);
 		if (error)
@@ -562,6 +568,11 @@ out:
 	 * complete fully or fail.
 	 */
 	ASSERT(ret < 0 || ret == count);
+
+	if (ret > 0) {
+		/* Handle various SYNC-type writes */
+		ret = generic_write_sync(iocb, ret);
+	}
 	return ret;
 }
 
@@ -599,7 +610,16 @@ xfs_file_dax_write(
 	}
 out:
 	xfs_iunlock(ip, iolock);
-	return error ? error : ret;
+	if (error)
+		return error;
+
+	if (ret > 0) {
+		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+
+		/* Handle various SYNC-type writes */
+		ret = generic_write_sync(iocb, ret);
+	}
+	return ret;
 }
 
 STATIC ssize_t
@@ -669,6 +689,12 @@ write_retry:
 out:
 	if (iolock)
 		xfs_iunlock(ip, iolock);
+
+	if (ret > 0) {
+		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+		/* Handle various SYNC-type writes */
+		ret = generic_write_sync(iocb, ret);
+	}
 	return ret;
 }
 
@@ -693,8 +719,9 @@ xfs_file_write_iter(
 		return -EIO;
 
 	if (IS_DAX(inode))
-		ret = xfs_file_dax_write(iocb, from);
-	else if (iocb->ki_flags & IOCB_DIRECT) {
+		return xfs_file_dax_write(iocb, from);
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
 		/*
 		 * Allow a directio write to fall back to a buffered
 		 * write *only* in the case that we're doing a reflink
@@ -702,20 +729,11 @@ xfs_file_write_iter(
 		 * allow an operation to fall back to buffered mode.
 		 */
 		ret = xfs_file_dio_aio_write(iocb, from);
-		if (ret == -EREMCHG)
-			goto buffered;
-	} else {
-buffered:
-		ret = xfs_file_buffered_aio_write(iocb, from);
+		if (ret != -EREMCHG)
+			return ret;
 	}
 
-	if (ret > 0) {
-		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
-
-		/* Handle various SYNC-type writes */
-		ret = generic_write_sync(iocb, ret);
-	}
-	return ret;
+	return xfs_file_buffered_aio_write(iocb, from);
 }
 
 #define	XFS_FALLOC_FL_SUPPORTED						\

From 4f8ff44ba0ad82a6f51c1bf381d7bad346464b09 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 2 May 2018 12:54:52 -0700
Subject: [PATCH 007/121] iomap: iomap_dio_rw() handles all sync writes

Currently iomap_dio_rw() only handles (data)sync write completions
for AIO. This means we can't optimised non-AIO IO to minimise device
flushes as we can't tell the caller whether a flush is required or
not.

To solve this problem and enable further optimisations, make
iomap_dio_rw responsible for data sync behaviour for all IO, not
just AIO.

In doing so, the sync operation is now accounted as part of the DIO
IO by inode_dio_end(), hence post-IO data stability updates will no
long race against operations that serialise via inode_dio_wait()
such as truncate or hole punch.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c        | 21 +++++++++++++++------
 fs/xfs/xfs_file.c |  5 -----
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index afd163586aa0..b044d8ee2efd 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -685,6 +685,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_data);
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_NEED_SYNC	(1 << 29)
 #define IOMAP_DIO_WRITE		(1 << 30)
 #define IOMAP_DIO_DIRTY		(1 << 31)
 
@@ -759,6 +760,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 			dio_warn_stale_pagecache(iocb->ki_filp);
 	}
 
+	/*
+	 * If this is a DSYNC write, make sure we push it to stable storage now
+	 * that we've written data.
+	 */
+	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+		ret = generic_write_sync(iocb, ret);
+
 	inode_dio_end(file_inode(iocb->ki_filp));
 	kfree(dio);
 
@@ -769,13 +777,8 @@ static void iomap_dio_complete_work(struct work_struct *work)
 {
 	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
 	struct kiocb *iocb = dio->iocb;
-	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
-	ssize_t ret;
 
-	ret = iomap_dio_complete(dio);
-	if (is_write && ret > 0)
-		ret = generic_write_sync(iocb, ret);
-	iocb->ki_complete(iocb, ret, 0);
+	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
 }
 
 /*
@@ -961,6 +964,10 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 	return copied;
 }
 
+/*
+ * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
+ * is being issued as AIO or not.
+ */
 ssize_t
 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
@@ -1006,6 +1013,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			dio->flags |= IOMAP_DIO_DIRTY;
 	} else {
 		dio->flags |= IOMAP_DIO_WRITE;
+		if (iocb->ki_flags & IOCB_DSYNC)
+			dio->flags |= IOMAP_DIO_NEED_SYNC;
 		flags |= IOMAP_WRITE;
 	}
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e03e084b6819..28be51908254 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -568,11 +568,6 @@ out:
 	 * complete fully or fail.
 	 */
 	ASSERT(ret < 0 || ret == count);
-
-	if (ret > 0) {
-		/* Handle various SYNC-type writes */
-		ret = generic_write_sync(iocb, ret);
-	}
 	return ret;
 }
 

From 3460cac1ca76215a60acb086ebe97b3e50731628 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 2 May 2018 12:54:53 -0700
Subject: [PATCH 008/121] iomap: Use FUA for pure data O_DSYNC DIO writes

If we are doing direct IO writes with datasync semantics, we often
have to flush metadata changes along with the data write. However,
if we are overwriting existing data, there are no metadata changes
that we need to flush. In this case, optimising the IO by using
FUA write makes sense.

We know from the IOMAP_F_DIRTY flag as to whether a specific inode
requires a metadata flush - this is currently used by DAX to ensure
extent modification as stable in page fault operations. For direct
IO writes, we can use it to determine if we need to flush metadata
or not once the data is on disk.

Hence if we have been returned a mapped extent that is not new and
the IO mapping is not dirty, then we can use a FUA write to provide
datasync semantics. This allows us to short-cut the
generic_write_sync() call in IO completion and hence avoid
unnecessary operations. This makes pure direct IO data write
behaviour identical to the way block devices use REQ_FUA to provide
datasync semantics.

On a FUA enabled device, a synchronous direct IO write workload
(sequential 4k overwrites in 32MB file) had the following results:

# xfs_io -fd -c "pwrite -V 1 -D 0 32m" /mnt/scratch/boo

kernel		time	write()s	write iops	Write b/w
------		----	--------	----------	---------
(no dsync)	 4s	2173/s		2173		8.5MB/s
vanilla		22s	 370/s		 750		1.4MB/s
patched		19s	 420/s		 420		1.6MB/s

The patched code clearly doesn't send cache flushes anymore, but
instead uses FUA (confirmed via blktrace), and performance improves
a bit as a result. However, the benefits will be higher on workloads
that mix O_DSYNC overwrites with other write IO as we won't be
flushing the entire device cache on every DSYNC overwrite IO
anymore.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index b044d8ee2efd..bcfd7f3654d4 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -685,6 +685,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_data);
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_WRITE_FUA	(1 << 28)
 #define IOMAP_DIO_NEED_SYNC	(1 << 29)
 #define IOMAP_DIO_WRITE		(1 << 30)
 #define IOMAP_DIO_DIRTY		(1 << 31)
@@ -861,6 +862,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 	struct iov_iter iter;
 	struct bio *bio;
 	bool need_zeroout = false;
+	bool use_fua = false;
 	int nr_pages, ret;
 	size_t copied = 0;
 
@@ -884,8 +886,20 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 	case IOMAP_MAPPED:
 		if (iomap->flags & IOMAP_F_SHARED)
 			dio->flags |= IOMAP_DIO_COW;
-		if (iomap->flags & IOMAP_F_NEW)
+		if (iomap->flags & IOMAP_F_NEW) {
 			need_zeroout = true;
+		} else {
+			/*
+			 * Use a FUA write if we need datasync semantics, this
+			 * is a pure data IO that doesn't require any metadata
+			 * updates and the underlying device supports FUA. This
+			 * allows us to avoid cache flushes on IO completion.
+			 */
+			if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+			    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+			    blk_queue_fua(bdev_get_queue(iomap->bdev)))
+				use_fua = true;
+		}
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -933,10 +947,14 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 		n = bio->bi_iter.bi_size;
 		if (dio->flags & IOMAP_DIO_WRITE) {
-			bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+			bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+			if (use_fua)
+				bio->bi_opf |= REQ_FUA;
+			else
+				dio->flags &= ~IOMAP_DIO_WRITE_FUA;
 			task_io_account_write(n);
 		} else {
-			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+			bio->bi_opf = REQ_OP_READ;
 			if (dio->flags & IOMAP_DIO_DIRTY)
 				bio_set_pages_dirty(bio);
 		}
@@ -966,7 +984,12 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 /*
  * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
- * is being issued as AIO or not.
+ * is being issued as AIO or not.  This allows us to optimise pure data writes
+ * to use REQ_FUA rather than requiring generic_write_sync() to issue a
+ * REQ_FLUSH post write. This is slightly tricky because a single request here
+ * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
  */
 ssize_t
 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
@@ -1012,10 +1035,21 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		if (iter->type == ITER_IOVEC)
 			dio->flags |= IOMAP_DIO_DIRTY;
 	} else {
+		flags |= IOMAP_WRITE;
 		dio->flags |= IOMAP_DIO_WRITE;
+
+		/* for data sync or sync, we need sync completion processing */
 		if (iocb->ki_flags & IOCB_DSYNC)
 			dio->flags |= IOMAP_DIO_NEED_SYNC;
-		flags |= IOMAP_WRITE;
+
+		/*
+		 * For datasync only writes, we optimistically try using FUA for
+		 * this IO.  Any non-FUA write that occurs will clear this flag,
+		 * hence we know before completion whether a cache flush is
+		 * necessary.
+		 */
+		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
+			dio->flags |= IOMAP_DIO_WRITE_FUA;
 	}
 
 	if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -1071,6 +1105,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (ret < 0)
 		iomap_dio_set_error(dio, ret);
 
+	/*
+	 * If all the writes we issued were FUA, we don't need to flush the
+	 * cache on IO completion. Clear the sync flag for this case.
+	 */
+	if (dio->flags & IOMAP_DIO_WRITE_FUA)
+		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+
 	if (!atomic_dec_and_test(&dio->ref)) {
 		if (!is_sync_kiocb(iocb))
 			return -EIOCBQUEUED;

From d0641780940a9842cbf62114bc8422012c7d88c4 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 2 May 2018 12:54:53 -0700
Subject: [PATCH 009/121] xfs: simplify xfs_file_iomap_begin() logic

The current logic that determines whether allocation should be done
has grown somewhat spaghetti like with the addition of IOMAP_NOWAIT
functionality. Separate out each of the different cases into single,
obvious checks to get rid most of the nested IOMAP_NOWAIT checks
in the allocation logic.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_iomap.c | 82 ++++++++++++++++++++++++++--------------------
 1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 046469fcc1b8..16565da67bb6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1040,6 +1040,10 @@ xfs_file_iomap_begin(
 			goto out_unlock;
 	}
 
+	/* Non-modifying mapping requested, so we are done */
+	if (!(flags & (IOMAP_WRITE | IOMAP_ZERO)))
+		goto out_found;
+
 	if (xfs_is_reflink_inode(ip) &&
 	    ((flags & IOMAP_WRITE) ||
 	     ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) {
@@ -1068,46 +1072,45 @@ xfs_file_iomap_begin(
 		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
 	}
 
-	if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
-		/*
-		 * If nowait is set bail since we are going to make
-		 * allocations.
-		 */
-		if (flags & IOMAP_NOWAIT) {
-			error = -EAGAIN;
-			goto out_unlock;
-		}
-		/*
-		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
-		 * pages to keep the chunks of work done where somewhat symmetric
-		 * with the work writeback does. This is a completely arbitrary
-		 * number pulled out of thin air as a best guess for initial
-		 * testing.
-		 *
-		 * Note that the values needs to be less than 32-bits wide until
-		 * the lower level functions are updated.
-		 */
-		length = min_t(loff_t, length, 1024 * PAGE_SIZE);
-		/*
-		 * xfs_iomap_write_direct() expects the shared lock. It
-		 * is unlocked on return.
-		 */
-		if (lockmode == XFS_ILOCK_EXCL)
-			xfs_ilock_demote(ip, lockmode);
-		error = xfs_iomap_write_direct(ip, offset, length, &imap,
-				nimaps);
-		if (error)
-			return error;
+	/* Don't need to allocate over holes when doing zeroing operations. */
+	if (flags & IOMAP_ZERO)
+		goto out_found;
 
-		iomap->flags = IOMAP_F_NEW;
-		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
-	} else {
-		ASSERT(nimaps);
+	if (!imap_needs_alloc(inode, &imap, nimaps))
+		goto out_found;
 
-		xfs_iunlock(ip, lockmode);
-		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+	/* If nowait is set bail since we are going to make allocations. */
+	if (flags & IOMAP_NOWAIT) {
+		error = -EAGAIN;
+		goto out_unlock;
 	}
 
+	/*
+	 * We cap the maximum length we map to a sane size  to keep the chunks
+	 * of work done where somewhat symmetric with the work writeback does.
+	 * This is a completely arbitrary number pulled out of thin air as a
+	 * best guess for initial testing.
+	 *
+	 * Note that the values needs to be less than 32-bits wide until the
+	 * lower level functions are updated.
+	 */
+	length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+
+	/*
+	 * xfs_iomap_write_direct() expects the shared lock. It is unlocked on
+	 * return.
+	 */
+	if (lockmode == XFS_ILOCK_EXCL)
+		xfs_ilock_demote(ip, lockmode);
+	error = xfs_iomap_write_direct(ip, offset, length, &imap,
+			nimaps);
+	if (error)
+		return error;
+
+	iomap->flags = IOMAP_F_NEW;
+	trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+
+out_finish:
 	if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
 				& ~XFS_ILOG_TIMESTAMP))
 		iomap->flags |= IOMAP_F_DIRTY;
@@ -1117,6 +1120,13 @@ xfs_file_iomap_begin(
 	if (shared)
 		iomap->flags |= IOMAP_F_SHARED;
 	return 0;
+
+out_found:
+	ASSERT(nimaps);
+	xfs_iunlock(ip, lockmode);
+	trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+	goto out_finish;
+
 out_unlock:
 	xfs_iunlock(ip, lockmode);
 	return error;

From dfa03a5f8076c6bf79a9f68ef8f559fb8999cec1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 2 May 2018 12:54:54 -0700
Subject: [PATCH 010/121] xfs: clean up locking in xfs_file_iomap_begin

Rather than checking what kind of locking is needed in a helper
function and then jumping through hoops to do the locking in line,
move the locking to the helper function that does all the checks
and rename it to xfs_ilock_for_iomap().

This also allows us to hoist all the nonblocking checks up into the
locking helper, further simplifier the code flow in
xfs_file_iomap_begin() and making it easier to understand.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_iomap.c | 98 +++++++++++++++++++++++++++++-----------------
 1 file changed, 61 insertions(+), 37 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 16565da67bb6..d03e65f01c89 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -946,8 +946,11 @@ error_on_bmapi_transaction:
 	return error;
 }
 
-static inline bool imap_needs_alloc(struct inode *inode,
-		struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool
+imap_needs_alloc(
+	struct inode		*inode,
+	struct xfs_bmbt_irec	*imap,
+	int			nimaps)
 {
 	return !nimaps ||
 		imap->br_startblock == HOLESTARTBLOCK ||
@@ -955,31 +958,58 @@ static inline bool imap_needs_alloc(struct inode *inode,
 		(IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
 }
 
-static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool
+needs_cow_for_zeroing(
+	struct xfs_bmbt_irec	*imap,
+	int			nimaps)
 {
 	return nimaps &&
 		imap->br_startblock != HOLESTARTBLOCK &&
 		imap->br_state != XFS_EXT_UNWRITTEN;
 }
 
-static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
+static int
+xfs_ilock_for_iomap(
+	struct xfs_inode	*ip,
+	unsigned		flags,
+	unsigned		*lockmode)
 {
+	unsigned		mode = XFS_ILOCK_SHARED;
+
 	/*
 	 * COW writes may allocate delalloc space or convert unwritten COW
 	 * extents, so we need to make sure to take the lock exclusively here.
 	 */
-	if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
-		return true;
+	if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) {
+		/*
+		 * FIXME: It could still overwrite on unshared extents and not
+		 * need allocation.
+		 */
+		if (flags & IOMAP_NOWAIT)
+			return -EAGAIN;
+		mode = XFS_ILOCK_EXCL;
+	}
 
 	/*
-	 * Extents not yet cached requires exclusive access, don't block.
-	 * This is an opencoded xfs_ilock_data_map_shared() to cater for the
+	 * Extents not yet cached requires exclusive access, don't block.  This
+	 * is an opencoded xfs_ilock_data_map_shared() call but with
 	 * non-blocking behaviour.
 	 */
-	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-	    !(ip->i_df.if_flags & XFS_IFEXTENTS))
-		return true;
-	return false;
+	if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+		if (flags & IOMAP_NOWAIT)
+			return -EAGAIN;
+		mode = XFS_ILOCK_EXCL;
+	}
+
+	if (flags & IOMAP_NOWAIT) {
+		if (!xfs_ilock_nowait(ip, mode))
+			return -EAGAIN;
+	} else {
+		xfs_ilock(ip, mode);
+	}
+
+	*lockmode = mode;
+	return 0;
 }
 
 static int
@@ -1007,19 +1037,15 @@ xfs_file_iomap_begin(
 		return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
 	}
 
-	if (need_excl_ilock(ip, flags))
-		lockmode = XFS_ILOCK_EXCL;
-	else
-		lockmode = XFS_ILOCK_SHARED;
-
-	if (flags & IOMAP_NOWAIT) {
-		if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
-			return -EAGAIN;
-		if (!xfs_ilock_nowait(ip, lockmode))
-			return -EAGAIN;
-	} else {
-		xfs_ilock(ip, lockmode);
-	}
+	/*
+	 * Lock the inode in the manner required for the specified operation and
+	 * check for as many conditions that would result in blocking as
+	 * possible. This removes most of the non-blocking checks from the
+	 * mapping code below.
+	 */
+	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+	if (error)
+		return error;
 
 	ASSERT(offset <= mp->m_super->s_maxbytes);
 	if (offset > mp->m_super->s_maxbytes - length)
@@ -1044,19 +1070,17 @@ xfs_file_iomap_begin(
 	if (!(flags & (IOMAP_WRITE | IOMAP_ZERO)))
 		goto out_found;
 
-	if (xfs_is_reflink_inode(ip) &&
-	    ((flags & IOMAP_WRITE) ||
-	     ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) {
+	/*
+	 * Break shared extents if necessary. Checks for non-blocking IO have
+	 * been done up front, so we don't need to do them here.
+	 */
+	if (xfs_is_reflink_inode(ip)) {
+		/* if zeroing doesn't need COW allocation, then we are done. */
+		if ((flags & IOMAP_ZERO) &&
+		    !needs_cow_for_zeroing(&imap, nimaps))
+			goto out_found;
+
 		if (flags & IOMAP_DIRECT) {
-			/*
-			 * A reflinked inode will result in CoW alloc.
-			 * FIXME: It could still overwrite on unshared extents
-			 * and not need allocation.
-			 */
-			if (flags & IOMAP_NOWAIT) {
-				error = -EAGAIN;
-				goto out_unlock;
-			}
 			/* may drop and re-acquire the ilock */
 			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
 					&lockmode);

From e381a0f6c28a3f2a452d5fba9b917f03e5dc4ffb Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 4 May 2018 15:15:48 -0700
Subject: [PATCH 011/121] xfs: remove unused flags arg from xfs_dquot_verify

Long ago the flags argument was used to determine whether to issue warnings
about corruptions, but that's done elsewhere now and the flag is unused
here, so remove it.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_dquot_buf.c  | 5 ++---
 fs/xfs/libxfs/xfs_quota_defs.h | 3 +--
 fs/xfs/xfs_dquot.c             | 2 +-
 fs/xfs/xfs_log_recover.c       | 4 ++--
 fs/xfs/xfs_qm.c                | 2 +-
 5 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 8b7a6c3cb599..a926058f7b0c 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -47,8 +47,7 @@ xfs_dquot_verify(
 	struct xfs_mount *mp,
 	xfs_disk_dquot_t *ddq,
 	xfs_dqid_t	 id,
-	uint		 type,	  /* used only when IO_dorepair is true */
-	uint		 flags)
+	uint		 type)	  /* used only when IO_dorepair is true */
 {
 	/*
 	 * We can encounter an uninitialized dquot buffer for 2 reasons:
@@ -200,7 +199,7 @@ xfs_dquot_buf_verify(
 		if (i == 0)
 			id = be32_to_cpu(ddq->d_id);
 
-		fa = xfs_dquot_verify(mp, ddq, id + i, 0, 0);
+		fa = xfs_dquot_verify(mp, ddq, id + i, 0);
 		if (fa)
 			return fa;
 	}
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index bb1b13a9b5f4..8433656af0da 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -152,8 +152,7 @@ typedef uint16_t	xfs_qwarncnt_t;
 #define XFS_QMOPT_RESBLK_MASK	(XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
 
 extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
-		struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type,
-		uint flags);
+		struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type);
 extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
 extern int xfs_dquot_repair(struct xfs_mount *mp, struct xfs_disk_dquot *ddq,
 		xfs_dqid_t id, uint type);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a7daef9e16bf..8d378f485260 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1001,7 +1001,7 @@ xfs_qm_dqflush(
 	/*
 	 * A simple sanity check in case we got a corrupted dquot..
 	 */
-	fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 0);
+	fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0);
 	if (fa) {
 		xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
 				be32_to_cpu(ddqp->d_id), fa);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 2b2383f1895e..06a09cb948b5 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2702,7 +2702,7 @@ xlog_recover_do_reg_buffer(
 				goto next;
 			}
 			fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
-					       -1, 0, 0);
+					       -1, 0);
 			if (fa) {
 				xfs_alert(mp,
 	"dquot corrupt at %pS trying to replay into block 0x%llx",
@@ -3348,7 +3348,7 @@ xlog_recover_dquot_pass2(
 	 */
 	dq_f = item->ri_buf[0].i_addr;
 	ASSERT(dq_f);
-	fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0, 0);
+	fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
 	if (fa) {
 		xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
 				dq_f->qlf_id, fa);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index afb991283b12..e641a507d0ed 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -865,7 +865,7 @@ xfs_qm_reset_dqcounts(
 		 * find uninitialised dquot blks. See comment in
 		 * xfs_dquot_verify.
 		 */
-		fa = xfs_dquot_verify(mp, ddq, id + j, type, 0);
+		fa = xfs_dquot_verify(mp, ddq, id + j, type);
 		if (fa)
 			xfs_dquot_repair(mp, ddq, id + j, type);
 

From 57ab324553bbfedc8e732eb570edfac0f5cfe57e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Mon, 7 May 2018 09:20:17 -0700
Subject: [PATCH 012/121] xfs: check type in quota verifier during quotacheck

During quotacheck we send in the quota type, so verify that as well.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_dquot_buf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index a926058f7b0c..c00174f32f0d 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -47,7 +47,7 @@ xfs_dquot_verify(
 	struct xfs_mount *mp,
 	xfs_disk_dquot_t *ddq,
 	xfs_dqid_t	 id,
-	uint		 type)	  /* used only when IO_dorepair is true */
+	uint		 type)	  /* used only during quotacheck */
 {
 	/*
 	 * We can encounter an uninitialized dquot buffer for 2 reasons:
@@ -69,6 +69,8 @@ xfs_dquot_verify(
 	if (ddq->d_version != XFS_DQUOT_VERSION)
 		return __this_address;
 
+	if (type && ddq->d_flags != type)
+		return __this_address;
 	if (ddq->d_flags != XFS_DQ_USER &&
 	    ddq->d_flags != XFS_DQ_PROJ &&
 	    ddq->d_flags != XFS_DQ_GROUP)

From 48fa1db87f730da1aed2d3df0cc8c33c7c133b4b Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Mon, 7 May 2018 09:20:17 -0700
Subject: [PATCH 013/121] xfs: pass full xfs_dqblk to repair during quotacheck

It's a bit dicey to pass in the smaller xfs_disk_dquot and then cast it to
something larger; pass in the full xfs_dqblk so we know the caller has sent
us the right thing.  Rename the function to xfs_dqblk_repair for
clarity.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_dquot_buf.c  | 21 +++++++++------------
 fs/xfs/libxfs/xfs_quota_defs.h |  2 +-
 fs/xfs/xfs_qm.c                |  2 +-
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index c00174f32f0d..3b92427883fa 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -104,29 +104,26 @@ xfs_dquot_verify(
  * Do some primitive error checking on ondisk dquot data structures.
  */
 int
-xfs_dquot_repair(
+xfs_dqblk_repair(
 	struct xfs_mount	*mp,
-	struct xfs_disk_dquot	*ddq,
+	struct xfs_dqblk	*dqb,
 	xfs_dqid_t		id,
 	uint			type)
 {
-	struct xfs_dqblk	*d = (struct xfs_dqblk *)ddq;
-
-
 	/*
 	 * Typically, a repair is only requested by quotacheck.
 	 */
 	ASSERT(id != -1);
-	memset(d, 0, sizeof(xfs_dqblk_t));
+	memset(dqb, 0, sizeof(xfs_dqblk_t));
 
-	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
-	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
-	d->dd_diskdq.d_flags = type;
-	d->dd_diskdq.d_id = cpu_to_be32(id);
+	dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+	dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+	dqb->dd_diskdq.d_flags = type;
+	dqb->dd_diskdq.d_id = cpu_to_be32(id);
 
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
-		xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+		uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid);
+		xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
 				 XFS_DQUOT_CRC_OFF);
 	}
 
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 8433656af0da..a2f8cb334bb3 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -154,7 +154,7 @@ typedef uint16_t	xfs_qwarncnt_t;
 extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
 		struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type);
 extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
-extern int xfs_dquot_repair(struct xfs_mount *mp, struct xfs_disk_dquot *ddq,
+extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
 		xfs_dqid_t id, uint type);
 
 #endif	/* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index e641a507d0ed..b016079971ee 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -867,7 +867,7 @@ xfs_qm_reset_dqcounts(
 		 */
 		fa = xfs_dquot_verify(mp, ddq, id + j, type);
 		if (fa)
-			xfs_dquot_repair(mp, ddq, id + j, type);
+			xfs_dqblk_repair(mp, &dqb[j], id + j, type);
 
 		/*
 		 * Reset type in case we are reusing group quota file for

From 7224fa482a6daa0558792e03a209e08d34690a26 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Mon, 7 May 2018 09:20:18 -0700
Subject: [PATCH 014/121] xfs: add full xfs_dqblk verifier

Add an xfs_dqblk verifier so that it can check the uuid on V5 filesystems;
it calls the existing xfs_dquot_verify verifier to validate the
xfs_disk_dquot_t contained inside it.  This lets us move the uuid
verification out of the crc verifier, which makes little sense.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_dquot_buf.c  | 27 ++++++++++++++++++++++-----
 fs/xfs/libxfs/xfs_quota_defs.h |  2 ++
 fs/xfs/xfs_dquot.c             | 10 +++++-----
 fs/xfs/xfs_qm.c                |  2 +-
 4 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 3b92427883fa..83bebb57d0da 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -41,7 +41,12 @@ xfs_calc_dquots_per_chunk(
 
 /*
  * Do some primitive error checking on ondisk dquot data structures.
+ *
+ * The xfs_dqblk structure /contains/ the xfs_disk_dquot structure;
+ * we verify them separately because at some points we have only the
+ * smaller xfs_disk_dquot structure available.
  */
+
 xfs_failaddr_t
 xfs_dquot_verify(
 	struct xfs_mount *mp,
@@ -100,6 +105,20 @@ xfs_dquot_verify(
 	return NULL;
 }
 
+xfs_failaddr_t
+xfs_dqblk_verify(
+	struct xfs_mount	*mp,
+	struct xfs_dqblk	*dqb,
+	xfs_dqid_t	 	id,
+	uint		 	type)	/* used only during quotacheck */
+{
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid))
+		return __this_address;
+
+	return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type);
+}
+
 /*
  * Do some primitive error checking on ondisk dquot data structures.
  */
@@ -156,8 +175,6 @@ xfs_dquot_buf_verify_crc(
 		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
 				 XFS_DQUOT_CRC_OFF))
 			return false;
-		if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid))
-			return false;
 	}
 	return true;
 }
@@ -167,7 +184,7 @@ xfs_dquot_buf_verify(
 	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
-	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	struct xfs_dqblk	*dqb = bp->b_addr;
 	xfs_failaddr_t		fa;
 	xfs_dqid_t		id = 0;
 	int			ndquots;
@@ -193,12 +210,12 @@ xfs_dquot_buf_verify(
 	for (i = 0; i < ndquots; i++) {
 		struct xfs_disk_dquot	*ddq;
 
-		ddq = &d[i].dd_diskdq;
+		ddq = &dqb[i].dd_diskdq;
 
 		if (i == 0)
 			id = be32_to_cpu(ddq->d_id);
 
-		fa = xfs_dquot_verify(mp, ddq, id + i, 0);
+		fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0);
 		if (fa)
 			return fa;
 	}
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index a2f8cb334bb3..1aac52d7fef4 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -153,6 +153,8 @@ typedef uint16_t	xfs_qwarncnt_t;
 
 extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
 		struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type);
+extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
+		struct xfs_dqblk *dqb, xfs_dqid_t id, uint type);
 extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
 extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
 		xfs_dqid_t id, uint type);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 8d378f485260..d0880c1add41 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -953,6 +953,7 @@ xfs_qm_dqflush(
 {
 	struct xfs_mount	*mp = dqp->q_mount;
 	struct xfs_buf		*bp;
+	struct xfs_dqblk	*dqb;
 	struct xfs_disk_dquot	*ddqp;
 	xfs_failaddr_t		fa;
 	int			error;
@@ -996,12 +997,13 @@ xfs_qm_dqflush(
 	/*
 	 * Calculate the location of the dquot inside the buffer.
 	 */
-	ddqp = bp->b_addr + dqp->q_bufoffset;
+	dqb = bp->b_addr + dqp->q_bufoffset;
+	ddqp = &dqb->dd_diskdq;
 
 	/*
-	 * A simple sanity check in case we got a corrupted dquot..
+	 * A simple sanity check in case we got a corrupted dquot.
 	 */
-	fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0);
+	fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0);
 	if (fa) {
 		xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
 				be32_to_cpu(ddqp->d_id), fa);
@@ -1032,8 +1034,6 @@ xfs_qm_dqflush(
 	 * of a dquot without an up-to-date CRC getting to disk.
 	 */
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
-
 		dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
 		xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
 				 XFS_DQUOT_CRC_OFF);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b016079971ee..c72a8da55703 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -865,7 +865,7 @@ xfs_qm_reset_dqcounts(
 		 * find uninitialised dquot blks. See comment in
 		 * xfs_dquot_verify.
 		 */
-		fa = xfs_dquot_verify(mp, ddq, id + j, type);
+		fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type);
 		if (fa)
 			xfs_dqblk_repair(mp, &dqb[j], id + j, type);
 

From 72c5c5f6d01c859dfe16c4910a5222ed9393c37c Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Mon, 7 May 2018 09:20:47 -0700
Subject: [PATCH 015/121] xfs: print specific dqblk that failed verifiers

Rather than printing the top of the buffer that held a corrupted dqblk,
restructure things to print out the specific one that failed by pushing
the calls to the verifier_error function down into the verifier which
iterates over the buffer and detects the error.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_dquot_buf.c | 41 +++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 83bebb57d0da..cce520becee4 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -152,7 +152,8 @@ xfs_dqblk_repair(
 STATIC bool
 xfs_dquot_buf_verify_crc(
 	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
+	struct xfs_buf		*bp,
+	bool			readahead)
 {
 	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
 	int			ndquots;
@@ -173,8 +174,12 @@ xfs_dquot_buf_verify_crc(
 
 	for (i = 0; i < ndquots; i++, d++) {
 		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
-				 XFS_DQUOT_CRC_OFF))
+				 XFS_DQUOT_CRC_OFF)) {
+			if (!readahead)
+				xfs_buf_verifier_error(bp, -EFSBADCRC, __func__,
+					d, sizeof(*d), __this_address);
 			return false;
+		}
 	}
 	return true;
 }
@@ -182,7 +187,8 @@ xfs_dquot_buf_verify_crc(
 STATIC xfs_failaddr_t
 xfs_dquot_buf_verify(
 	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
+	struct xfs_buf		*bp,
+	bool			readahead)
 {
 	struct xfs_dqblk	*dqb = bp->b_addr;
 	xfs_failaddr_t		fa;
@@ -216,8 +222,13 @@ xfs_dquot_buf_verify(
 			id = be32_to_cpu(ddq->d_id);
 
 		fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0);
-		if (fa)
+		if (fa) {
+			if (!readahead)
+				xfs_buf_verifier_error(bp, -EFSCORRUPTED,
+					__func__, &dqb[i],
+					sizeof(struct xfs_dqblk), fa);
 			return fa;
+		}
 	}
 
 	return NULL;
@@ -229,7 +240,7 @@ xfs_dquot_buf_verify_struct(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
-	return xfs_dquot_buf_verify(mp, bp);
+	return xfs_dquot_buf_verify(mp, bp, false);
 }
 
 static void
@@ -237,15 +248,10 @@ xfs_dquot_buf_read_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	xfs_failaddr_t		fa;
 
-	if (!xfs_dquot_buf_verify_crc(mp, bp))
-		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
-	else {
-		fa = xfs_dquot_buf_verify(mp, bp);
-		if (fa)
-			xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
-	}
+	if (!xfs_dquot_buf_verify_crc(mp, bp, false))
+		return;
+	xfs_dquot_buf_verify(mp, bp, false);
 }
 
 /*
@@ -260,8 +266,8 @@ xfs_dquot_buf_readahead_verify(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
-	if (!xfs_dquot_buf_verify_crc(mp, bp) ||
-	    xfs_dquot_buf_verify(mp, bp) != NULL) {
+	if (!xfs_dquot_buf_verify_crc(mp, bp, true) ||
+	    xfs_dquot_buf_verify(mp, bp, true) != NULL) {
 		xfs_buf_ioerror(bp, -EIO);
 		bp->b_flags &= ~XBF_DONE;
 	}
@@ -277,11 +283,8 @@ xfs_dquot_buf_write_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	xfs_failaddr_t		fa;
 
-	fa = xfs_dquot_buf_verify(mp, bp);
-	if (fa)
-		xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+	xfs_dquot_buf_verify(mp, bp, false);
 }
 
 const struct xfs_buf_ops xfs_dquot_buf_ops = {

From 4223f659dd3edd9e561d90488c6ae332a0a05148 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 7 May 2018 17:38:46 -0700
Subject: [PATCH 016/121] xfs: create agfl block free helper function

Refactor the AGFL block free code into a new helper such that it can
be invoked from deferred context. No functional changes.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_alloc.c | 37 +++++++++++++++++++++++++++----------
 fs/xfs/libxfs/xfs_alloc.h |  2 ++
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 4bcc095fe44a..193a5b4909c5 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2060,6 +2060,30 @@ xfs_alloc_space_available(
 	return true;
 }
 
+int
+xfs_free_agfl_block(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	struct xfs_buf		*agbp,
+	struct xfs_owner_info	*oinfo)
+{
+	int			error;
+	struct xfs_buf		*bp;
+
+	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
+				   XFS_AG_RESV_AGFL);
+	if (error)
+		return error;
+
+	bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0);
+	if (!bp)
+		return -EFSCORRUPTED;
+	xfs_trans_binval(tp, bp);
+
+	return 0;
+}
+
 /*
  * Check the agfl fields of the agf for inconsistency or corruption. The purpose
  * is to detect an agfl header padding mismatch between current and early v5
@@ -2247,21 +2271,14 @@ xfs_alloc_fix_freelist(
 	else
 		xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
 	while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
-		struct xfs_buf	*bp;
-
 		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
 		if (error)
 			goto out_agbp_relse;
-		error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
-					   &targs.oinfo, XFS_AG_RESV_AGFL);
+
+		error = xfs_free_agfl_block(tp, args->agno, bno, agbp,
+					    &targs.oinfo);
 		if (error)
 			goto out_agbp_relse;
-		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
-		if (!bp) {
-			error = -EFSCORRUPTED;
-			goto out_agbp_relse;
-		}
-		xfs_trans_binval(tp, bp);
 	}
 
 	targs.tp = tp;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index cbf789ea5a4e..949e21326066 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -223,6 +223,8 @@ int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, struct xfs_buf **bpp);
+int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
+			struct xfs_buf *, struct xfs_owner_info *);
 int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
 int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
 		struct xfs_buf **agbp);

From f8f2835a9cf300079835e1adb1d90f85033be04c Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 7 May 2018 17:38:47 -0700
Subject: [PATCH 017/121] xfs: defer agfl block frees when dfops is available

The AGFL fixup code executes before every block allocation/free and
rectifies the AGFL based on the current, dynamic allocation
requirements of the fs. The AGFL must hold a minimum number of
blocks to satisfy a worst case split of the free space btrees caused
by the impending allocation operation. The AGFL is also updated to
maintain the implicit requirement for a minimum number of free slots
to satisfy a worst case join of the free space btrees.

Since the AGFL caches individual blocks, AGFL reduction typically
involves multiple, single block frees. We've had reports of
transaction overrun problems during certain workloads that boil down
to AGFL reduction freeing multiple blocks and consuming more space
in the log than was reserved for the transaction.

Since the objective of freeing AGFL blocks is to ensure free AGFL
free slots are available for the upcoming allocation, one way to
address this problem is to release surplus blocks from the AGFL
immediately but defer the free of those blocks (similar to how
file-mapped blocks are unmapped from the file in one transaction and
freed via a deferred operation) until the transaction is rolled.
This turns AGFL reduction into an operation with predictable log
reservation consumption.

Add the capability to defer AGFL block frees when a deferred ops
list is available to the AGFL fixup code. Add a dfops pointer to the
transaction to carry dfops through various contexts to the allocator
context. Deferring AGFL frees is  conditional behavior based on
whether the transaction pointer is populated. The long term
objective is to reuse the transaction pointer to clean up all
unrelated callchains that pass dfops on the stack along with a
transaction and in doing so, consistently defer AGFL blocks from the
allocator.

A bit of customization is required to handle deferred completion
processing because AGFL blocks are accounted against a per-ag
reservation pool and AGFL blocks are not inserted into the extent
busy list when freed (they are inserted when used and released back
to the AGFL). Reuse the majority of the existing deferred extent
free infrastructure and customize it appropriately to handle AGFL
blocks.

Note that this patch only adds infrastructure. It does not change
behavior because no callers have been updated to pass ->t_agfl_dfops
into the allocation code.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_alloc.c  | 51 ++++++++++++++++++++++++---
 fs/xfs/libxfs/xfs_defer.h  |  1 +
 fs/xfs/xfs_trace.h         |  2 ++
 fs/xfs/xfs_trans.c         | 11 ++++--
 fs/xfs/xfs_trans.h         |  1 +
 fs/xfs/xfs_trans_extfree.c | 70 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 129 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 193a5b4909c5..350ad203b082 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -39,6 +39,9 @@
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
 #include "xfs_ag_resv.h"
+#include "xfs_bmap.h"
+
+extern kmem_zone_t	*xfs_bmap_free_item_zone;
 
 struct workqueue_struct *xfs_alloc_wq;
 
@@ -2171,6 +2174,40 @@ xfs_agfl_reset(
 	pag->pagf_agflreset = false;
 }
 
+/*
+ * Defer an AGFL block free. This is effectively equivalent to
+ * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
+ *
+ * Deferring AGFL frees helps prevent log reservation overruns due to too many
+ * allocation operations in a transaction. AGFL frees are prone to this problem
+ * because for one they are always freed one at a time. Further, an immediate
+ * AGFL block free can cause a btree join and require another block free before
+ * the real allocation can proceed. Deferring the free disconnects freeing up
+ * the AGFL slot from freeing the block.
+ */
+STATIC void
+xfs_defer_agfl_block(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	xfs_agnumber_t			agno,
+	xfs_fsblock_t			agbno,
+	struct xfs_owner_info		*oinfo)
+{
+	struct xfs_extent_free_item	*new;		/* new element */
+
+	ASSERT(xfs_bmap_free_item_zone != NULL);
+	ASSERT(oinfo != NULL);
+
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
+	new->xefi_blockcount = 1;
+	new->xefi_oinfo = *oinfo;
+
+	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
+
+	xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
+}
+
 /*
  * Decide whether to use this allocation group for this allocation.
  * If so, fix up the btree freelist's size.
@@ -2275,10 +2312,16 @@ xfs_alloc_fix_freelist(
 		if (error)
 			goto out_agbp_relse;
 
-		error = xfs_free_agfl_block(tp, args->agno, bno, agbp,
-					    &targs.oinfo);
-		if (error)
-			goto out_agbp_relse;
+		/* defer agfl frees if dfops is provided */
+		if (tp->t_agfl_dfops) {
+			xfs_defer_agfl_block(mp, tp->t_agfl_dfops, args->agno,
+					     bno, &targs.oinfo);
+		} else {
+			error = xfs_free_agfl_block(tp, args->agno, bno, agbp,
+						    &targs.oinfo);
+			if (error)
+				goto out_agbp_relse;
+		}
 	}
 
 	targs.tp = tp;
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 045beacdd37d..e70725ba1f5f 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -55,6 +55,7 @@ enum xfs_defer_ops_type {
 	XFS_DEFER_OPS_TYPE_REFCOUNT,
 	XFS_DEFER_OPS_TYPE_RMAP,
 	XFS_DEFER_OPS_TYPE_FREE,
+	XFS_DEFER_OPS_TYPE_AGFL_FREE,
 	XFS_DEFER_OPS_TYPE_MAX,
 };
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 21d494f30ea7..24892259301e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2436,6 +2436,8 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
 #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
 DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
 DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
 
 /* rmap tracepoints */
 DECLARE_EVENT_CLASS(xfs_rmap_class,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d6d8f9d129a7..06adb1a3e31f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -31,6 +31,7 @@
 #include "xfs_log.h"
 #include "xfs_trace.h"
 #include "xfs_error.h"
+#include "xfs_defer.h"
 
 kmem_zone_t	*xfs_trans_zone;
 kmem_zone_t	*xfs_log_item_desc_zone;
@@ -94,11 +95,11 @@ xfs_trans_free(
  * blocks.  Locks and log items, however, are no inherited.  They must
  * be added to the new transaction explicitly.
  */
-STATIC xfs_trans_t *
+STATIC struct xfs_trans *
 xfs_trans_dup(
-	xfs_trans_t	*tp)
+	struct xfs_trans	*tp)
 {
-	xfs_trans_t	*ntp;
+	struct xfs_trans	*ntp;
 
 	ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
 
@@ -127,6 +128,7 @@ xfs_trans_dup(
 	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
 	tp->t_rtx_res = tp->t_rtx_res_used;
 	ntp->t_pflags = tp->t_pflags;
+	ntp->t_agfl_dfops = tp->t_agfl_dfops;
 
 	xfs_trans_dup_dqinfo(tp, ntp);
 
@@ -936,6 +938,9 @@ __xfs_trans_commit(
 	int			error = 0;
 	int			sync = tp->t_flags & XFS_TRANS_SYNC;
 
+	ASSERT(!tp->t_agfl_dfops ||
+	       !xfs_defer_has_unfinished_work(tp->t_agfl_dfops) || regrant);
+
 	/*
 	 * If there is nothing to be logged by the transaction,
 	 * then unlock all of the items associated with the
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9d542dfe0052..834388c2c9de 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -111,6 +111,7 @@ typedef struct xfs_trans {
 	struct xlog_ticket	*t_ticket;	/* log mgr ticket */
 	struct xfs_mount	*t_mountp;	/* ptr to fs mount struct */
 	struct xfs_dquot_acct   *t_dqinfo;	/* acctg info for dquots */
+	struct xfs_defer_ops	*t_agfl_dfops;	/* optional agfl fixup dfops */
 	unsigned int		t_flags;	/* misc flags */
 	int64_t			t_icount_delta;	/* superblock icount change */
 	int64_t			t_ifree_delta;	/* superblock ifree change */
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index ab438647592a..f5620796ae25 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -231,9 +231,79 @@ static const struct xfs_defer_op_type xfs_extent_free_defer_type = {
 	.cancel_item	= xfs_extent_free_cancel_item,
 };
 
+/*
+ * AGFL blocks are accounted differently in the reserve pools and are not
+ * inserted into the busy extent list.
+ */
+STATIC int
+xfs_agfl_free_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dop,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_efd_log_item		*efdp = done_item;
+	struct xfs_extent_free_item	*free;
+	struct xfs_extent		*extp;
+	struct xfs_buf			*agbp;
+	int				error;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	uint				next_extent;
+
+	free = container_of(item, struct xfs_extent_free_item, xefi_list);
+	ASSERT(free->xefi_blockcount == 1);
+	agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
+	agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+
+	trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
+
+	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+	if (!error)
+		error = xfs_free_agfl_block(tp, agno, agbno, agbp,
+					    &free->xefi_oinfo);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the EFI and frees the EFD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	next_extent = efdp->efd_next_extent;
+	ASSERT(next_extent < efdp->efd_format.efd_nextents);
+	extp = &(efdp->efd_format.efd_extents[next_extent]);
+	extp->ext_start = free->xefi_startblock;
+	extp->ext_len = free->xefi_blockcount;
+	efdp->efd_next_extent++;
+
+	kmem_free(free);
+	return error;
+}
+
+
+/* sub-type with special handling for AGFL deferred frees */
+static const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+	.type		= XFS_DEFER_OPS_TYPE_AGFL_FREE,
+	.max_items	= XFS_EFI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_extent_free_diff_items,
+	.create_intent	= xfs_extent_free_create_intent,
+	.abort_intent	= xfs_extent_free_abort_intent,
+	.log_item	= xfs_extent_free_log_item,
+	.create_done	= xfs_extent_free_create_done,
+	.finish_item	= xfs_agfl_free_finish_item,
+	.cancel_item	= xfs_extent_free_cancel_item,
+};
+
 /* Register the deferred op type. */
 void
 xfs_extent_free_init_defer_op(void)
 {
 	xfs_defer_init_op_type(&xfs_extent_free_defer_type);
+	xfs_defer_init_op_type(&xfs_agfl_free_defer_type);
 }

From 2bc5eba8b6957824631205aeffc75db8dbb5426a Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 7 May 2018 17:38:47 -0700
Subject: [PATCH 018/121] xfs: defer agfl block frees from deferred ops
 processing context

Now that AGFL block frees are deferred when dfops is set in the
transaction, start deferring AGFL block frees from contexts that are
known to push the limits of existing log reservations.

The first such context is deferred operation processing itself. This
primarily targets deferred extent frees (such as file extents and
inode chunks), but in doing so covers all allocation operations that
occur in deferred operation processing context.

Update xfs_defer_finish() to set and reset ->t_agfl_dfops across the
processing sequence. This means that any AGFL block frees due to
allocation events result in the addition of new EFIs to the dfops
rather than being processed immediately. xfs_defer_finish() rolls
the transaction at least once more to process the frees of the AGFL
blocks back to the allocation btrees and returns once the AGFL is
rectified.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_defer.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 087fea02c389..4d87d9661907 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -352,11 +352,22 @@ xfs_defer_finish(
 	void				*state;
 	int				error = 0;
 	void				(*cleanup_fn)(struct xfs_trans *, void *, int);
+	struct xfs_defer_ops		*orig_dop;
 
 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
 
 	trace_xfs_defer_finish((*tp)->t_mountp, dop);
 
+	/*
+	 * Attach dfops to the transaction during deferred ops processing. This
+	 * explicitly causes calls into the allocator to defer AGFL block frees.
+	 * Note that this code can go away once all dfops users attach to the
+	 * associated tp.
+	 */
+	ASSERT(!(*tp)->t_agfl_dfops || ((*tp)->t_agfl_dfops == dop));
+	orig_dop = (*tp)->t_agfl_dfops;
+	(*tp)->t_agfl_dfops = dop;
+
 	/* Until we run out of pending work to finish... */
 	while (xfs_defer_has_unfinished_work(dop)) {
 		/* Log intents for work items sitting in the intake. */
@@ -428,6 +439,7 @@ xfs_defer_finish(
 	}
 
 out:
+	(*tp)->t_agfl_dfops = orig_dop;
 	if (error)
 		trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error);
 	else

From 658f8f95117349194af1631ef87970c212c68487 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 7 May 2018 17:38:48 -0700
Subject: [PATCH 019/121] xfs: defer agfl frees from inode inactivation

XFS inode chunks are already freed via deferred operations (which
now also defer AGFL block frees), but inode btree blocks are freed
directly in the associated context. This has been known to lead to
log reservation overruns in particular workloads where an inobt
block free may require several AGFL block frees (and thus several
allocation btree modifications) before the inobt block itself is
actually freed.

To avoid this problem, defer the frees of any AGFL blocks before the
inobt block free takes place. This requires passing the dfops from
xfs_inactive_ifree() down through the inobt ->[alloc|free]_block()
callouts, which essentially only requires to attach the dfops to the
transaction since it is already carried all the way through to the
inobt update and allocation.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3cdd4fa37947..9959b6287bea 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1823,6 +1823,7 @@ xfs_inactive_ifree(
 	xfs_trans_ijoin(tp, ip, 0);
 
 	xfs_defer_init(&dfops, &first_block);
+	tp->t_agfl_dfops = &dfops;
 	error = xfs_ifree(tp, ip, &dfops);
 	if (error) {
 		/*

From 8b922f0e6a7fb2eea65bd9f1462f648e240d4ba7 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 7 May 2018 17:38:48 -0700
Subject: [PATCH 020/121] xfs: defer frees from common inode allocation paths

Inode allocation can require block allocation for physical inode
chunk allocation, inode btree record insertion, and/or directory
block allocation for entry insertion. Any of these block allocation
requests can require AGFL fixups prior to the actual allocation.
Update the common file creation transacions to defer AGFL frees from
these contexts to avoid too much log reservation consumption
per-transaction.

Since these transactions are already passed down through the btree
cursors and da_args structure, this simply requires to attach dfops
to the transaction. Note that this covers tr_create, tr_mkdir and
tr_symlink. Other transactions such as tr_create_tmpfile do not
already make use of deferred operations and so are left alone for
the time being.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c   | 1 +
 fs/xfs/xfs_symlink.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9959b6287bea..c903b6a0ef14 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1208,6 +1208,7 @@ xfs_create(
 	unlock_dp_on_error = true;
 
 	xfs_defer_init(&dfops, &first_block);
+	tp->t_agfl_dfops = &dfops;
 
 	/*
 	 * Reserve disk quota and the inode.
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 5b66ac12913c..88643a8fd487 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -259,6 +259,7 @@ xfs_symlink(
 	 * bmapi or the directory create code.
 	 */
 	xfs_defer_init(&dfops, &first_block);
+	tp->t_agfl_dfops = &dfops;
 
 	/*
 	 * Allocate an inode for the symlink.

From 8804630e1eeba06194344b9eeffa379f4ed168c7 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 7 May 2018 17:38:48 -0700
Subject: [PATCH 021/121] xfs: defer agfl frees from directory op transactions

Directory operations can perform block allocations as entries are
added/removed from directories. Defer AGFL block frees from the
remaining directory operation transactions. This covers the hard
link, remove and rename operations.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c903b6a0ef14..d9b91606d2a3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1464,6 +1464,7 @@ xfs_link(
 	}
 
 	xfs_defer_init(&dfops, &first_block);
+	tp->t_agfl_dfops = &dfops;
 
 	/*
 	 * Handle initial link state of O_TMPFILE inode
@@ -2661,6 +2662,7 @@ xfs_remove(
 		goto out_trans_cancel;
 
 	xfs_defer_init(&dfops, &first_block);
+	tp->t_agfl_dfops = &dfops;
 	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
 					&first_block, &dfops, resblks);
 	if (error) {
@@ -3028,6 +3030,7 @@ xfs_rename(
 	}
 
 	xfs_defer_init(&dfops, &first_block);
+	tp->t_agfl_dfops = &dfops;
 
 	/* RENAME_EXCHANGE is unique from here on. */
 	if (flags & RENAME_EXCHANGE)

From cec572561a748396c783c1ea91a289816d3c4f18 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:31:21 -0700
Subject: [PATCH 022/121] xfs: bmap debugging should never panic the system

Don't panic() the system if the bmap records are garbage, just call
ASSERT which gives us the same backtrace but enables developers to
control if the system goes down or not.  This makes debugging with
generic/388 much easier because it won't reboot the machine midway
through a run just because btree_read_bufl returns EIO when the fs has
already shut down.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 040eeda8426f..ddb5f5336ecf 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -312,8 +312,9 @@ xfs_check_block(
 				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
 					__func__, j, i,
 					(unsigned long long)be64_to_cpu(*thispa));
-				panic("%s: ptrs are equal in node\n",
+				xfs_err(mp, "%s: ptrs are equal in node\n",
 					__func__);
+				xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 			}
 		}
 	}
@@ -483,7 +484,8 @@ error0:
 error_norelse:
 	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
 		__func__, i);
-	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+	xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	return;
 }
 

From 52101dfe56f71d8cb140c2440d95affa25a53746 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:31:21 -0700
Subject: [PATCH 023/121] xfs: add missing rmap error return

xfs_rmap_lookup_le_range can return errors, so we need to check for
them and bail out.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_rmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index fba8d2718017..f7769edf5b68 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -1374,6 +1374,8 @@ xfs_rmap_convert_shared(
 	 */
 	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
 			&PREV, &i);
+	if (error)
+		goto done;
 	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 
 	ASSERT(PREV.rm_offset <= offset);

From 22525c17ed133202088f6f05acd9c53790a7121d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 May 2018 07:47:34 -0700
Subject: [PATCH 024/121] xfs: log item flags are racy

The log item flags contain a field that is protected by the AIL
lock - the XFS_LI_IN_AIL flag. We use non-atomic RMW operations to
set and clear these flags, but most of the updates and checks are
not done with the AIL lock held and so are susceptible to update
races.

Fix this by changing the log item flags to use atomic bitops rather
than be reliant on the AIL lock for update serialisation.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_bmap_item.c     |  4 ++--
 fs/xfs/xfs_buf_item.c      |  4 +++-
 fs/xfs/xfs_dquot.c         |  7 +++----
 fs/xfs/xfs_dquot_item.c    |  2 +-
 fs/xfs/xfs_extfree_item.c  |  4 ++--
 fs/xfs/xfs_icache.c        |  3 ++-
 fs/xfs/xfs_icreate_item.c  |  2 +-
 fs/xfs/xfs_inode.c         |  4 ++--
 fs/xfs/xfs_inode_item.c    |  8 ++++----
 fs/xfs/xfs_log.c           |  2 +-
 fs/xfs/xfs_qm.c            |  2 +-
 fs/xfs/xfs_refcount_item.c |  4 ++--
 fs/xfs/xfs_rmap_item.c     |  4 ++--
 fs/xfs/xfs_trace.h         |  6 +++---
 fs/xfs/xfs_trans.c         |  4 ++--
 fs/xfs/xfs_trans.h         | 19 ++++++++++++-------
 fs/xfs/xfs_trans_ail.c     |  9 ++++-----
 fs/xfs/xfs_trans_buf.c     |  2 +-
 fs/xfs/xfs_trans_priv.h    | 10 ++++------
 19 files changed, 52 insertions(+), 48 deletions(-)

diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 2203465e63ea..618bb71535c8 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -160,7 +160,7 @@ STATIC void
 xfs_bui_item_unlock(
 	struct xfs_log_item	*lip)
 {
-	if (lip->li_flags & XFS_LI_ABORTED)
+	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
 		xfs_bui_release(BUI_ITEM(lip));
 }
 
@@ -305,7 +305,7 @@ xfs_bud_item_unlock(
 {
 	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
 
-	if (lip->li_flags & XFS_LI_ABORTED) {
+	if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
 		xfs_bui_release(budp->bud_buip);
 		kmem_zone_free(xfs_bud_zone, budp);
 	}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 82ad270e390e..df62082f2204 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -568,13 +568,15 @@ xfs_buf_item_unlock(
 {
 	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
 	struct xfs_buf		*bp = bip->bli_buf;
-	bool			aborted = !!(lip->li_flags & XFS_LI_ABORTED);
+	bool			aborted;
 	bool			hold = !!(bip->bli_flags & XFS_BLI_HOLD);
 	bool			dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
 #if defined(DEBUG) || defined(XFS_WARN)
 	bool			ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
 #endif
 
+	aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags);
+
 	/* Clear the buffer's association with this transaction. */
 	bp->b_transp = NULL;
 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index d0880c1add41..4ca9c39879ae 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -913,9 +913,9 @@ xfs_qm_dqflush_done(
 	 * since it's cheaper, and then we recheck while
 	 * holding the lock before removing the dquot from the AIL.
 	 */
-	if ((lip->li_flags & XFS_LI_IN_AIL) &&
+	if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
 	    ((lip->li_lsn == qip->qli_flush_lsn) ||
-	     (lip->li_flags & XFS_LI_FAILED))) {
+	     test_bit(XFS_LI_FAILED, &lip->li_flags))) {
 
 		/* xfs_trans_ail_delete() drops the AIL lock. */
 		spin_lock(&ailp->ail_lock);
@@ -926,8 +926,7 @@ xfs_qm_dqflush_done(
 			 * Clear the failed state since we are about to drop the
 			 * flush lock
 			 */
-			if (lip->li_flags & XFS_LI_FAILED)
-				xfs_clear_li_failed(lip);
+			xfs_clear_li_failed(lip);
 			spin_unlock(&ailp->ail_lock);
 		}
 	}
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 4b331e354da7..57df98122156 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -173,7 +173,7 @@ xfs_qm_dquot_logitem_push(
 	 * The buffer containing this item failed to be written back
 	 * previously. Resubmit the buffer for IO
 	 */
-	if (lip->li_flags & XFS_LI_FAILED) {
+	if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
 		if (!xfs_buf_trylock(bp))
 			return XFS_ITEM_LOCKED;
 
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index b5b1e567b9f4..70b7d48af6d6 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -168,7 +168,7 @@ STATIC void
 xfs_efi_item_unlock(
 	struct xfs_log_item	*lip)
 {
-	if (lip->li_flags & XFS_LI_ABORTED)
+	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
 		xfs_efi_release(EFI_ITEM(lip));
 }
 
@@ -402,7 +402,7 @@ xfs_efd_item_unlock(
 {
 	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
 
-	if (lip->li_flags & XFS_LI_ABORTED) {
+	if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
 		xfs_efi_release(efdp->efd_efip);
 		xfs_efd_item_free(efdp);
 	}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 817899961f48..9deff136c5b9 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -107,7 +107,8 @@ xfs_inode_free_callback(
 		xfs_idestroy_fork(ip, XFS_COW_FORK);
 
 	if (ip->i_itemp) {
-		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+		ASSERT(!test_bit(XFS_LI_IN_AIL,
+				 &ip->i_itemp->ili_item.li_flags));
 		xfs_inode_item_destroy(ip);
 		ip->i_itemp = NULL;
 	}
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 865ad1373e5e..a99a0f8aa528 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -91,7 +91,7 @@ xfs_icreate_item_unlock(
 {
 	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
 
-	if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
 		kmem_zone_free(xfs_icreate_zone, icp);
 	return;
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d9b91606d2a3..42781bae6794 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -498,7 +498,7 @@ again:
 		if (!try_lock) {
 			for (j = (i - 1); j >= 0 && !try_lock; j--) {
 				lp = (xfs_log_item_t *)ips[j]->i_itemp;
-				if (lp && (lp->li_flags & XFS_LI_IN_AIL))
+				if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
 					try_lock++;
 			}
 		}
@@ -598,7 +598,7 @@ xfs_lock_two_inodes(
 	 * and try again.
 	 */
 	lp = (xfs_log_item_t *)ip0->i_itemp;
-	if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
 		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
 			xfs_iunlock(ip0, ip0_mode);
 			if ((++attempts % 5) == 0)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 34b91b789702..3e5b8574818e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -518,7 +518,7 @@ xfs_inode_item_push(
 	 * The buffer containing this item failed to be written back
 	 * previously. Resubmit the buffer for IO.
 	 */
-	if (lip->li_flags & XFS_LI_FAILED) {
+	if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
 		if (!xfs_buf_trylock(bp))
 			return XFS_ITEM_LOCKED;
 
@@ -729,14 +729,14 @@ xfs_iflush_done(
 		 */
 		iip = INODE_ITEM(blip);
 		if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
-		    (blip->li_flags & XFS_LI_FAILED))
+		    test_bit(XFS_LI_FAILED, &blip->li_flags))
 			need_ail++;
 	}
 
 	/* make sure we capture the state of the initial inode. */
 	iip = INODE_ITEM(lip);
 	if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
-	    lip->li_flags & XFS_LI_FAILED)
+	    test_bit(XFS_LI_FAILED, &lip->li_flags))
 		need_ail++;
 
 	/*
@@ -803,7 +803,7 @@ xfs_iflush_abort(
 	xfs_inode_log_item_t	*iip = ip->i_itemp;
 
 	if (iip) {
-		if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+		if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) {
 			xfs_trans_ail_remove(&iip->ili_item,
 					     stale ? SHUTDOWN_LOG_IO_ERROR :
 						     SHUTDOWN_CORRUPT_INCORE);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2fcd9ed5d075..e427864434c1 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2132,7 +2132,7 @@ xlog_print_trans(
 
 		xfs_warn(mp, "log item: ");
 		xfs_warn(mp, "  type	= 0x%x", lip->li_type);
-		xfs_warn(mp, "  flags	= 0x%x", lip->li_flags);
+		xfs_warn(mp, "  flags	= 0x%lx", lip->li_flags);
 		if (!lv)
 			continue;
 		xfs_warn(mp, "  niovecs	= %d", lv->lv_niovecs);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index c72a8da55703..62764f3e35e2 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -173,7 +173,7 @@ xfs_qm_dqpurge(
 
 	ASSERT(atomic_read(&dqp->q_pincount) == 0);
 	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
-	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+		!test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
 
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 15c9393dd7a7..e5866b714d5f 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -159,7 +159,7 @@ STATIC void
 xfs_cui_item_unlock(
 	struct xfs_log_item	*lip)
 {
-	if (lip->li_flags & XFS_LI_ABORTED)
+	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
 		xfs_cui_release(CUI_ITEM(lip));
 }
 
@@ -310,7 +310,7 @@ xfs_cud_item_unlock(
 {
 	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
 
-	if (lip->li_flags & XFS_LI_ABORTED) {
+	if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
 		xfs_cui_release(cudp->cud_cuip);
 		kmem_zone_free(xfs_cud_zone, cudp);
 	}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 06a07846c9b3..e5b5b3e7ef82 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -158,7 +158,7 @@ STATIC void
 xfs_rui_item_unlock(
 	struct xfs_log_item	*lip)
 {
-	if (lip->li_flags & XFS_LI_ABORTED)
+	if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
 		xfs_rui_release(RUI_ITEM(lip));
 }
 
@@ -331,7 +331,7 @@ xfs_rud_item_unlock(
 {
 	struct xfs_rud_log_item	*rudp = RUD_ITEM(lip);
 
-	if (lip->li_flags & XFS_LI_ABORTED) {
+	if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
 		xfs_rui_release(rudp->rud_ruip);
 		kmem_zone_free(xfs_rud_zone, rudp);
 	}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 24892259301e..989708d06e10 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -442,7 +442,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
 		__field(int, bli_refcount)
 		__field(unsigned, bli_flags)
 		__field(void *, li_desc)
-		__field(unsigned, li_flags)
+		__field(unsigned long, li_flags)
 	),
 	TP_fast_assign(
 		__entry->dev = bip->bli_buf->b_target->bt_dev;
@@ -1018,7 +1018,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
 		__field(dev_t, dev)
 		__field(void *, lip)
 		__field(uint, type)
-		__field(uint, flags)
+		__field(unsigned long, flags)
 		__field(xfs_lsn_t, lsn)
 	),
 	TP_fast_assign(
@@ -1070,7 +1070,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class,
 		__field(dev_t, dev)
 		__field(void *, lip)
 		__field(uint, type)
-		__field(uint, flags)
+		__field(unsigned long, flags)
 		__field(xfs_lsn_t, old_lsn)
 		__field(xfs_lsn_t, new_lsn)
 	),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 06adb1a3e31f..83f2032641cf 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -792,7 +792,7 @@ xfs_trans_free_items(
 		if (commit_lsn != NULLCOMMITLSN)
 			lip->li_ops->iop_committing(lip, commit_lsn);
 		if (abort)
-			lip->li_flags |= XFS_LI_ABORTED;
+			set_bit(XFS_LI_ABORTED, &lip->li_flags);
 		lip->li_ops->iop_unlock(lip);
 
 		xfs_trans_free_item_desc(lidp);
@@ -863,7 +863,7 @@ xfs_trans_committed_bulk(
 		xfs_lsn_t		item_lsn;
 
 		if (aborted)
-			lip->li_flags |= XFS_LI_ABORTED;
+			set_bit(XFS_LI_ABORTED, &lip->li_flags);
 		item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
 
 		/* item_lsn of -1 means the item needs no further processing */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 834388c2c9de..ca449036f820 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -48,7 +48,7 @@ typedef struct xfs_log_item {
 	struct xfs_mount		*li_mountp;	/* ptr to fs mount */
 	struct xfs_ail			*li_ailp;	/* ptr to AIL */
 	uint				li_type;	/* item type */
-	uint				li_flags;	/* misc flags */
+	unsigned long			li_flags;	/* misc flags */
 	struct xfs_buf			*li_buf;	/* real buffer pointer */
 	struct list_head		li_bio_list;	/* buffer item list */
 	void				(*li_cb)(struct xfs_buf *,
@@ -64,14 +64,19 @@ typedef struct xfs_log_item {
 	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;
 
-#define	XFS_LI_IN_AIL	0x1
-#define	XFS_LI_ABORTED	0x2
-#define	XFS_LI_FAILED	0x4
+/*
+ * li_flags use the (set/test/clear)_bit atomic interfaces because updates can
+ * race with each other and we don't want to have to use the AIL lock to
+ * serialise all updates.
+ */
+#define	XFS_LI_IN_AIL	0
+#define	XFS_LI_ABORTED	1
+#define	XFS_LI_FAILED	2
 
 #define XFS_LI_FLAGS \
-	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
-	{ XFS_LI_ABORTED,	"ABORTED" }, \
-	{ XFS_LI_FAILED,	"FAILED" }
+	{ (1 << XFS_LI_IN_AIL),		"IN_AIL" }, \
+	{ (1 << XFS_LI_ABORTED),	"ABORTED" }, \
+	{ (1 << XFS_LI_FAILED),		"FAILED" }
 
 struct xfs_item_ops {
 	void (*iop_size)(xfs_log_item_t *, int *, int *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d4a2445215e6..50611d2bcbc2 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -46,7 +46,7 @@ xfs_ail_check(
 	/*
 	 * Check the next and previous entries are valid.
 	 */
-	ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
+	ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
 	prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
 	if (&prev_lip->li_ail != &ailp->ail_head)
 		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
@@ -684,7 +684,7 @@ xfs_trans_ail_update_bulk(
 
 	for (i = 0; i < nr_items; i++) {
 		struct xfs_log_item *lip = log_items[i];
-		if (lip->li_flags & XFS_LI_IN_AIL) {
+		if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
 			/* check if we really need to move the item */
 			if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
 				continue;
@@ -694,7 +694,6 @@ xfs_trans_ail_update_bulk(
 			if (mlip == lip)
 				mlip_changed = 1;
 		} else {
-			lip->li_flags |= XFS_LI_IN_AIL;
 			trace_xfs_ail_insert(lip, 0, lsn);
 		}
 		lip->li_lsn = lsn;
@@ -725,7 +724,7 @@ xfs_ail_delete_one(
 	trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
 	xfs_ail_delete(ailp, lip);
 	xfs_clear_li_failed(lip);
-	lip->li_flags &= ~XFS_LI_IN_AIL;
+	clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
 	lip->li_lsn = 0;
 
 	return mlip == lip;
@@ -761,7 +760,7 @@ xfs_trans_ail_delete(
 	struct xfs_mount	*mp = ailp->ail_mount;
 	bool			mlip_changed;
 
-	if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+	if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
 		spin_unlock(&ailp->ail_lock);
 		if (!XFS_FORCED_SHUTDOWN(mp)) {
 			xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index a5d9dfc45d98..0081e9b3decf 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -442,7 +442,7 @@ xfs_trans_brelse(
 		ASSERT(bp->b_pincount == 0);
 ***/
 		ASSERT(atomic_read(&bip->bli_refcount) == 0);
-		ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+		ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
 		ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
 		xfs_buf_item_relse(bp);
 	}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index be24b0c8a332..43f773297b9d 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -119,7 +119,7 @@ xfs_trans_ail_remove(
 
 	spin_lock(&ailp->ail_lock);
 	/* xfs_trans_ail_delete() drops the AIL lock */
-	if (lip->li_flags & XFS_LI_IN_AIL)
+	if (test_bit(XFS_LI_IN_AIL, &lip->li_flags))
 		xfs_trans_ail_delete(ailp, lip, shutdown_type);
 	else
 		spin_unlock(&ailp->ail_lock);
@@ -171,11 +171,10 @@ xfs_clear_li_failed(
 {
 	struct xfs_buf	*bp = lip->li_buf;
 
-	ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+	ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
 	lockdep_assert_held(&lip->li_ailp->ail_lock);
 
-	if (lip->li_flags & XFS_LI_FAILED) {
-		lip->li_flags &= ~XFS_LI_FAILED;
+	if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
 		lip->li_buf = NULL;
 		xfs_buf_rele(bp);
 	}
@@ -188,9 +187,8 @@ xfs_set_li_failed(
 {
 	lockdep_assert_held(&lip->li_ailp->ail_lock);
 
-	if (!(lip->li_flags & XFS_LI_FAILED)) {
+	if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
 		xfs_buf_hold(bp);
-		lip->li_flags |= XFS_LI_FAILED;
 		lip->li_buf = bp;
 	}
 }

From ba18781b91569af2d721c46740d4ce139720c298 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 May 2018 07:47:57 -0700
Subject: [PATCH 025/121] xfs: add tracing to high level transaction operations

Because currently we have no idea what the transaction context we
are operating in is, and I need to know that information to track
down bugs in multiple log item joins to transactions.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log_cil.c |  1 +
 fs/xfs/xfs_trace.h   | 37 +++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_trans.c   | 20 +++++++++++++++++++-
 3 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4668403b1741..a8675c631438 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -1013,6 +1013,7 @@ xfs_log_commit_cil(
 		*commit_lsn = xc_commit_lsn;
 
 	xfs_log_done(mp, tp->t_ticket, NULL, regrant);
+	tp->t_ticket = NULL;
 	xfs_trans_unreserve_and_mod_sb(tp);
 
 	/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 989708d06e10..1f95c20e3442 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3351,6 +3351,43 @@ TRACE_EVENT(xfs_trans_resv_calc,
 		  __entry->logflags)
 );
 
+DECLARE_EVENT_CLASS(xfs_trans_class,
+	TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip),
+	TP_ARGS(tp, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(uint32_t, tid)
+		__field(uint32_t, flags)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = tp->t_mountp->m_super->s_dev;
+		__entry->tid = 0;
+		if (tp->t_ticket)
+			__entry->tid = tp->t_ticket->t_tid;
+		__entry->flags = tp->t_flags;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d trans %x flags 0x%x caller %pS",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->tid,
+		  __entry->flags,
+		  (char *)__entry->caller_ip)
+)
+
+#define DEFINE_TRANS_EVENT(name) \
+DEFINE_EVENT(xfs_trans_class, name, \
+	TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \
+	TP_ARGS(tp, caller_ip))
+DEFINE_TRANS_EVENT(xfs_trans_alloc);
+DEFINE_TRANS_EVENT(xfs_trans_cancel);
+DEFINE_TRANS_EVENT(xfs_trans_commit);
+DEFINE_TRANS_EVENT(xfs_trans_dup);
+DEFINE_TRANS_EVENT(xfs_trans_free);
+DEFINE_TRANS_EVENT(xfs_trans_roll);
+DEFINE_TRANS_EVENT(xfs_trans_add_item);
+DEFINE_TRANS_EVENT(xfs_trans_free_items);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 83f2032641cf..4bbdb6faec30 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -80,6 +80,7 @@ xfs_trans_free(
 	xfs_extent_busy_sort(&tp->t_busy);
 	xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
 
+	trace_xfs_trans_free(tp, _RET_IP_);
 	atomic_dec(&tp->t_mountp->m_active_trans);
 	if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
 		sb_end_intwrite(tp->t_mountp->m_super);
@@ -101,6 +102,8 @@ xfs_trans_dup(
 {
 	struct xfs_trans	*ntp;
 
+	trace_xfs_trans_dup(tp, _RET_IP_);
+
 	ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
 
 	/*
@@ -285,6 +288,8 @@ xfs_trans_alloc(
 		return error;
 	}
 
+	trace_xfs_trans_alloc(tp, _RET_IP_);
+
 	*tpp = tp;
 	return 0;
 }
@@ -751,6 +756,8 @@ xfs_trans_add_item(
 	list_add_tail(&lidp->lid_trans, &tp->t_items);
 
 	lip->li_desc = lidp;
+
+	trace_xfs_trans_add_item(tp, _RET_IP_);
 }
 
 STATIC void
@@ -784,6 +791,8 @@ xfs_trans_free_items(
 {
 	struct xfs_log_item_desc *lidp, *next;
 
+	trace_xfs_trans_free_items(tp, _RET_IP_);
+
 	list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
 		struct xfs_log_item	*lip = lidp->lid_item;
 
@@ -941,6 +950,8 @@ __xfs_trans_commit(
 	ASSERT(!tp->t_agfl_dfops ||
 	       !xfs_defer_has_unfinished_work(tp->t_agfl_dfops) || regrant);
 
+	trace_xfs_trans_commit(tp, _RET_IP_);
+
 	/*
 	 * If there is nothing to be logged by the transaction,
 	 * then unlock all of the items associated with the
@@ -996,6 +1007,7 @@ out_unreserve:
 		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
 		if (commit_lsn == -1 && !error)
 			error = -EIO;
+		tp->t_ticket = NULL;
 	}
 	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 	xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
@@ -1027,6 +1039,8 @@ xfs_trans_cancel(
 	struct xfs_mount	*mp = tp->t_mountp;
 	bool			dirty = (tp->t_flags & XFS_TRANS_DIRTY);
 
+	trace_xfs_trans_cancel(tp, _RET_IP_);
+
 	/*
 	 * See if the caller is relying on us to shut down the
 	 * filesystem.  This happens in paths where we detect
@@ -1047,8 +1061,10 @@ xfs_trans_cancel(
 	xfs_trans_unreserve_and_mod_sb(tp);
 	xfs_trans_unreserve_and_mod_dquots(tp);
 
-	if (tp->t_ticket)
+	if (tp->t_ticket) {
 		xfs_log_done(mp, tp->t_ticket, NULL, false);
+		tp->t_ticket = NULL;
+	}
 
 	/* mark this thread as no longer being in a transaction */
 	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -1072,6 +1088,8 @@ xfs_trans_roll(
 	struct xfs_trans_res	tres;
 	int			error;
 
+	trace_xfs_trans_roll(trans, _RET_IP_);
+
 	/*
 	 * Copy the critical parameters from one trans to the next.
 	 */

From e632a5690c734a383a83272a502be79cb2c040e5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 May 2018 07:48:52 -0700
Subject: [PATCH 026/121] xfs: adder caller IP to xfs_defer* tracepoints

So it's clear in the trace where they are being called from.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_defer.c | 12 ++++++------
 fs/xfs/xfs_trace.h        | 17 +++++++++++------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 4d87d9661907..3daf175e2535 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -220,7 +220,7 @@ xfs_defer_trans_abort(
 {
 	struct xfs_defer_pending	*dfp;
 
-	trace_xfs_defer_trans_abort(tp->t_mountp, dop);
+	trace_xfs_defer_trans_abort(tp->t_mountp, dop, _RET_IP_);
 
 	/* Abort intent items that don't have a done item. */
 	list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
@@ -253,7 +253,7 @@ xfs_defer_trans_roll(
 	for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++)
 		xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]);
 
-	trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
+	trace_xfs_defer_trans_roll((*tp)->t_mountp, dop, _RET_IP_);
 
 	/* Roll the transaction. */
 	error = xfs_trans_roll(tp);
@@ -356,7 +356,7 @@ xfs_defer_finish(
 
 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
 
-	trace_xfs_defer_finish((*tp)->t_mountp, dop);
+	trace_xfs_defer_finish((*tp)->t_mountp, dop, _RET_IP_);
 
 	/*
 	 * Attach dfops to the transaction during deferred ops processing. This
@@ -443,7 +443,7 @@ out:
 	if (error)
 		trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error);
 	else
-		trace_xfs_defer_finish_done((*tp)->t_mountp, dop);
+		trace_xfs_defer_finish_done((*tp)->t_mountp, dop, _RET_IP_);
 	return error;
 }
 
@@ -459,7 +459,7 @@ xfs_defer_cancel(
 	struct list_head		*pwi;
 	struct list_head		*n;
 
-	trace_xfs_defer_cancel(NULL, dop);
+	trace_xfs_defer_cancel(NULL, dop, _RET_IP_);
 
 	/*
 	 * Free the pending items.  Caller should already have arranged
@@ -544,5 +544,5 @@ xfs_defer_init(
 	*fbp = NULLFSBLOCK;
 	INIT_LIST_HEAD(&dop->dop_intake);
 	INIT_LIST_HEAD(&dop->dop_pending);
-	trace_xfs_defer_init(NULL, dop);
+	trace_xfs_defer_init(NULL, dop, _RET_IP_);
 }
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 1f95c20e3442..e2ffc1e01bd8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2246,30 +2246,35 @@ struct xfs_defer_pending;
 struct xfs_defer_ops;
 
 DECLARE_EVENT_CLASS(xfs_defer_class,
-	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop),
-	TP_ARGS(mp, dop),
+	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop,
+		 unsigned long caller_ip),
+	TP_ARGS(mp, dop, caller_ip),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(void *, dop)
 		__field(char, committed)
 		__field(char, low)
+		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
 		__entry->dev = mp ? mp->m_super->s_dev : 0;
 		__entry->dop = dop;
 		__entry->committed = dop->dop_committed;
 		__entry->low = dop->dop_low;
+		__entry->caller_ip = caller_ip;
 	),
-	TP_printk("dev %d:%d ops %p committed %d low %d",
+	TP_printk("dev %d:%d ops %p committed %d low %d, caller %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->dop,
 		  __entry->committed,
-		  __entry->low)
+		  __entry->low,
+		  (char *)__entry->caller_ip)
 )
 #define DEFINE_DEFER_EVENT(name) \
 DEFINE_EVENT(xfs_defer_class, name, \
-	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \
-	TP_ARGS(mp, dop))
+	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, \
+		 unsigned long caller_ip), \
+	TP_ARGS(mp, dop, caller_ip))
 
 DECLARE_EVENT_CLASS(xfs_defer_error_class,
 	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error),

From d686d12d23ae1a6a5a52ad2f794f3955985fd54d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 May 2018 07:49:09 -0700
Subject: [PATCH 027/121] xfs: don't assert fail with AIL lock held

Been hitting AIL ordering assert failures recently, but been unable
to trace them down because the system immediately hangs up onteh
spinlock that was held when this assert fires:

XFS: Assertion failed: XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0, file: fs/xfs/xfs_trans_ail.c, line: 52

Move the assertions outside of the spinlock so the corpse can
be dissected. Thanks to Brian Foster for supplying a clean
way of doing this.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_trans_ail.c | 43 +++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 50611d2bcbc2..41e280ef1483 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -32,30 +32,51 @@
 #ifdef DEBUG
 /*
  * Check that the list is sorted as it should be.
+ *
+ * Called with the ail lock held, but we don't want to assert fail with it
+ * held otherwise we'll lock everything up and won't be able to debug the
+ * cause. Hence we sample and check the state under the AIL lock and return if
+ * everything is fine, otherwise we drop the lock and run the ASSERT checks.
+ * Asserts may not be fatal, so pick the lock back up and continue onwards.
  */
 STATIC void
 xfs_ail_check(
-	struct xfs_ail	*ailp,
-	xfs_log_item_t	*lip)
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	*lip)
 {
-	xfs_log_item_t	*prev_lip;
+	struct xfs_log_item	*prev_lip;
+	struct xfs_log_item	*next_lip;
+	xfs_lsn_t		prev_lsn = NULLCOMMITLSN;
+	xfs_lsn_t		next_lsn = NULLCOMMITLSN;
+	xfs_lsn_t		lsn;
+	bool			in_ail;
+
 
 	if (list_empty(&ailp->ail_head))
 		return;
 
 	/*
-	 * Check the next and previous entries are valid.
+	 * Sample then check the next and previous entries are valid.
 	 */
-	ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
-	prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
+	in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags);
+	prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail);
 	if (&prev_lip->li_ail != &ailp->ail_head)
-		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-
-	prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
-	if (&prev_lip->li_ail != &ailp->ail_head)
-		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
+		prev_lsn = prev_lip->li_lsn;
+	next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail);
+	if (&next_lip->li_ail != &ailp->ail_head)
+		next_lsn = next_lip->li_lsn;
+	lsn = lip->li_lsn;
 
+	if (in_ail &&
+	    (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) &&
+	    (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0))
+		return;
 
+	spin_unlock(&ailp->ail_lock);
+	ASSERT(in_ail);
+	ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0);
+	ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0);
+	spin_lock(&ailp->ail_lock);
 }
 #else /* !DEBUG */
 #define	xfs_ail_check(a,l)

From 3565b660e56a78dea189fed75f359fe6d59b6023 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 May 2018 07:49:09 -0700
Subject: [PATCH 028/121] xfs: fix double ijoin in xfs_inactive_symlink_rmt()

xfs_inactive_symlink_rmt() does something nasty - it joins an inode
into a transaction it is already joined to. This means the inode can
have multiple log item descriptors attached to the transaction for
it. This breaks teh 1:1 mapping that is supposed to exist
between the log item and log item descriptor.

This results in the log item being processed twice during
transaction commit and CIL formatting, and there are lots of other
potential issues tha arise from double processing of log items in
the transaction commit state machine.

In this case, the inode is already held by the rolling transaction
returned from xfs_defer_finish(), so there's no need to join it
again.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_symlink.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 88643a8fd487..aed03da637d4 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -489,16 +489,11 @@ xfs_inactive_symlink_rmt(
 	error = xfs_defer_finish(&tp, &dfops);
 	if (error)
 		goto error_bmap_cancel;
-	/*
-	 * The first xact was committed, so add the inode to the new one.
-	 * Mark it dirty so it will be logged and moved forward in the log as
-	 * part of every commit.
-	 */
-	xfs_trans_ijoin(tp, ip, 0);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
 	/*
 	 * Commit the transaction containing extent freeing and EFDs.
 	 */
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	error = xfs_trans_commit(tp);
 	if (error) {
 		ASSERT(XFS_FORCED_SHUTDOWN(mp));

From c5295c6aad536ba7c257016632792f3031c21182 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 May 2018 07:49:09 -0700
Subject: [PATCH 029/121] xfs: fix double ijoin in xfs_reflink_cancel_cow_range

xfs_reflink_cancel_cow_range joins an inode twice to the same
transaction.  This is not allowed, so fix it and document that the
callers of xfs_reflink_cancel_cow_blocks() must have already joined the
inode to the permanent transaction passed in.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
[darrick: edited the commit log to remove trace for nonexistent ASSERT]
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_reflink.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cdbd342a5249..bce2b5351d64 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -552,6 +552,9 @@ xfs_reflink_trim_irec_to_next_cow(
  *
  * If cancel_real is true this function cancels all COW fork extents for the
  * inode; if cancel_real is false, real extents are not cleared.
+ *
+ * Caller must have already joined the inode to the current transaction. The
+ * inode will be joined to the transaction returned to the caller.
  */
 int
 xfs_reflink_cancel_cow_blocks(
@@ -592,7 +595,6 @@ xfs_reflink_cancel_cow_blocks(
 			if (error)
 				break;
 		} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
-			xfs_trans_ijoin(*tpp, ip, 0);
 			xfs_defer_init(&dfops, &firstfsb);
 
 			/* Free the CoW orphan record. */
@@ -1570,6 +1572,7 @@ xfs_reflink_clear_inode_flag(
 	 * We didn't find any shared blocks so turn off the reflink flag.
 	 * First, get rid of any leftover CoW mappings.
 	 */
+	xfs_trans_ijoin(*tpp, ip, 0);
 	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
 	if (error)
 		return error;
@@ -1578,7 +1581,6 @@ xfs_reflink_clear_inode_flag(
 	trace_xfs_reflink_unset_inode_flag(ip);
 	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
 	xfs_inode_clear_cowblocks_tag(ip);
-	xfs_trans_ijoin(*tpp, ip, 0);
 	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
 
 	return error;

From 844e5e74c1a8256f2b0f89642ed9847b04fe2450 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 May 2018 07:49:10 -0700
Subject: [PATCH 030/121] xfs: fix double ijoin in
 xfs_reflink_clear_inode_flag()

xfs_reflink_clear_inode_flag double-joins an inode to a transaction,
which is not allowed.  Fix that and document that the caller must have
already joined it.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
[darrick: edit out trace for nonexistent ASSERT]
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_reflink.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index bce2b5351d64..12d441a73b53 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1553,7 +1553,12 @@ next:
 	return 0;
 }
 
-/* Clear the inode reflink flag if there are no shared extents. */
+/*
+ * Clear the inode reflink flag if there are no shared extents.
+ *
+ * The caller is responsible for joining the inode to the transaction passed in.
+ * The inode will be joined to the transaction that is returned to the caller.
+ */
 int
 xfs_reflink_clear_inode_flag(
 	struct xfs_inode	*ip,
@@ -1572,7 +1577,6 @@ xfs_reflink_clear_inode_flag(
 	 * We didn't find any shared blocks so turn off the reflink flag.
 	 * First, get rid of any leftover CoW mappings.
 	 */
-	xfs_trans_ijoin(*tpp, ip, 0);
 	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
 	if (error)
 		return error;

From 1a2ebf835a3c23e0491e3279959cfb6c65a6ebbb Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 May 2018 07:49:10 -0700
Subject: [PATCH 031/121] xfs: add some more debug checks to buffer log item
 reuse

Just to make sure the item isn't associated with another
transaction when we try to reuse it.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_buf_item.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index df62082f2204..8d6ed045b643 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -745,8 +745,10 @@ xfs_buf_item_init(
 	 * nothing to do here so return.
 	 */
 	ASSERT(bp->b_target->bt_mount == mp);
-	if (bip != NULL) {
+	if (bip) {
 		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+		ASSERT(!bp->b_transp);
+		ASSERT(bip->bli_buf == bp);
 		return 0;
 	}
 

From e6631f85546c8ff8842f62c73be44ff502d4287a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 9 May 2018 07:49:37 -0700
Subject: [PATCH 032/121] xfs: get rid of the log item descriptor

It's just a connector between a transaction and a log item. There's
a 1:1 relationship between a log item descriptor and a log item,
and a 1:1 relationship between a log item descriptor and a
transaction. Both relationships are created and terminated at the
same time, so why do we even have the descriptor?

Replace it with a specific list_head in the log item and a new
log item dirtied flag to replace the XFS_LID_DIRTY flag.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
[darrick: fix up deferred agfl intent finish_item use of LID_DIRTY]
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c    |  8 ++---
 fs/xfs/libxfs/xfs_shared.h  | 15 ----------
 fs/xfs/xfs_buf_item.c       |  2 +-
 fs/xfs/xfs_icreate_item.c   |  2 +-
 fs/xfs/xfs_log.c            | 10 +++----
 fs/xfs/xfs_log_cil.c        | 21 ++++++--------
 fs/xfs/xfs_super.c          | 10 +------
 fs/xfs/xfs_trace.h          |  5 +---
 fs/xfs/xfs_trans.c          | 58 ++++++++++---------------------------
 fs/xfs/xfs_trans.h          |  8 ++---
 fs/xfs/xfs_trans_bmap.c     |  4 +--
 fs/xfs/xfs_trans_buf.c      | 22 ++++++--------
 fs/xfs/xfs_trans_dquot.c    |  4 +--
 fs/xfs/xfs_trans_extfree.c  |  6 ++--
 fs/xfs/xfs_trans_inode.c    |  3 +-
 fs/xfs/xfs_trans_priv.h     |  1 -
 fs/xfs/xfs_trans_refcount.c |  4 +--
 fs/xfs/xfs_trans_rmap.c     |  4 +--
 18 files changed, 63 insertions(+), 124 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index ddb5f5336ecf..3e5e9a667bc8 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -246,7 +246,7 @@ xfs_bmap_get_bp(
 	struct xfs_btree_cur	*cur,
 	xfs_fsblock_t		bno)
 {
-	struct xfs_log_item_desc *lidp;
+	struct xfs_log_item	*lip;
 	int			i;
 
 	if (!cur)
@@ -260,9 +260,9 @@ xfs_bmap_get_bp(
 	}
 
 	/* Chase down all the log items to see if the bp is there */
-	list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
-		struct xfs_buf_log_item	*bip;
-		bip = (struct xfs_buf_log_item *)lidp->lid_item;
+	list_for_each_entry(lip, &cur->bc_tp->t_items, li_trans) {
+		struct xfs_buf_log_item	*bip = (struct xfs_buf_log_item *)lip;
+
 		if (bip->bli_item.li_type == XFS_LI_BUF &&
 		    XFS_BUF_ADDR(bip->bli_buf) == bno)
 			return bip->bli_buf;
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index d0b84da0cb1e..8efc06e62b13 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -57,21 +57,6 @@ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 extern const struct xfs_buf_ops xfs_rtbuf_ops;
 
-/*
- * This structure is used to track log items associated with
- * a transaction.  It points to the log item and keeps some
- * flags to track the state of the log item.  It also tracks
- * the amount of space needed to log the item it describes
- * once we get to commit processing (see xfs_trans_commit()).
- */
-struct xfs_log_item_desc {
-	struct xfs_log_item	*lid_item;
-	struct list_head	lid_trans;
-	unsigned char		lid_flags;
-};
-
-#define XFS_LID_DIRTY		0x1
-
 /* log size calculation functions */
 int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
 int	xfs_log_calc_minimum_size(struct xfs_mount *);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 8d6ed045b643..c2311379d1c3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -438,7 +438,7 @@ xfs_buf_item_unpin(
 			 * xfs_trans_uncommit() will try to reference the
 			 * buffer which we no longer have a hold on.
 			 */
-			if (lip->li_desc)
+			if (!list_empty(&lip->li_trans))
 				xfs_trans_del_item(lip);
 
 			/*
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index a99a0f8aa528..5da9599156ed 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -184,5 +184,5 @@ xfs_icreate_log(
 
 	xfs_trans_add_item(tp, &icp->ic_item);
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags);
 }
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e427864434c1..c21039f27e39 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1047,6 +1047,7 @@ xfs_log_item_init(
 	INIT_LIST_HEAD(&item->li_ail);
 	INIT_LIST_HEAD(&item->li_cil);
 	INIT_LIST_HEAD(&item->li_bio_list);
+	INIT_LIST_HEAD(&item->li_trans);
 }
 
 /*
@@ -2110,10 +2111,10 @@ xlog_print_tic_res(
  */
 void
 xlog_print_trans(
-	struct xfs_trans		*tp)
+	struct xfs_trans	*tp)
 {
-	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_log_item_desc	*lidp;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_log_item	*lip;
 
 	/* dump core transaction and ticket info */
 	xfs_warn(mp, "transaction summary:");
@@ -2124,8 +2125,7 @@ xlog_print_trans(
 	xlog_print_tic_res(mp, tp->t_ticket);
 
 	/* dump each log item */
-	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-		struct xfs_log_item	*lip = lidp->lid_item;
+	list_for_each_entry(lip, &tp->t_items, li_trans) {
 		struct xfs_log_vec	*lv = lip->li_lv;
 		struct xfs_log_iovec	*vec;
 		int			i;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index a8675c631438..c15687724728 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -141,10 +141,9 @@ xlog_cil_alloc_shadow_bufs(
 	struct xlog		*log,
 	struct xfs_trans	*tp)
 {
-	struct xfs_log_item_desc *lidp;
+	struct xfs_log_item	*lip;
 
-	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-		struct xfs_log_item *lip = lidp->lid_item;
+	list_for_each_entry(lip, &tp->t_items, li_trans) {
 		struct xfs_log_vec *lv;
 		int	niovecs = 0;
 		int	nbytes = 0;
@@ -152,7 +151,7 @@ xlog_cil_alloc_shadow_bufs(
 		bool	ordered = false;
 
 		/* Skip items which aren't dirty in this transaction. */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
 			continue;
 
 		/* get number of vecs and size of data to be stored */
@@ -317,7 +316,7 @@ xlog_cil_insert_format_items(
 	int			*diff_len,
 	int			*diff_iovecs)
 {
-	struct xfs_log_item_desc *lidp;
+	struct xfs_log_item	*lip;
 
 
 	/* Bail out if we didn't find a log item.  */
@@ -326,15 +325,14 @@ xlog_cil_insert_format_items(
 		return;
 	}
 
-	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-		struct xfs_log_item *lip = lidp->lid_item;
+	list_for_each_entry(lip, &tp->t_items, li_trans) {
 		struct xfs_log_vec *lv;
 		struct xfs_log_vec *old_lv = NULL;
 		struct xfs_log_vec *shadow;
 		bool	ordered = false;
 
 		/* Skip items which aren't dirty in this transaction. */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
 			continue;
 
 		/*
@@ -406,7 +404,7 @@ xlog_cil_insert_items(
 {
 	struct xfs_cil		*cil = log->l_cilp;
 	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
-	struct xfs_log_item_desc *lidp;
+	struct xfs_log_item	*lip;
 	int			len = 0;
 	int			diff_iovecs = 0;
 	int			iclog_space;
@@ -479,11 +477,10 @@ xlog_cil_insert_items(
 	 * We do this here so we only need to take the CIL lock once during
 	 * the transaction commit.
 	 */
-	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-		struct xfs_log_item	*lip = lidp->lid_item;
+	list_for_each_entry(lip, &tp->t_items, li_trans) {
 
 		/* Skip items which aren't dirty in this transaction. */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+		if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
 			continue;
 
 		/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d71424052917..5726ef496980 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1880,11 +1880,6 @@ xfs_init_zones(void)
 	if (!xfs_trans_zone)
 		goto out_destroy_ifork_zone;
 
-	xfs_log_item_desc_zone =
-		kmem_zone_init(sizeof(struct xfs_log_item_desc),
-			       "xfs_log_item_desc");
-	if (!xfs_log_item_desc_zone)
-		goto out_destroy_trans_zone;
 
 	/*
 	 * The size of the zone allocated buf log item is the maximum
@@ -1894,7 +1889,7 @@ xfs_init_zones(void)
 	xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item),
 					   "xfs_buf_item");
 	if (!xfs_buf_item_zone)
-		goto out_destroy_log_item_desc_zone;
+		goto out_destroy_trans_zone;
 
 	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
 			((XFS_EFD_MAX_FAST_EXTENTS - 1) *
@@ -1982,8 +1977,6 @@ xfs_init_zones(void)
 	kmem_zone_destroy(xfs_efd_zone);
  out_destroy_buf_item_zone:
 	kmem_zone_destroy(xfs_buf_item_zone);
- out_destroy_log_item_desc_zone:
-	kmem_zone_destroy(xfs_log_item_desc_zone);
  out_destroy_trans_zone:
 	kmem_zone_destroy(xfs_trans_zone);
  out_destroy_ifork_zone:
@@ -2022,7 +2015,6 @@ xfs_destroy_zones(void)
 	kmem_zone_destroy(xfs_efi_zone);
 	kmem_zone_destroy(xfs_efd_zone);
 	kmem_zone_destroy(xfs_buf_item_zone);
-	kmem_zone_destroy(xfs_log_item_desc_zone);
 	kmem_zone_destroy(xfs_trans_zone);
 	kmem_zone_destroy(xfs_ifork_zone);
 	kmem_zone_destroy(xfs_da_state_zone);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e2ffc1e01bd8..9d4c4ca24fe6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -441,7 +441,6 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
 		__field(unsigned, bli_recur)
 		__field(int, bli_refcount)
 		__field(unsigned, bli_flags)
-		__field(void *, li_desc)
 		__field(unsigned long, li_flags)
 	),
 	TP_fast_assign(
@@ -455,12 +454,11 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
 		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
 		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
 		__entry->buf_lockval = bip->bli_buf->b_sema.count;
-		__entry->li_desc = bip->bli_item.li_desc;
 		__entry->li_flags = bip->bli_item.li_flags;
 	),
 	TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
 		  "lock %d flags %s recur %d refcount %d bliflags %s "
-		  "lidesc %p liflags %s",
+		  "liflags %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long long)__entry->buf_bno,
 		  __entry->buf_len,
@@ -471,7 +469,6 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
 		  __entry->bli_recur,
 		  __entry->bli_refcount,
 		  __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
-		  __entry->li_desc,
 		  __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
 )
 
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 4bbdb6faec30..fc7ba75b8b69 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -34,7 +34,6 @@
 #include "xfs_defer.h"
 
 kmem_zone_t	*xfs_trans_zone;
-kmem_zone_t	*xfs_log_item_desc_zone;
 
 #if defined(CONFIG_TRACEPOINTS)
 static void
@@ -734,77 +733,52 @@ out:
 	return;
 }
 
-/*
- * Add the given log item to the transaction's list of log items.
- *
- * The log item will now point to its new descriptor with its li_desc field.
- */
+/* Add the given log item to the transaction's list of log items. */
 void
 xfs_trans_add_item(
 	struct xfs_trans	*tp,
 	struct xfs_log_item	*lip)
 {
-	struct xfs_log_item_desc *lidp;
-
 	ASSERT(lip->li_mountp == tp->t_mountp);
 	ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
+	ASSERT(list_empty(&lip->li_trans));
+	ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags));
 
-	lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
-
-	lidp->lid_item = lip;
-	lidp->lid_flags = 0;
-	list_add_tail(&lidp->lid_trans, &tp->t_items);
-
-	lip->li_desc = lidp;
-
+	list_add_tail(&lip->li_trans, &tp->t_items);
 	trace_xfs_trans_add_item(tp, _RET_IP_);
 }
 
-STATIC void
-xfs_trans_free_item_desc(
-	struct xfs_log_item_desc *lidp)
-{
-	list_del_init(&lidp->lid_trans);
-	kmem_zone_free(xfs_log_item_desc_zone, lidp);
-}
-
 /*
- * Unlink and free the given descriptor.
+ * Unlink the log item from the transaction. the log item is no longer
+ * considered dirty in this transaction, as the linked transaction has
+ * finished, either by abort or commit completion.
  */
 void
 xfs_trans_del_item(
 	struct xfs_log_item	*lip)
 {
-	xfs_trans_free_item_desc(lip->li_desc);
-	lip->li_desc = NULL;
+	clear_bit(XFS_LI_DIRTY, &lip->li_flags);
+	list_del_init(&lip->li_trans);
 }
 
-/*
- * Unlock all of the items of a transaction and free all the descriptors
- * of that transaction.
- */
+/* Detach and unlock all of the items in a transaction */
 void
 xfs_trans_free_items(
 	struct xfs_trans	*tp,
 	xfs_lsn_t		commit_lsn,
 	bool			abort)
 {
-	struct xfs_log_item_desc *lidp, *next;
+	struct xfs_log_item	*lip, *next;
 
 	trace_xfs_trans_free_items(tp, _RET_IP_);
 
-	list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
-		struct xfs_log_item	*lip = lidp->lid_item;
-
-		lip->li_desc = NULL;
-
+	list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+		xfs_trans_del_item(lip);
 		if (commit_lsn != NULLCOMMITLSN)
 			lip->li_ops->iop_committing(lip, commit_lsn);
 		if (abort)
 			set_bit(XFS_LI_ABORTED, &lip->li_flags);
 		lip->li_ops->iop_unlock(lip);
-
-		xfs_trans_free_item_desc(lidp);
 	}
 }
 
@@ -1052,10 +1026,10 @@ xfs_trans_cancel(
 	}
 #ifdef DEBUG
 	if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
-		struct xfs_log_item_desc *lidp;
+		struct xfs_log_item *lip;
 
-		list_for_each_entry(lidp, &tp->t_items, lid_trans)
-			ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
+		list_for_each_entry(lip, &tp->t_items, li_trans)
+			ASSERT(!(lip->li_type == XFS_LI_EFD));
 	}
 #endif
 	xfs_trans_unreserve_and_mod_sb(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ca449036f820..a386d8af56d2 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -27,7 +27,6 @@ struct xfs_efi_log_item;
 struct xfs_inode;
 struct xfs_item_ops;
 struct xfs_log_iovec;
-struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_trans_res;
@@ -43,8 +42,8 @@ struct xfs_bud_log_item;
 
 typedef struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
+	struct list_head		li_trans;	/* transaction list */
 	xfs_lsn_t			li_lsn;		/* last on-disk lsn */
-	struct xfs_log_item_desc	*li_desc;	/* ptr to current desc*/
 	struct xfs_mount		*li_mountp;	/* ptr to fs mount */
 	struct xfs_ail			*li_ailp;	/* ptr to AIL */
 	uint				li_type;	/* item type */
@@ -72,11 +71,13 @@ typedef struct xfs_log_item {
 #define	XFS_LI_IN_AIL	0
 #define	XFS_LI_ABORTED	1
 #define	XFS_LI_FAILED	2
+#define	XFS_LI_DIRTY	3	/* log item dirty in transaction */
 
 #define XFS_LI_FLAGS \
 	{ (1 << XFS_LI_IN_AIL),		"IN_AIL" }, \
 	{ (1 << XFS_LI_ABORTED),	"ABORTED" }, \
-	{ (1 << XFS_LI_FAILED),		"FAILED" }
+	{ (1 << XFS_LI_FAILED),		"FAILED" }, \
+	{ (1 << XFS_LI_DIRTY),		"DIRTY" }
 
 struct xfs_item_ops {
 	void (*iop_size)(xfs_log_item_t *, int *, int *);
@@ -248,7 +249,6 @@ void		xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
 					struct xfs_buf *src_bp);
 
 extern kmem_zone_t	*xfs_trans_zone;
-extern kmem_zone_t	*xfs_log_item_desc_zone;
 
 /* rmap updates */
 enum xfs_rmap_intent_type;
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 14543d93cd4b..230a21df4b12 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -79,7 +79,7 @@ xfs_trans_log_finish_bmap_update(
 	 * 2.) shuts down the filesystem
 	 */
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
 
 	return error;
 }
@@ -158,7 +158,7 @@ xfs_bmap_update_log_item(
 	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
 
 	/*
 	 * atomic_inc_return gives us the value after the increment;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 0081e9b3decf..a8ddb4eed279 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,7 +40,7 @@ xfs_trans_buf_item_match(
 	struct xfs_buf_map	*map,
 	int			nmaps)
 {
-	struct xfs_log_item_desc *lidp;
+	struct xfs_log_item	*lip;
 	struct xfs_buf_log_item	*blip;
 	int			len = 0;
 	int			i;
@@ -48,8 +48,8 @@ xfs_trans_buf_item_match(
 	for (i = 0; i < nmaps; i++)
 		len += map[i].bm_len;
 
-	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-		blip = (struct xfs_buf_log_item *)lidp->lid_item;
+	list_for_each_entry(lip, &tp->t_items, li_trans) {
+		blip = (struct xfs_buf_log_item *)lip;
 		if (blip->bli_item.li_type == XFS_LI_BUF &&
 		    blip->bli_buf->b_target == target &&
 		    XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn &&
@@ -100,14 +100,10 @@ _xfs_trans_bjoin(
 	atomic_inc(&bip->bli_refcount);
 
 	/*
-	 * Get a log_item_desc to point at the new item.
+	 * Attach the item to the transaction so we can find it in
+	 * xfs_trans_get_buf() and friends.
 	 */
 	xfs_trans_add_item(tp, &bip->bli_item);
-
-	/*
-	 * Initialize b_fsprivate2 so we can find it with incore_match()
-	 * in xfs_trans_get_buf() and friends above.
-	 */
 	bp->b_transp = tp;
 
 }
@@ -391,7 +387,7 @@ xfs_trans_brelse(
 	 * If the buffer is dirty within this transaction, we can't
 	 * release it until we commit.
 	 */
-	if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
+	if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags))
 		return;
 
 	/*
@@ -542,7 +538,7 @@ xfs_trans_dirty_buf(
 	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
 }
 
 /*
@@ -626,7 +622,7 @@ xfs_trans_binval(
 		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
 		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
 		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
-		ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
+		ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags));
 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
 		return;
 	}
@@ -642,7 +638,7 @@ xfs_trans_binval(
 		memset(bip->bli_formats[i].blf_data_map, 0,
 		       (bip->bli_formats[i].blf_map_size * sizeof(uint)));
 	}
-	bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
 	tp->t_flags |= XFS_TRANS_DIRTY;
 }
 
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c3d547211d16..c381c02cca45 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -77,7 +77,7 @@ xfs_trans_log_dquot(
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags);
 }
 
 /*
@@ -879,7 +879,7 @@ xfs_trans_log_quotaoff_item(
 	xfs_qoff_logitem_t	*qlp)
 {
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags);
 }
 
 STATIC void
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index f5620796ae25..c60395eea21f 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -90,7 +90,7 @@ xfs_trans_free_extent(
 	 * 2.) shuts down the filesystem
 	 */
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
 
 	next_extent = efdp->efd_next_extent;
 	ASSERT(next_extent < efdp->efd_format.efd_nextents);
@@ -155,7 +155,7 @@ xfs_extent_free_log_item(
 	free = container_of(item, struct xfs_extent_free_item, xefi_list);
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
 
 	/*
 	 * atomic_inc_return gives us the value after the increment;
@@ -273,7 +273,7 @@ xfs_agfl_free_finish_item(
 	 * 2.) shuts down the filesystem
 	 */
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
 
 	next_extent = efdp->efd_next_extent;
 	ASSERT(next_extent < efdp->efd_format.efd_nextents);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 07cea592dc01..f7bd7960a90f 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -133,14 +133,13 @@ xfs_trans_log_inode(
 	 * set however, then go ahead and bump the i_version counter
 	 * unconditionally.
 	 */
-	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+	if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) &&
 	    IS_I_VERSION(VFS_I(ip))) {
 		if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE))
 			flags |= XFS_ILOG_CORE;
 	}
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
 	/*
 	 * Always OR in the bits from the ili_last_fields field.
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 43f773297b9d..9717ae74b36d 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -19,7 +19,6 @@
 #define	__XFS_TRANS_PRIV_H__
 
 struct xfs_log_item;
-struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_ail;
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
index 94c1877af834..c7f8e82f5bda 100644
--- a/fs/xfs/xfs_trans_refcount.c
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -77,7 +77,7 @@ xfs_trans_log_finish_refcount_update(
 	 * 2.) shuts down the filesystem
 	 */
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
 
 	return error;
 }
@@ -154,7 +154,7 @@ xfs_refcount_update_log_item(
 	refc = container_of(item, struct xfs_refcount_intent, ri_list);
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
 
 	/*
 	 * atomic_inc_return gives us the value after the increment;
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index 9b577beb43d7..5831ca0c270b 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -117,7 +117,7 @@ xfs_trans_log_finish_rmap_update(
 	 * 2.) shuts down the filesystem
 	 */
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
 
 	return error;
 }
@@ -175,7 +175,7 @@ xfs_rmap_update_log_item(
 	rmap = container_of(item, struct xfs_rmap_intent, ri_list);
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
-	ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+	set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
 
 	/*
 	 * atomic_inc_return gives us the value after the increment;

From fcb762f5de2e534ab47b5f034fe484c2b25b4d51 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 9 May 2018 08:45:04 -0700
Subject: [PATCH 033/121] xfs: add bmapi nodiscard flag

Freed extents are unconditionally discarded when online discard is
enabled. Define XFS_BMAPI_NODISCARD to allow callers to bypass
discards when unnecessary. For example, this will be useful for
eofblocks trimming.

This patch does not change behavior.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_alloc.c  | 10 +++++++---
 fs/xfs/libxfs/xfs_alloc.h  | 27 +++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_bmap.c   | 17 +++++++++++++----
 fs/xfs/libxfs/xfs_bmap.h   | 33 ++++++++++++++++++++++++++++++---
 fs/xfs/xfs_extfree_item.c  |  2 +-
 fs/xfs/xfs_trans.h         |  3 ++-
 fs/xfs/xfs_trans_extfree.c | 13 +++++++++----
 7 files changed, 87 insertions(+), 18 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 350ad203b082..5410635893df 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -3009,18 +3009,20 @@ out:
  * after fixing up the freelist.
  */
 int				/* error */
-xfs_free_extent(
+__xfs_free_extent(
 	struct xfs_trans	*tp,	/* transaction pointer */
 	xfs_fsblock_t		bno,	/* starting block number of extent */
 	xfs_extlen_t		len,	/* length of extent */
 	struct xfs_owner_info	*oinfo,	/* extent owner */
-	enum xfs_ag_resv_type	type)	/* block reservation type */
+	enum xfs_ag_resv_type	type,	/* block reservation type */
+	bool			skip_discard)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	struct xfs_buf		*agbp;
 	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, bno);
 	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(mp, bno);
 	int			error;
+	unsigned int		busy_flags = 0;
 
 	ASSERT(len != 0);
 	ASSERT(type != XFS_AG_RESV_AGFL);
@@ -3044,7 +3046,9 @@ xfs_free_extent(
 	if (error)
 		goto err;
 
-	xfs_extent_busy_insert(tp, agno, agbno, len, 0);
+	if (skip_discard)
+		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
+	xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
 	return 0;
 
 err:
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 949e21326066..1dcac78586b4 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -191,12 +191,35 @@ xfs_alloc_vextent(
  * Free an extent.
  */
 int				/* error */
-xfs_free_extent(
+__xfs_free_extent(
 	struct xfs_trans	*tp,	/* transaction pointer */
 	xfs_fsblock_t		bno,	/* starting block number of extent */
 	xfs_extlen_t		len,	/* length of extent */
 	struct xfs_owner_info	*oinfo,	/* extent owner */
-	enum xfs_ag_resv_type	type);	/* block reservation type */
+	enum xfs_ag_resv_type	type,	/* block reservation type */
+	bool			skip_discard);
+
+static inline int
+xfs_free_extent(
+	struct xfs_trans	*tp,
+	xfs_fsblock_t		bno,
+	xfs_extlen_t		len,
+	struct xfs_owner_info	*oinfo,
+	enum xfs_ag_resv_type	type)
+{
+	return __xfs_free_extent(tp, bno, len, oinfo, type, false);
+}
+
+static inline int
+xfs_free_extent_nodiscard(
+	struct xfs_trans	*tp,
+	xfs_fsblock_t		bno,
+	xfs_extlen_t		len,
+	struct xfs_owner_info	*oinfo,
+	enum xfs_ag_resv_type	type)
+{
+	return __xfs_free_extent(tp, bno, len, oinfo, type, true);
+}
 
 int				/* error */
 xfs_alloc_lookup_le(
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 3e5e9a667bc8..534ffb856c4a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -544,12 +544,13 @@ xfs_bmap_validate_ret(
  * The list is maintained sorted (by block number).
  */
 void
-xfs_bmap_add_free(
+__xfs_bmap_add_free(
 	struct xfs_mount		*mp,
 	struct xfs_defer_ops		*dfops,
 	xfs_fsblock_t			bno,
 	xfs_filblks_t			len,
-	struct xfs_owner_info		*oinfo)
+	struct xfs_owner_info		*oinfo,
+	bool				skip_discard)
 {
 	struct xfs_extent_free_item	*new;		/* new element */
 #ifdef DEBUG
@@ -576,6 +577,7 @@ xfs_bmap_add_free(
 		new->xefi_oinfo = *oinfo;
 	else
 		xfs_rmap_skip_owner_update(&new->xefi_oinfo);
+	new->xefi_skip_discard = skip_discard;
 	trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0,
 			XFS_FSB_TO_AGBNO(mp, bno), len);
 	xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
@@ -5106,9 +5108,16 @@ xfs_bmap_del_extent_real(
 			error = xfs_refcount_decrease_extent(mp, dfops, del);
 			if (error)
 				goto done;
-		} else
-			xfs_bmap_add_free(mp, dfops, del->br_startblock,
+		} else {
+			if (bflags & XFS_BMAPI_NODISCARD) {
+				xfs_bmap_add_free_nodiscard(mp, dfops,
+					del->br_startblock, del->br_blockcount,
+					NULL);
+			} else {
+				xfs_bmap_add_free(mp, dfops, del->br_startblock,
 					del->br_blockcount, NULL);
+			}
+		}
 	}
 
 	/*
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 2b766b37096d..8d8946bfdd8c 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -68,6 +68,7 @@ struct xfs_extent_free_item
 	xfs_extlen_t		xefi_blockcount;/* number of blocks in extent */
 	struct list_head	xefi_list;
 	struct xfs_owner_info	xefi_oinfo;	/* extent owner */
+	bool			xefi_skip_discard;
 };
 
 #define	XFS_BMAP_MAX_NMAP	4
@@ -116,6 +117,9 @@ struct xfs_extent_free_item
 /* Only convert unwritten extents, don't allocate new blocks */
 #define XFS_BMAPI_CONVERT_ONLY	0x800
 
+/* Skip online discard of freed extents */
+#define XFS_BMAPI_NODISCARD	0x1000
+
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
@@ -128,7 +132,8 @@ struct xfs_extent_free_item
 	{ XFS_BMAPI_REMAP,	"REMAP" }, \
 	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
 	{ XFS_BMAPI_DELALLOC,	"DELALLOC" }, \
-	{ XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }
+	{ XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
+	{ XFS_BMAPI_NODISCARD,	"NODISCARD" }
 
 
 static inline int xfs_bmapi_aflag(int w)
@@ -192,9 +197,9 @@ void	xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
 void	xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void	xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+void	__xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
 			  xfs_fsblock_t bno, xfs_filblks_t len,
-			  struct xfs_owner_info *oinfo);
+			  struct xfs_owner_info *oinfo, bool skip_discard);
 void	xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
 int	xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -240,6 +245,28 @@ int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
 		int eof);
 
+static inline void
+xfs_bmap_add_free(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	xfs_fsblock_t			bno,
+	xfs_filblks_t			len,
+	struct xfs_owner_info		*oinfo)
+{
+	__xfs_bmap_add_free(mp, dfops, bno, len, oinfo, false);
+}
+
+static inline void
+xfs_bmap_add_free_nodiscard(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	xfs_fsblock_t			bno,
+	xfs_filblks_t			len,
+	struct xfs_owner_info		*oinfo)
+{
+	__xfs_bmap_add_free(mp, dfops, bno, len, oinfo, true);
+}
+
 enum xfs_bmap_intent_type {
 	XFS_BMAP_MAP = 1,
 	XFS_BMAP_UNMAP,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 70b7d48af6d6..a889b550979a 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -542,7 +542,7 @@ xfs_efi_recover(
 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
 		extp = &efip->efi_format.efi_extents[i];
 		error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
-					      extp->ext_len, &oinfo);
+					      extp->ext_len, &oinfo, false);
 		if (error)
 			goto abort_error;
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a386d8af56d2..29706b8b3bd4 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -235,7 +235,8 @@ struct xfs_efd_log_item	*xfs_trans_get_efd(struct xfs_trans *,
 				  uint);
 int		xfs_trans_free_extent(struct xfs_trans *,
 				      struct xfs_efd_log_item *, xfs_fsblock_t,
-				      xfs_extlen_t, struct xfs_owner_info *);
+				      xfs_extlen_t, struct xfs_owner_info *,
+				      bool);
 int		xfs_trans_commit(struct xfs_trans *);
 int		xfs_trans_roll(struct xfs_trans **);
 int		xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index c60395eea21f..28a1d5a0467c 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -68,7 +68,8 @@ xfs_trans_free_extent(
 	struct xfs_efd_log_item	*efdp,
 	xfs_fsblock_t		start_block,
 	xfs_extlen_t		ext_len,
-	struct xfs_owner_info	*oinfo)
+	struct xfs_owner_info	*oinfo,
+	bool			skip_discard)
 {
 	struct xfs_mount	*mp = tp->t_mountp;
 	uint			next_extent;
@@ -79,8 +80,12 @@ xfs_trans_free_extent(
 
 	trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
 
-	error = xfs_free_extent(tp, start_block, ext_len, oinfo,
-			XFS_AG_RESV_NONE);
+	if (skip_discard)
+		error = xfs_free_extent_nodiscard(tp, start_block, ext_len,
+						  oinfo, XFS_AG_RESV_NONE);
+	else
+		error = xfs_free_extent(tp, start_block, ext_len, oinfo,
+					XFS_AG_RESV_NONE);
 
 	/*
 	 * Mark the transaction dirty, even on error. This ensures the
@@ -195,7 +200,7 @@ xfs_extent_free_finish_item(
 	error = xfs_trans_free_extent(tp, done_item,
 			free->xefi_startblock,
 			free->xefi_blockcount,
-			&free->xefi_oinfo);
+			&free->xefi_oinfo, free->xefi_skip_discard);
 	kmem_free(free);
 	return error;
 }

From 13b86fc33718c6b504baa472437ae14a33abc138 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 9 May 2018 08:45:04 -0700
Subject: [PATCH 034/121] xfs: skip online discard during eofblocks trims

We've had reports of online discard operations being sent from XFS
on write-only workloads. These discards occur as a result of
eofblocks trims that can occur after a large file copy completes.

These discards are slightly confusing for users who might be paying
close attention to online discards (i.e., vdo) due to performance
sensitivity. They also happen to be spurious because freed post-eof
blocks by definition have not been written to during the current
allocation cycle.

Update xfs_free_eofblocks() to skip discards that are purely
attributed to eofblocks trims. This cuts down the number of spurious
discards that may occur on write-only workloads due to normal
preallocation activity.

Note that discards of post-eof extents can still occur from other
codepaths that do not isolate handling of post-eof blocks from those
within eof. For example, file unlinks and truncates may still cause
discards for any file blocks affected by the operation.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_bmap_util.c |  4 ++--
 fs/xfs/xfs_inode.c     | 19 +++++++++++--------
 fs/xfs/xfs_inode.h     | 24 ++++++++++++++++++++++--
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 8cd8c412f52d..696c3b6bd2c9 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -871,8 +871,8 @@ xfs_free_eofblocks(
 		 * contents of the file are flushed to disk then the files
 		 * may be full of holes (ie NULL files bug).
 		 */
-		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
-					      XFS_ISIZE(ip));
+		error = xfs_itruncate_extents_nodiscard(&tp, ip, XFS_DATA_FORK,
+							XFS_ISIZE(ip));
 		if (error) {
 			/*
 			 * If we get an error at this point we simply don't
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 42781bae6794..5b2e08488107 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1548,11 +1548,12 @@ xfs_itruncate_clear_reflink_flags(
  * dirty on error so that transactions can be easily aborted if possible.
  */
 int
-xfs_itruncate_extents(
+__xfs_itruncate_extents(
 	struct xfs_trans	**tpp,
 	struct xfs_inode	*ip,
 	int			whichfork,
-	xfs_fsize_t		new_size)
+	xfs_fsize_t		new_size,
+	bool			skip_discard)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp = *tpp;
@@ -1563,6 +1564,7 @@ xfs_itruncate_extents(
 	xfs_filblks_t		unmap_len;
 	int			error = 0;
 	int			done = 0;
+	int			flags;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
@@ -1575,6 +1577,10 @@ xfs_itruncate_extents(
 
 	trace_xfs_itruncate_extents_start(ip, new_size);
 
+	flags = xfs_bmapi_aflag(whichfork);
+	if (skip_discard)
+		flags |= XFS_BMAPI_NODISCARD;
+
 	/*
 	 * Since it is possible for space to become allocated beyond
 	 * the end of the file (in a crash where the space is allocated
@@ -1593,12 +1599,9 @@ xfs_itruncate_extents(
 	unmap_len = last_block - first_unmap_block + 1;
 	while (!done) {
 		xfs_defer_init(&dfops, &first_block);
-		error = xfs_bunmapi(tp, ip,
-				    first_unmap_block, unmap_len,
-				    xfs_bmapi_aflag(whichfork),
-				    XFS_ITRUNC_MAX_EXTENTS,
-				    &first_block, &dfops,
-				    &done);
+		error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
+				    XFS_ITRUNC_MAX_EXTENTS, &first_block,
+				    &dfops, &done);
 		if (error)
 			goto out_bmap_cancel;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1eebc53df7d7..3dd3201f4409 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -415,8 +415,8 @@ uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
 uint		xfs_ip2xflags(struct xfs_inode *);
 int		xfs_ifree(struct xfs_trans *, xfs_inode_t *,
 			   struct xfs_defer_ops *);
-int		xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
-				      int, xfs_fsize_t);
+int		__xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
+				        int, xfs_fsize_t, bool);
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
 
 void		xfs_iunpin_wait(xfs_inode_t *);
@@ -433,6 +433,26 @@ int		xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
 			       xfs_nlink_t, dev_t, prid_t,
 			       struct xfs_inode **);
 
+static inline int
+xfs_itruncate_extents(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	xfs_fsize_t		new_size)
+{
+	return __xfs_itruncate_extents(tpp, ip, whichfork, new_size, false);
+}
+
+static inline int
+xfs_itruncate_extents_nodiscard(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	xfs_fsize_t		new_size)
+{
+	return __xfs_itruncate_extents(tpp, ip, whichfork, new_size, true);
+}
+
 /* from xfs_file.c */
 enum xfs_prealloc_flags {
 	XFS_PREALLOC_SET	= (1 << 1),

From 84ca484ecf2f8e1dc3afddc895cb9b62c531db49 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 9 May 2018 08:45:05 -0700
Subject: [PATCH 035/121] xfs: don't discard on free of unwritten extents

Unwritten extents by definition have not been written to until they
are converted to normal written extents. If unwritten extents are
freed from a file, it is therefore guaranteed that the blocks have
not been written to since allocation (note that zero range punches
and reallocates blocks).

To cut down on online discards generated from workloads that make
use of preallocation, skip discards of extents if they are in the
unwritten state when the extent is freed.

Note that this optimization does not apply to log recovery, during
which all freed extents are discarded if online discard is enabled.
Also note that it may be possible for a filesystem crash to occur
after write completion of an unwritten extent but before unwritten
conversion such that the extent remains unwritten after log
recovery. Since this pseudo-inconsistency may already be possible
after a crash (consider writing to recently allocated blocks where
the allocation transaction is lost after a crash), this change
shouldn't introduce any fundamental limitations that don't already
exist. In short, on storage stacks where discards are important,
it's good practice to run an occasional fstrim even with online
discard enabled in the filesystem, particularly after a crash.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 534ffb856c4a..3d7c53db1977 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5109,7 +5109,8 @@ xfs_bmap_del_extent_real(
 			if (error)
 				goto done;
 		} else {
-			if (bflags & XFS_BMAPI_NODISCARD) {
+			if ((bflags & XFS_BMAPI_NODISCARD) ||
+			    (del->br_state == XFS_EXT_UNWRITTEN)) {
 				xfs_bmap_add_free_nodiscard(mp, dfops,
 					del->br_startblock, del->br_blockcount,
 					NULL);

From 7b6b50f55c1064e619f2da0696ed99f9db10b960 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:19 -0700
Subject: [PATCH 036/121] xfs: release new dquot buffer on defer_finish error

In commit efa092f3d4c6 "[XFS] Fixes a bug in the quota code when
allocating a new dquot record", we allocate a new dquot block, grab a
buffer to initialize it, and return the locked initialized dquot buffer
to the caller for further in-core dquot initialization.  Unfortunately,
if the _bmap_finish errored out, _qm_dqalloc would also error out
without bothering to free the (locked) buffer.  Leaking a locked buffer
caused hangs in generic/388 when quotas are enabled.

Furthermore, the _bmap_finish -> _defer_finish conversion in
310a75a3c6c747 ("xfs: change xfs_bmap_{finish,cancel,init,free} ->
xfs_defer_*") failed to observe that the buffer was held going into
_defer_finish and therefore failed to notice that the buffer lock is
/not/ maintained afterwards.  Now that we can bjoin a buffer to a
defer_ops, use this mechanism to ensure that the buffer stays locked
across the _defer_finish.  Release the holds and locks on the buffer as
appropriate if we have to error out.

There is a subtlety here for the caller in that the buffer emerges
locked and held to the transaction, so if the _trans_commit fails we
have to release the buffer explicitly.  This fixes the unmount hang
in generic/388 when quotas are enabled.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_dquot.c | 50 +++++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 4ca9c39879ae..32d7359b3c18 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -362,32 +362,40 @@ xfs_qm_dqalloc(
 			      dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
 
 	/*
-	 * xfs_defer_finish() may commit the current transaction and
-	 * start a second transaction if the freelist is not empty.
+	 * Hold the buffer and join it to the dfops so that we'll still own
+	 * the buffer when we return to the caller.  The buffer disposal on
+	 * error must be paid attention to very carefully, as it has been
+	 * broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota
+	 * code when allocating a new dquot record" in 2005, and the later
+	 * conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep
+	 * the buffer locked across the _defer_finish call.  We can now do
+	 * this correctly with xfs_defer_bjoin.
 	 *
-	 * Since we still want to modify this buffer, we need to
-	 * ensure that the buffer is not released on commit of
-	 * the first transaction and ensure the buffer is added to the
-	 * second transaction.
+	 * Above, we allocated a disk block for the dquot information and
+	 * used get_buf to initialize the dquot.  If the _defer_bjoin fails,
+	 * the buffer is still locked to *tpp, so we must _bhold_release and
+	 * then _trans_brelse the buffer.  If the _defer_finish fails, the old
+	 * transaction is gone but the new buffer is not joined or held to any
+	 * transaction, so we must _buf_relse it.
 	 *
-	 * If there is only one transaction then don't stop the buffer
-	 * from being released when it commits later on.
+	 * If everything succeeds, the caller of this function is returned a
+	 * buffer that is locked and joined to the transaction.  The caller
+	 * is responsible for unlocking any buffer passed back, either
+	 * manually or by committing the transaction.
 	 */
-
-	xfs_trans_bhold(tp, bp);
-
-	error = xfs_defer_finish(tpp, &dfops);
-	if (error)
+	xfs_trans_bhold(*tpp, bp);
+	error = xfs_defer_bjoin(&dfops, bp);
+	if (error) {
+		xfs_trans_bhold_release(*tpp, bp);
+		xfs_trans_brelse(*tpp, bp);
 		goto error1;
-
-	/* Transaction was committed? */
-	if (*tpp != tp) {
-		tp = *tpp;
-		xfs_trans_bjoin(tp, bp);
-	} else {
-		xfs_trans_bhold_release(tp, bp);
 	}
-
+	error = xfs_defer_finish(tpp, &dfops);
+	if (error) {
+		xfs_buf_relse(bp);
+		goto error1;
+	}
+	xfs_trans_bhold_release(*tpp, bp);
 	*O_bpp = bp;
 	return 0;
 

From 609001bca49917b44c6af71abafd6e91c274006d Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:20 -0700
Subject: [PATCH 037/121] xfs: don't spray logs when dquot flush/purge fail

When dquot flush or purge fail there's no need to spam the logs, we've
already logged the IO error or fs shutdown that caused the flush
failures.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_dquot_item.c |  5 +----
 fs/xfs/xfs_qm.c         | 10 ++--------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 57df98122156..8eb7415474d6 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -209,10 +209,7 @@ xfs_qm_dquot_logitem_push(
 	spin_unlock(&lip->li_ailp->ail_lock);
 
 	error = xfs_qm_dqflush(dqp, &bp);
-	if (error) {
-		xfs_warn(dqp->q_mount, "%s: push error %d on dqp "PTR_FMT,
-			__func__, error, dqp);
-	} else {
+	if (!error) {
 		if (!xfs_buf_delwri_queue(bp, buffer_list))
 			rval = XFS_ITEM_FLUSHING;
 		xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 62764f3e35e2..eb106366dea1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -161,10 +161,7 @@ xfs_qm_dqpurge(
 		 * to purge this dquot anyway, so we go ahead regardless.
 		 */
 		error = xfs_qm_dqflush(dqp, &bp);
-		if (error) {
-			xfs_warn(mp, "%s: dquot "PTR_FMT" flush failed",
-				__func__, dqp);
-		} else {
+		if (!error) {
 			error = xfs_bwrite(bp);
 			xfs_buf_relse(bp);
 		}
@@ -479,11 +476,8 @@ xfs_qm_dquot_isolate(
 		spin_unlock(lru_lock);
 
 		error = xfs_qm_dqflush(dqp, &bp);
-		if (error) {
-			xfs_warn(dqp->q_mount, "%s: dquot "PTR_FMT" flush failed",
-				 __func__, dqp);
+		if (error)
 			goto out_unlock_dirty;
-		}
 
 		xfs_buf_delwri_queue(bp, &isol->buffers);
 		xfs_buf_relse(bp);

From 2e330e76e03dd0caee6804b49e9e49d7c3998867 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:20 -0700
Subject: [PATCH 038/121] xfs: refactor XFS_QMOPT_DQNEXT out of existence

There's only one caller of DQNEXT and its semantics can be moved into a
separate function, so create the function and get rid of the flag.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_quota_defs.h |  1 -
 fs/xfs/scrub/quota.c           |  3 +-
 fs/xfs/xfs_dquot.c             | 63 +++++++++++-----------
 fs/xfs/xfs_dquot.h             |  2 +
 fs/xfs/xfs_qm.h                |  6 ++-
 fs/xfs/xfs_qm_syscalls.c       | 96 +++++++++++++++++++++++-----------
 fs/xfs/xfs_quotaops.c          |  8 ++-
 7 files changed, 108 insertions(+), 71 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 1aac52d7fef4..619905f3bcac 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -114,7 +114,6 @@ typedef uint16_t	xfs_qwarncnt_t;
 #define XFS_QMOPT_SBVERSION	0x0000040 /* change superblock version num */
 #define XFS_QMOPT_GQUOTA	0x0002000 /* group dquot requested */
 #define XFS_QMOPT_ENOSPC	0x0004000 /* enospc instead of edquot (prj) */
-#define XFS_QMOPT_DQNEXT	0x0008000 /* return next dquot >= this ID */
 
 /*
  * flags to xfs_trans_mod_dquot to indicate which field needs to be
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 6ba465e6c885..50415e8e5dd1 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -268,8 +268,7 @@ xfs_scrub_quota(
 		if (xfs_scrub_should_terminate(sc, &error))
 			break;
 
-		error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT,
-				&dq);
+		error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
 		if (error == -ENOENT)
 			break;
 		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 32d7359b3c18..8d2a3becc4f4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -736,18 +736,6 @@ restart:
 			goto restart;
 		}
 
-		/* uninit / unused quota found in radix tree, keep looking  */
-		if (flags & XFS_QMOPT_DQNEXT) {
-			if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
-				xfs_dqunlock(dqp);
-				mutex_unlock(&qi->qi_tree_lock);
-				error = xfs_dq_get_next_id(mp, type, &id);
-				if (error)
-					return error;
-				goto restart;
-			}
-		}
-
 		dqp->q_nrefs++;
 		mutex_unlock(&qi->qi_tree_lock);
 
@@ -774,13 +762,6 @@ restart:
 	if (ip)
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-	/* If we are asked to find next active id, keep looking */
-	if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) {
-		error = xfs_dq_get_next_id(mp, type, &id);
-		if (!error)
-			goto restart;
-	}
-
 	if (error)
 		return error;
 
@@ -831,17 +812,6 @@ restart:
 	qi->qi_dquots++;
 	mutex_unlock(&qi->qi_tree_lock);
 
-	/* If we are asked to find next active id, keep looking */
-	if (flags & XFS_QMOPT_DQNEXT) {
-		if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
-			xfs_qm_dqput(dqp);
-			error = xfs_dq_get_next_id(mp, type, &id);
-			if (error)
-				return error;
-			goto restart;
-		}
-	}
-
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	trace_xfs_dqget_miss(dqp);
@@ -849,6 +819,39 @@ restart:
 	return 0;
 }
 
+/*
+ * Starting at @id and progressing upwards, look for an initialized incore
+ * dquot, lock it, and return it.
+ */
+int
+xfs_qm_dqget_next(
+	struct xfs_mount	*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	struct xfs_dquot	**dqpp)
+{
+	struct xfs_dquot	*dqp;
+	int			error = 0;
+
+	*dqpp = NULL;
+	for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) {
+		error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+		if (error == -ENOENT)
+			continue;
+		else if (error != 0)
+			break;
+
+		if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+			*dqpp = dqp;
+			return 0;
+		}
+
+		xfs_qm_dqput(dqp);
+	}
+
+	return error;
+}
+
 /*
  * Release a reference to the dquot (decrement ref-count) and unlock it.
  *
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 2f536f33cd26..303e71dfcf70 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -171,6 +171,8 @@ extern void		xfs_qm_adjust_dqlimits(struct xfs_mount *,
 					       struct xfs_dquot *);
 extern int		xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
 					xfs_dqid_t, uint, uint, xfs_dquot_t **);
+extern int		xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
+					uint type, struct xfs_dquot **dqpp);
 extern void		xfs_qm_dqput(xfs_dquot_t *);
 
 extern void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 2975a822e9f0..e3129b280423 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -170,8 +170,10 @@ extern void		xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
 
 /* quota ops */
 extern int		xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int		xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *,
-					uint, struct qc_dqblk *, uint);
+extern int		xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+					uint, struct qc_dqblk *);
+extern int		xfs_qm_scall_getquota_next(struct xfs_mount *,
+					xfs_dqid_t *, uint, struct qc_dqblk *);
 extern int		xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
 					struct qc_dqblk *);
 extern int		xfs_qm_scall_quotaon(struct xfs_mount *, uint);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 9cb5c381b01c..0234cfc4d445 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -622,39 +622,14 @@ out:
 	return error;
 }
 
-
-int
-xfs_qm_scall_getquota(
+/* Fill out the quota context. */
+static void
+xfs_qm_scall_getquota_fill_qc(
 	struct xfs_mount	*mp,
-	xfs_dqid_t		*id,
 	uint			type,
-	struct qc_dqblk		*dst,
-	uint			dqget_flags)
+	const struct xfs_dquot	*dqp,
+	struct qc_dqblk		*dst)
 {
-	struct xfs_dquot	*dqp;
-	int			error;
-
-	/*
-	 * Try to get the dquot. We don't want it allocated on disk, so
-	 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
-	 * exist, we'll get ENOENT back.
-	 */
-	error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp);
-	if (error)
-		return error;
-
-	/*
-	 * If everything's NULL, this dquot doesn't quite exist as far as
-	 * our utility programs are concerned.
-	 */
-	if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
-		error = -ENOENT;
-		goto out_put;
-	}
-
-	/* Fill in the ID we actually read from disk */
-	*id = be32_to_cpu(dqp->q_core.d_id);
-
 	memset(dst, 0, sizeof(*dst));
 	dst->d_spc_hardlimit =
 		XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
@@ -696,7 +671,7 @@ xfs_qm_scall_getquota(
 	if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
 	     (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
 	     (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
-	    *id != 0) {
+	    dqp->q_core.d_id != 0) {
 		if ((dst->d_space > dst->d_spc_softlimit) &&
 		    (dst->d_spc_softlimit > 0)) {
 			ASSERT(dst->d_spc_timer != 0);
@@ -707,11 +682,70 @@ xfs_qm_scall_getquota(
 		}
 	}
 #endif
+}
+
+/* Return the quota information for the dquot matching id. */
+int
+xfs_qm_scall_getquota(
+	struct xfs_mount	*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	struct qc_dqblk		*dst)
+{
+	struct xfs_dquot	*dqp;
+	int			error;
+
+	/*
+	 * Try to get the dquot. We don't want it allocated on disk, so
+	 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
+	 * exist, we'll get ENOENT back.
+	 */
+	error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+	if (error)
+		return error;
+
+	/*
+	 * If everything's NULL, this dquot doesn't quite exist as far as
+	 * our utility programs are concerned.
+	 */
+	if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+		error = -ENOENT;
+		goto out_put;
+	}
+
+	xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
+
 out_put:
 	xfs_qm_dqput(dqp);
 	return error;
 }
 
+/*
+ * Return the quota information for the first initialized dquot whose id
+ * is at least as high as id.
+ */
+int
+xfs_qm_scall_getquota_next(
+	struct xfs_mount	*mp,
+	xfs_dqid_t		*id,
+	uint			type,
+	struct qc_dqblk		*dst)
+{
+	struct xfs_dquot	*dqp;
+	int			error;
+
+	error = xfs_qm_dqget_next(mp, *id, type, &dqp);
+	if (error)
+		return error;
+
+	/* Fill in the ID we actually read from disk */
+	*id = be32_to_cpu(dqp->q_core.d_id);
+
+	xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
+
+	xfs_qm_dqput(dqp);
+	return error;
+}
 
 STATIC int
 xfs_dqrele_inode(
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index a65108594a07..c93fc913dffb 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -239,8 +239,7 @@ xfs_fs_get_dqblk(
 		return -ESRCH;
 
 	id = from_kqid(&init_user_ns, qid);
-	return xfs_qm_scall_getquota(mp, &id,
-				      xfs_quota_type(qid.type), qdq, 0);
+	return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq);
 }
 
 /* Return quota info for active quota >= this qid */
@@ -260,9 +259,8 @@ xfs_fs_get_nextdqblk(
 		return -ESRCH;
 
 	id = from_kqid(&init_user_ns, *qid);
-	ret = xfs_qm_scall_getquota(mp, &id,
-				    xfs_quota_type(qid->type), qdq,
-				    XFS_QMOPT_DQNEXT);
+	ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type),
+			qdq);
 	if (ret)
 		return ret;
 

From cc2047c4d0367cf4c51b631284d2266e1f0525c7 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:20 -0700
Subject: [PATCH 039/121] xfs: refactor dquot cache handling

Delegate the dquot cache handling (radix tree lookup and insertion) to
separate helper functions so that we can continue to simplify the body
of dqget.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_dquot.c | 112 +++++++++++++++++++++++++++++++--------------
 1 file changed, 78 insertions(+), 34 deletions(-)

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 8d2a3becc4f4..299d4ce90ded 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -686,6 +686,81 @@ xfs_dq_get_next_id(
 	return error;
 }
 
+/*
+ * Look up the dquot in the in-core cache.  If found, the dquot is returned
+ * locked and ready to go.
+ */
+static struct xfs_dquot *
+xfs_qm_dqget_cache_lookup(
+	struct xfs_mount	*mp,
+	struct xfs_quotainfo	*qi,
+	struct radix_tree_root	*tree,
+	xfs_dqid_t		id)
+{
+	struct xfs_dquot	*dqp;
+
+restart:
+	mutex_lock(&qi->qi_tree_lock);
+	dqp = radix_tree_lookup(tree, id);
+	if (!dqp) {
+		mutex_unlock(&qi->qi_tree_lock);
+		XFS_STATS_INC(mp, xs_qm_dqcachemisses);
+		return NULL;
+	}
+
+	xfs_dqlock(dqp);
+	if (dqp->dq_flags & XFS_DQ_FREEING) {
+		xfs_dqunlock(dqp);
+		mutex_unlock(&qi->qi_tree_lock);
+		trace_xfs_dqget_freeing(dqp);
+		delay(1);
+		goto restart;
+	}
+
+	dqp->q_nrefs++;
+	mutex_unlock(&qi->qi_tree_lock);
+
+	trace_xfs_dqget_hit(dqp);
+	XFS_STATS_INC(mp, xs_qm_dqcachehits);
+	return dqp;
+}
+
+/*
+ * Try to insert a new dquot into the in-core cache.  If an error occurs the
+ * caller should throw away the dquot and start over.  Otherwise, the dquot
+ * is returned locked (and held by the cache) as if there had been a cache
+ * hit.
+ */
+static int
+xfs_qm_dqget_cache_insert(
+	struct xfs_mount	*mp,
+	struct xfs_quotainfo	*qi,
+	struct radix_tree_root	*tree,
+	xfs_dqid_t		id,
+	struct xfs_dquot	*dqp)
+{
+	int			error;
+
+	mutex_lock(&qi->qi_tree_lock);
+	error = radix_tree_insert(tree, id, dqp);
+	if (unlikely(error)) {
+		/* Duplicate found!  Caller must try again. */
+		WARN_ON(error != -EEXIST);
+		mutex_unlock(&qi->qi_tree_lock);
+		trace_xfs_dqget_dup(dqp);
+		return error;
+	}
+
+	/* Return a locked dquot to the caller, with a reference taken. */
+	xfs_dqlock(dqp);
+	dqp->q_nrefs = 1;
+
+	qi->qi_dquots++;
+	mutex_unlock(&qi->qi_tree_lock);
+
+	return 0;
+}
+
 /*
  * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
  * a locked dquot, doing an allocation (if requested) as needed.
@@ -724,28 +799,11 @@ xfs_qm_dqget(
 	}
 
 restart:
-	mutex_lock(&qi->qi_tree_lock);
-	dqp = radix_tree_lookup(tree, id);
+	dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
 	if (dqp) {
-		xfs_dqlock(dqp);
-		if (dqp->dq_flags & XFS_DQ_FREEING) {
-			xfs_dqunlock(dqp);
-			mutex_unlock(&qi->qi_tree_lock);
-			trace_xfs_dqget_freeing(dqp);
-			delay(1);
-			goto restart;
-		}
-
-		dqp->q_nrefs++;
-		mutex_unlock(&qi->qi_tree_lock);
-
-		trace_xfs_dqget_hit(dqp);
-		XFS_STATS_INC(mp, xs_qm_dqcachehits);
 		*O_dqpp = dqp;
 		return 0;
 	}
-	mutex_unlock(&qi->qi_tree_lock);
-	XFS_STATS_INC(mp, xs_qm_dqcachemisses);
 
 	/*
 	 * Dquot cache miss. We don't want to keep the inode lock across
@@ -787,31 +845,17 @@ restart:
 		}
 	}
 
-	mutex_lock(&qi->qi_tree_lock);
-	error = radix_tree_insert(tree, id, dqp);
-	if (unlikely(error)) {
-		WARN_ON(error != -EEXIST);
-
+	error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+	if (error) {
 		/*
 		 * Duplicate found. Just throw away the new dquot and start
 		 * over.
 		 */
-		mutex_unlock(&qi->qi_tree_lock);
-		trace_xfs_dqget_dup(dqp);
 		xfs_qm_dqdestroy(dqp);
 		XFS_STATS_INC(mp, xs_qm_dquot_dups);
 		goto restart;
 	}
 
-	/*
-	 * We return a locked dquot to the caller, with a reference taken
-	 */
-	xfs_dqlock(dqp);
-	dqp->q_nrefs = 1;
-
-	qi->qi_dquots++;
-	mutex_unlock(&qi->qi_tree_lock);
-
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	trace_xfs_dqget_miss(dqp);

From d7103eeb0051db9c27b7aaaf07262bd3802c529f Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:21 -0700
Subject: [PATCH 040/121] xfs: delegate dqget input checks to helper function

Move the dqget input checks to a separate function in preparation for
splitting up the dqget functionality.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_dquot.c | 40 +++++++++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 299d4ce90ded..1ee05e5ab1d9 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -761,6 +761,34 @@ xfs_qm_dqget_cache_insert(
 	return 0;
 }
 
+/* Check our input parameters. */
+static int
+xfs_qm_dqget_checks(
+	struct xfs_mount	*mp,
+	uint			type)
+{
+	if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp)))
+		return -ESRCH;
+
+	switch (type) {
+	case XFS_DQ_USER:
+		if (!XFS_IS_UQUOTA_ON(mp))
+			return -ESRCH;
+		return 0;
+	case XFS_DQ_GROUP:
+		if (!XFS_IS_GQUOTA_ON(mp))
+			return -ESRCH;
+		return 0;
+	case XFS_DQ_PROJ:
+		if (!XFS_IS_PQUOTA_ON(mp))
+			return -ESRCH;
+		return 0;
+	default:
+		WARN_ON_ONCE(0);
+		return -EINVAL;
+	}
+}
+
 /*
  * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
  * a locked dquot, doing an allocation (if requested) as needed.
@@ -783,16 +811,10 @@ xfs_qm_dqget(
 	struct xfs_dquot	*dqp;
 	int			error;
 
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
-	    (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
-	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
-		return -ESRCH;
-	}
+	error = xfs_qm_dqget_checks(mp, type);
+	if (error)
+		return error;
 
-	ASSERT(type == XFS_DQ_USER ||
-	       type == XFS_DQ_PROJ ||
-	       type == XFS_DQ_GROUP);
 	if (ip) {
 		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 		ASSERT(xfs_inode_dquot(ip, type) == NULL);

From c14cfccabe2af251388e20c1004ac5c6a970ba53 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:21 -0700
Subject: [PATCH 041/121] xfs: remove unnecessary xfs_qm_dqattach parameter

The flags argument is always zero, get rid of it.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_attr.c |  4 ++--
 fs/xfs/xfs_bmap_util.c   |  6 +++---
 fs/xfs/xfs_inode.c       | 10 +++++-----
 fs/xfs/xfs_iomap.c       |  4 ++--
 fs/xfs/xfs_iops.c        |  2 +-
 fs/xfs/xfs_qm.c          |  7 +++----
 fs/xfs/xfs_quota.h       |  4 ++--
 fs/xfs/xfs_reflink.c     |  2 +-
 8 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 35a124400d60..c3d02a66d39d 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -236,7 +236,7 @@ xfs_attr_set(
 	args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
 	args.total = xfs_attr_calc_size(&args, &local);
 
-	error = xfs_qm_dqattach(dp, 0);
+	error = xfs_qm_dqattach(dp);
 	if (error)
 		return error;
 
@@ -427,7 +427,7 @@ xfs_attr_remove(
 	 */
 	args.op_flags = XFS_DA_OP_OKNOENT;
 
-	error = xfs_qm_dqattach(dp, 0);
+	error = xfs_qm_dqattach(dp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 696c3b6bd2c9..518627c1b412 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -848,7 +848,7 @@ xfs_free_eofblocks(
 		/*
 		 * Attach the dquots to the inode up front.
 		 */
-		error = xfs_qm_dqattach(ip, 0);
+		error = xfs_qm_dqattach(ip);
 		if (error)
 			return error;
 
@@ -918,7 +918,7 @@ xfs_alloc_file_space(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	error = xfs_qm_dqattach(ip, 0);
+	error = xfs_qm_dqattach(ip);
 	if (error)
 		return error;
 
@@ -1169,7 +1169,7 @@ xfs_free_file_space(
 
 	trace_xfs_free_file_space(ip);
 
-	error = xfs_qm_dqattach(ip, 0);
+	error = xfs_qm_dqattach(ip);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5b2e08488107..74d5cbee8a71 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1424,11 +1424,11 @@ xfs_link(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	error = xfs_qm_dqattach(sip, 0);
+	error = xfs_qm_dqattach(sip);
 	if (error)
 		goto std_return;
 
-	error = xfs_qm_dqattach(tdp, 0);
+	error = xfs_qm_dqattach(tdp);
 	if (error)
 		goto std_return;
 
@@ -1929,7 +1929,7 @@ xfs_inactive(
 	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
 		truncate = 1;
 
-	error = xfs_qm_dqattach(ip, 0);
+	error = xfs_qm_dqattach(ip);
 	if (error)
 		return;
 
@@ -2592,11 +2592,11 @@ xfs_remove(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	error = xfs_qm_dqattach(dp, 0);
+	error = xfs_qm_dqattach(dp);
 	if (error)
 		goto std_return;
 
-	error = xfs_qm_dqattach(ip, 0);
+	error = xfs_qm_dqattach(ip);
 	if (error)
 		goto std_return;
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d03e65f01c89..0880685a1143 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -224,7 +224,7 @@ xfs_iomap_write_direct(
 	 * necessary and move on to transaction setup.
 	 */
 	xfs_iunlock(ip, lockmode);
-	error = xfs_qm_dqattach(ip, 0);
+	error = xfs_qm_dqattach(ip);
 	if (error)
 		return error;
 
@@ -692,7 +692,7 @@ xfs_iomap_write_allocate(
 	/*
 	 * Make sure that the dquots are there.
 	 */
-	error = xfs_qm_dqattach(ip, 0);
+	error = xfs_qm_dqattach(ip);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a3ed3c811dfa..5afe3c2234b3 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -855,7 +855,7 @@ xfs_setattr_size(
 	/*
 	 * Make sure that the dquots are attached to the inode.
 	 */
-	error = xfs_qm_dqattach(ip, 0);
+	error = xfs_qm_dqattach(ip);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index eb106366dea1..3893f541f88d 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -378,8 +378,7 @@ done:
 
 int
 xfs_qm_dqattach(
-	struct xfs_inode	*ip,
-	uint			flags)
+	struct xfs_inode	*ip)
 {
 	int			error;
 
@@ -387,7 +386,7 @@ xfs_qm_dqattach(
 		return 0;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_qm_dqattach_locked(ip, flags);
+	error = xfs_qm_dqattach_locked(ip, 0);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	return error;
@@ -1926,7 +1925,7 @@ xfs_qm_vop_rename_dqattach(
 		 */
 		if (i == 0 || ip != i_tab[i-1]) {
 			if (XFS_NOT_DQATTACHED(mp, ip)) {
-				error = xfs_qm_dqattach(ip, 0);
+				error = xfs_qm_dqattach(ip);
 				if (error)
 					return error;
 			}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index ce6506adab7b..a4d392240cf4 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -90,7 +90,7 @@ extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
 extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
 		struct xfs_dquot *, struct xfs_dquot *,
 		struct xfs_dquot *, uint);
-extern int xfs_qm_dqattach(struct xfs_inode *, uint);
+extern int xfs_qm_dqattach(struct xfs_inode *);
 extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
 extern void xfs_qm_dqdetach(struct xfs_inode *);
 extern void xfs_qm_dqrele(struct xfs_dquot *);
@@ -132,7 +132,7 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
 #define xfs_qm_vop_rename_dqattach(it)					(0)
 #define xfs_qm_vop_chown(tp, ip, old, new)				(NULL)
 #define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl)			(0)
-#define xfs_qm_dqattach(ip, fl)						(0)
+#define xfs_qm_dqattach(ip)						(0)
 #define xfs_qm_dqattach_locked(ip, fl)					(0)
 #define xfs_qm_dqdetach(ip)
 #define xfs_qm_dqrele(d)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 12d441a73b53..7e2b8078ade2 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1361,7 +1361,7 @@ xfs_reflink_remap_range(
 		goto out_unlock;
 
 	/* Attach dquots to dest inode before changing block map */
-	ret = xfs_qm_dqattach(dest, 0);
+	ret = xfs_qm_dqattach(dest);
 	if (ret)
 		goto out_unlock;
 

From 4882c19d2a77c4d3d20ebcb40531ea9812f4cbba Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:22 -0700
Subject: [PATCH 042/121] xfs: split out dqget for inodes from regular dqget

There are two uses of dqget here -- one is to return the dquot for a
given type and id, and the other is to return the dquot for a given type
and inode.  Those are two separate things, so split them into two
smaller functions.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_dquot.c       | 146 +++++++++++++++++++++++++++------------
 fs/xfs/xfs_dquot.h       |  10 ++-
 fs/xfs/xfs_iomap.c       |   2 +-
 fs/xfs/xfs_qm.c          |  39 ++++-------
 fs/xfs/xfs_qm_bhv.c      |   2 +-
 fs/xfs/xfs_qm_syscalls.c |   4 +-
 fs/xfs/xfs_quota.h       |   2 +-
 fs/xfs/xfs_reflink.c     |   4 +-
 8 files changed, 133 insertions(+), 76 deletions(-)

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1ee05e5ab1d9..376923fd2174 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -790,24 +790,19 @@ xfs_qm_dqget_checks(
 }
 
 /*
- * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
- * a locked dquot, doing an allocation (if requested) as needed.
- * When both an inode and an id are given, the inode's id takes precedence.
- * That is, if the id changes while we don't hold the ilock inside this
- * function, the new dquot is returned, not necessarily the one requested
- * in the id argument.
+ * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked
+ * dquot, doing an allocation (if requested) as needed.
  */
 int
 xfs_qm_dqget(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,	  /* locked inode (optional) */
-	xfs_dqid_t	id,	  /* uid/projid/gid depending on type */
-	uint		type,	  /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
-	uint		flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
-	xfs_dquot_t	**O_dqpp) /* OUT : locked incore dquot */
+	struct xfs_mount	*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	uint			flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
+	struct xfs_dquot	**O_dqpp)
 {
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+	struct radix_tree_root	*tree = xfs_dquot_tree(qi, type);
 	struct xfs_dquot	*dqp;
 	int			error;
 
@@ -815,11 +810,83 @@ xfs_qm_dqget(
 	if (error)
 		return error;
 
-	if (ip) {
-		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-		ASSERT(xfs_inode_dquot(ip, type) == NULL);
+restart:
+	dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
+	if (dqp) {
+		*O_dqpp = dqp;
+		return 0;
 	}
 
+	error = xfs_qm_dqread(mp, id, type, flags, &dqp);
+	if (error)
+		return error;
+
+	error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+	if (error) {
+		/*
+		 * Duplicate found. Just throw away the new dquot and start
+		 * over.
+		 */
+		xfs_qm_dqdestroy(dqp);
+		XFS_STATS_INC(mp, xs_qm_dquot_dups);
+		goto restart;
+	}
+
+	trace_xfs_dqget_miss(dqp);
+	*O_dqpp = dqp;
+	return 0;
+}
+
+/* Return the quota id for a given inode and type. */
+xfs_dqid_t
+xfs_qm_id_for_quotatype(
+	struct xfs_inode	*ip,
+	uint			type)
+{
+	switch (type) {
+	case XFS_DQ_USER:
+		return ip->i_d.di_uid;
+	case XFS_DQ_GROUP:
+		return ip->i_d.di_gid;
+	case XFS_DQ_PROJ:
+		return xfs_get_projid(ip);
+	}
+	ASSERT(0);
+	return 0;
+}
+
+/*
+ * Return the dquot for a given inode and type.  If @can_alloc is true, then
+ * allocate blocks if needed.  The inode's ILOCK must be held and it must not
+ * have already had an inode attached.
+ */
+int
+xfs_qm_dqget_inode(
+	struct xfs_inode	*ip,
+	uint			type,
+	bool			can_alloc,
+	struct xfs_dquot	**O_dqpp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_quotainfo	*qi = mp->m_quotainfo;
+	struct radix_tree_root	*tree = xfs_dquot_tree(qi, type);
+	struct xfs_dquot	*dqp;
+	xfs_dqid_t		id;
+	uint			flags = 0;
+	int			error;
+
+	error = xfs_qm_dqget_checks(mp, type);
+	if (error)
+		return error;
+
+	if (can_alloc)
+		flags |= XFS_QMOPT_DQALLOC;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_inode_dquot(ip, type) == NULL);
+
+	id = xfs_qm_id_for_quotatype(ip, type);
+
 restart:
 	dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
 	if (dqp) {
@@ -834,37 +901,30 @@ restart:
 	 * lock here means dealing with a chown that can happen before
 	 * we re-acquire the lock.
 	 */
-	if (ip)
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	error = xfs_qm_dqread(mp, id, type, flags, &dqp);
-
-	if (ip)
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (error)
 		return error;
 
-	if (ip) {
-		/*
-		 * A dquot could be attached to this inode by now, since
-		 * we had dropped the ilock.
-		 */
-		if (xfs_this_quota_on(mp, type)) {
-			struct xfs_dquot	*dqp1;
+	/*
+	 * A dquot could be attached to this inode by now, since we had
+	 * dropped the ilock.
+	 */
+	if (xfs_this_quota_on(mp, type)) {
+		struct xfs_dquot	*dqp1;
 
-			dqp1 = xfs_inode_dquot(ip, type);
-			if (dqp1) {
-				xfs_qm_dqdestroy(dqp);
-				dqp = dqp1;
-				xfs_dqlock(dqp);
-				goto dqret;
-			}
-		} else {
-			/* inode stays locked on return */
+		dqp1 = xfs_inode_dquot(ip, type);
+		if (dqp1) {
 			xfs_qm_dqdestroy(dqp);
-			return -ESRCH;
+			dqp = dqp1;
+			xfs_dqlock(dqp);
+			goto dqret;
 		}
+	} else {
+		/* inode stays locked on return */
+		xfs_qm_dqdestroy(dqp);
+		return -ESRCH;
 	}
 
 	error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
@@ -878,8 +938,8 @@ restart:
 		goto restart;
 	}
 
- dqret:
-	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
+dqret:
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	trace_xfs_dqget_miss(dqp);
 	*O_dqpp = dqp;
 	return 0;
@@ -901,7 +961,7 @@ xfs_qm_dqget_next(
 
 	*dqpp = NULL;
 	for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) {
-		error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+		error = xfs_qm_dqget(mp, id, type, 0, &dqp);
 		if (error == -ENOENT)
 			continue;
 		else if (error != 0)
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 303e71dfcf70..8a5b30e952b0 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -169,8 +169,14 @@ extern void		xfs_qm_adjust_dqtimers(xfs_mount_t *,
 					xfs_disk_dquot_t *);
 extern void		xfs_qm_adjust_dqlimits(struct xfs_mount *,
 					       struct xfs_dquot *);
-extern int		xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
-					xfs_dqid_t, uint, uint, xfs_dquot_t **);
+extern xfs_dqid_t	xfs_qm_id_for_quotatype(struct xfs_inode *ip,
+					uint type);
+extern int		xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
+					uint type, uint flags,
+					struct xfs_dquot **dqpp);
+extern int		xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
+					bool can_alloc,
+					struct xfs_dquot **dqpp);
 extern int		xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
 					uint type, struct xfs_dquot **dqpp);
 extern void		xfs_qm_dqput(xfs_dquot_t *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0880685a1143..c6ce6f9335b6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -576,7 +576,7 @@ xfs_file_iomap_begin_delay(
 		goto done;
 	}
 
-	error = xfs_qm_dqattach_locked(ip, 0);
+	error = xfs_qm_dqattach_locked(ip, false);
 	if (error)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 3893f541f88d..6122097da0b6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -262,7 +262,7 @@ xfs_qm_dqattach_one(
 	xfs_inode_t	*ip,
 	xfs_dqid_t	id,
 	uint		type,
-	uint		doalloc,
+	bool		doalloc,
 	xfs_dquot_t	**IO_idqpp)
 {
 	xfs_dquot_t	*dqp;
@@ -288,7 +288,7 @@ xfs_qm_dqattach_one(
 	 * exist on disk and we didn't ask it to allocate; ESRCH if quotas got
 	 * turned off suddenly.
 	 */
-	error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc, &dqp);
+	error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp);
 	if (error)
 		return error;
 
@@ -330,7 +330,7 @@ xfs_qm_need_dqattach(
 int
 xfs_qm_dqattach_locked(
 	xfs_inode_t	*ip,
-	uint		flags)
+	bool		doalloc)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	int		error = 0;
@@ -342,8 +342,7 @@ xfs_qm_dqattach_locked(
 
 	if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
 		error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
-						flags & XFS_QMOPT_DQALLOC,
-						&ip->i_udquot);
+				doalloc, &ip->i_udquot);
 		if (error)
 			goto done;
 		ASSERT(ip->i_udquot);
@@ -351,8 +350,7 @@ xfs_qm_dqattach_locked(
 
 	if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
 		error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
-						flags & XFS_QMOPT_DQALLOC,
-						&ip->i_gdquot);
+				doalloc, &ip->i_gdquot);
 		if (error)
 			goto done;
 		ASSERT(ip->i_gdquot);
@@ -360,8 +358,7 @@ xfs_qm_dqattach_locked(
 
 	if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
 		error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
-						flags & XFS_QMOPT_DQALLOC,
-						&ip->i_pdquot);
+				doalloc, &ip->i_pdquot);
 		if (error)
 			goto done;
 		ASSERT(ip->i_pdquot);
@@ -386,7 +383,7 @@ xfs_qm_dqattach(
 		return 0;
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_qm_dqattach_locked(ip, 0);
+	error = xfs_qm_dqattach_locked(ip, false);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	return error;
@@ -1068,7 +1065,7 @@ xfs_qm_quotacheck_dqadjust(
 	struct xfs_dquot	*dqp;
 	int			error;
 
-	error = xfs_qm_dqget(mp, ip, id, type, XFS_QMOPT_DQALLOC, &dqp);
+	error = xfs_qm_dqget_inode(ip, type, true, &dqp);
 	if (error) {
 		/*
 		 * Shouldn't be able to turn off quotas here.
@@ -1667,7 +1664,7 @@ xfs_qm_vop_dqalloc(
 	 * if necessary. The dquot(s) will not be locked.
 	 */
 	if (XFS_NOT_DQATTACHED(mp, ip)) {
-		error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
+		error = xfs_qm_dqattach_locked(ip, true);
 		if (error) {
 			xfs_iunlock(ip, lockflags);
 			return error;
@@ -1686,10 +1683,8 @@ xfs_qm_vop_dqalloc(
 			 * holding ilock.
 			 */
 			xfs_iunlock(ip, lockflags);
-			error = xfs_qm_dqget(mp, NULL, uid,
-						 XFS_DQ_USER,
-						 XFS_QMOPT_DQALLOC,
-						 &uq);
+			error = xfs_qm_dqget(mp, uid, XFS_DQ_USER,
+					XFS_QMOPT_DQALLOC, &uq);
 			if (error) {
 				ASSERT(error != -ENOENT);
 				return error;
@@ -1712,10 +1707,8 @@ xfs_qm_vop_dqalloc(
 	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
 		if (ip->i_d.di_gid != gid) {
 			xfs_iunlock(ip, lockflags);
-			error = xfs_qm_dqget(mp, NULL, gid,
-						 XFS_DQ_GROUP,
-						 XFS_QMOPT_DQALLOC,
-						 &gq);
+			error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP,
+					XFS_QMOPT_DQALLOC, &gq);
 			if (error) {
 				ASSERT(error != -ENOENT);
 				goto error_rele;
@@ -1731,10 +1724,8 @@ xfs_qm_vop_dqalloc(
 	if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
 		if (xfs_get_projid(ip) != prid) {
 			xfs_iunlock(ip, lockflags);
-			error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
-						 XFS_DQ_PROJ,
-						 XFS_QMOPT_DQALLOC,
-						 &pq);
+			error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
+					XFS_QMOPT_DQALLOC, &pq);
 			if (error) {
 				ASSERT(error != -ENOENT);
 				goto error_rele;
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2be6d2735ca9..531e8224dcb6 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -72,7 +72,7 @@ xfs_qm_statvfs(
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_dquot_t		*dqp;
 
-	if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
+	if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
 		xfs_fill_statvfs_from_dquot(statp, dqp);
 		xfs_qm_dqput(dqp);
 	}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 0234cfc4d445..b9243f554697 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -425,7 +425,7 @@ xfs_qm_scall_setqlim(
 	 * a reference to the dquot, so it's safe to do this unlock/lock without
 	 * it being reclaimed in the mean time.
 	 */
-	error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
+	error = xfs_qm_dqget(mp, id, type, XFS_QMOPT_DQALLOC, &dqp);
 	if (error) {
 		ASSERT(error != -ENOENT);
 		goto out_unlock;
@@ -700,7 +700,7 @@ xfs_qm_scall_getquota(
 	 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
 	 * exist, we'll get ENOENT back.
 	 */
-	error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+	error = xfs_qm_dqget(mp, id, type, 0, &dqp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index a4d392240cf4..1c79ebbe5236 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -91,7 +91,7 @@ extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
 		struct xfs_dquot *, struct xfs_dquot *,
 		struct xfs_dquot *, uint);
 extern int xfs_qm_dqattach(struct xfs_inode *);
-extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
+extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc);
 extern void xfs_qm_dqdetach(struct xfs_inode *);
 extern void xfs_qm_dqrele(struct xfs_dquot *);
 extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 7e2b8078ade2..713e857d9ffa 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -305,7 +305,7 @@ xfs_reflink_reserve_cow(
 	 * Fork all the shared blocks from our write offset until the end of
 	 * the extent.
 	 */
-	error = xfs_qm_dqattach_locked(ip, 0);
+	error = xfs_qm_dqattach_locked(ip, false);
 	if (error)
 		return error;
 
@@ -431,7 +431,7 @@ retry:
 		if (error)
 			return error;
 
-		error = xfs_qm_dqattach_locked(ip, 0);
+		error = xfs_qm_dqattach_locked(ip, false);
 		if (error)
 			goto out;
 		goto retry;

From 0fcef1270f309df142575216383a3533779127f3 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:22 -0700
Subject: [PATCH 043/121] xfs: fetch dquots directly during quotacheck

Quotacheck only runs during mount, which means that there are no other
processes in the system that could be doing chown or chproj.  Therefore
there's no potential for racing to attach dquots to the inode so we can
drop all the ILOCK and race detection bits from quotacheck.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_qm.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6122097da0b6..f86b3608567d 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1056,16 +1056,17 @@ out:
 STATIC int
 xfs_qm_quotacheck_dqadjust(
 	struct xfs_inode	*ip,
-	xfs_dqid_t		id,
 	uint			type,
 	xfs_qcnt_t		nblks,
 	xfs_qcnt_t		rtblks)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_dquot	*dqp;
+	xfs_dqid_t		id;
 	int			error;
 
-	error = xfs_qm_dqget_inode(ip, type, true, &dqp);
+	id = xfs_qm_id_for_quotatype(ip, type);
+	error = xfs_qm_dqget(mp, id, type, XFS_QMOPT_DQALLOC, &dqp);
 	if (error) {
 		/*
 		 * Shouldn't be able to turn off quotas here.
@@ -1138,13 +1139,10 @@ xfs_qm_dqusage_adjust(
 	}
 
 	/*
-	 * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
-	 * interface expects the inode to be exclusively locked because that's
-	 * the case in all other instances. It's OK that we do this because
-	 * quotacheck is done only at mount time.
+	 * We don't _need_ to take the ilock EXCL here because quotacheck runs
+	 * at mount time and therefore nobody will be racing chown/chproj.
 	 */
-	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL,
-			 &ip);
+	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
 	if (error) {
 		*res = BULKSTAT_RV_NOTHING;
 		return error;
@@ -1179,33 +1177,31 @@ xfs_qm_dqusage_adjust(
 	 * and quotaoffs don't race. (Quotachecks happen at mount time only).
 	 */
 	if (XFS_IS_UQUOTA_ON(mp)) {
-		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
-						   XFS_DQ_USER, nblks, rtblks);
+		error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks,
+				rtblks);
 		if (error)
 			goto error0;
 	}
 
 	if (XFS_IS_GQUOTA_ON(mp)) {
-		error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
-						   XFS_DQ_GROUP, nblks, rtblks);
+		error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks,
+				rtblks);
 		if (error)
 			goto error0;
 	}
 
 	if (XFS_IS_PQUOTA_ON(mp)) {
-		error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
-						   XFS_DQ_PROJ, nblks, rtblks);
+		error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks,
+				rtblks);
 		if (error)
 			goto error0;
 	}
 
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	IRELE(ip);
 	*res = BULKSTAT_RV_DIDONE;
 	return 0;
 
 error0:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	IRELE(ip);
 	*res = BULKSTAT_RV_GIVEUP;
 	return error;

From 617cd5c12c3c37d52f092887b092bdb3245a1310 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:23 -0700
Subject: [PATCH 044/121] xfs: refactor incore dquot initialization functions

Create two incore dquot initialization functions that will help us to
disentangle dqget and dqread.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_dquot.c | 83 +++++++++++++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 31 deletions(-)

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 376923fd2174..434137eb07a4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -499,26 +499,14 @@ xfs_qm_dqtobp(
 	return 0;
 }
 
-
-/*
- * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
- * and release the buffer immediately.
- *
- * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
- */
-int
-xfs_qm_dqread(
+/* Allocate and initialize everything we need for an incore dquot. */
+STATIC struct xfs_dquot *
+xfs_dquot_alloc(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
-	uint			type,
-	uint			flags,
-	struct xfs_dquot	**O_dqpp)
+	uint			type)
 {
 	struct xfs_dquot	*dqp;
-	struct xfs_disk_dquot	*ddqp;
-	struct xfs_buf		*bp;
-	struct xfs_trans	*tp = NULL;
-	int			error;
 
 	dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
 
@@ -556,8 +544,54 @@ xfs_qm_dqread(
 		break;
 	}
 
-	XFS_STATS_INC(mp, xs_qm_dquot);
+	xfs_qm_dquot_logitem_init(dqp);
 
+	XFS_STATS_INC(mp, xs_qm_dquot);
+	return dqp;
+}
+
+/* Copy the in-core quota fields in from the on-disk buffer. */
+STATIC void
+xfs_dquot_from_disk(
+	struct xfs_dquot	*dqp,
+	struct xfs_disk_dquot	*ddqp)
+{
+	/* copy everything from disk dquot to the incore dquot */
+	memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
+
+	/*
+	 * Reservation counters are defined as reservation plus current usage
+	 * to avoid having to add every time.
+	 */
+	dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
+	dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
+	dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+
+	/* initialize the dquot speculative prealloc thresholds */
+	xfs_dquot_set_prealloc_limits(dqp);
+}
+
+/*
+ * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
+ * and release the buffer immediately.
+ *
+ * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
+ */
+int
+xfs_qm_dqread(
+	struct xfs_mount	*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	uint			flags,
+	struct xfs_dquot	**O_dqpp)
+{
+	struct xfs_dquot	*dqp;
+	struct xfs_disk_dquot	*ddqp;
+	struct xfs_buf		*bp;
+	struct xfs_trans	*tp = NULL;
+	int			error;
+
+	dqp = xfs_dquot_alloc(mp, id, type);
 	trace_xfs_dqread(dqp);
 
 	if (flags & XFS_QMOPT_DQALLOC) {
@@ -582,20 +616,7 @@ xfs_qm_dqread(
 		goto error1;
 	}
 
-	/* copy everything from disk dquot to the incore dquot */
-	memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
-	xfs_qm_dquot_logitem_init(dqp);
-
-	/*
-	 * Reservation counters are defined as reservation plus current usage
-	 * to avoid having to add every time.
-	 */
-	dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
-	dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
-	dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
-
-	/* initialize the dquot speculative prealloc thresholds */
-	xfs_dquot_set_prealloc_limits(dqp);
+	xfs_dquot_from_disk(dqp, ddqp);
 
 	/* Mark the buf so that this will stay incore a little longer */
 	xfs_buf_set_ref(bp, XFS_DQUOT_REF);

From d63192c8904727fced1644f0dd0f7aa26b5f7dc4 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:23 -0700
Subject: [PATCH 045/121] xfs: refactor xfs_qm_dqtobp and xfs_qm_dqalloc

Separate the disk dquot read and allocation functionality into
two helper functions, then refactor dqread to call them directly.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_dquot.c | 270 ++++++++++++++++++++-------------------------
 1 file changed, 122 insertions(+), 148 deletions(-)

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 434137eb07a4..22bb52ee82c8 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -288,49 +288,43 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
 }
 
 /*
- * Allocate a block and fill it with dquots.
- * This is called when the bmapi finds a hole.
+ * Ensure that the given in-core dquot has a buffer on disk backing it, and
+ * return the buffer. This is called when the bmapi finds a hole.
  */
 STATIC int
-xfs_qm_dqalloc(
-	xfs_trans_t	**tpp,
-	xfs_mount_t	*mp,
-	xfs_dquot_t	*dqp,
-	xfs_inode_t	*quotip,
-	xfs_fileoff_t	offset_fsb,
-	xfs_buf_t	**O_bpp)
+xfs_dquot_disk_alloc(
+	struct xfs_trans	**tpp,
+	struct xfs_dquot	*dqp,
+	struct xfs_buf		**bpp)
 {
-	xfs_fsblock_t	firstblock;
-	struct xfs_defer_ops dfops;
-	xfs_bmbt_irec_t map;
-	int		nmaps, error;
-	xfs_buf_t	*bp;
-	xfs_trans_t	*tp = *tpp;
-
-	ASSERT(tp != NULL);
+	struct xfs_bmbt_irec	map;
+	struct xfs_defer_ops	dfops;
+	struct xfs_mount	*mp = (*tpp)->t_mountp;
+	struct xfs_buf		*bp;
+	struct xfs_inode	*quotip = xfs_quota_inode(mp, dqp->dq_flags);
+	xfs_fsblock_t		firstblock;
+	int			nmaps = 1;
+	int			error;
 
 	trace_xfs_dqalloc(dqp);
 
-	/*
-	 * Initialize the bmap freelist prior to calling bmapi code.
-	 */
 	xfs_defer_init(&dfops, &firstblock);
 	xfs_ilock(quotip, XFS_ILOCK_EXCL);
-	/*
-	 * Return if this type of quotas is turned off while we didn't
-	 * have an inode lock
-	 */
 	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+		/*
+		 * Return if this type of quotas is turned off while we didn't
+		 * have an inode lock
+		 */
 		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 		return -ESRCH;
 	}
 
-	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
-	nmaps = 1;
-	error = xfs_bmapi_write(tp, quotip, offset_fsb,
-				XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
-				&firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
-				&map, &nmaps, &dfops);
+	/* Create the block mapping. */
+	xfs_trans_ijoin(*tpp, quotip, XFS_ILOCK_EXCL);
+	error = xfs_bmapi_write(*tpp, quotip, dqp->q_fileoffset,
+			XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+			&firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
+			&map, &nmaps, &dfops);
 	if (error)
 		goto error0;
 	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -344,10 +338,8 @@ xfs_qm_dqalloc(
 	dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
 
 	/* now we can just get the buffer (there's nothing to read yet) */
-	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
-			       dqp->q_blkno,
-			       mp->m_quotainfo->qi_dqchunklen,
-			       0);
+	bp = xfs_trans_get_buf(*tpp, mp->m_ddev_targp, dqp->q_blkno,
+			mp->m_quotainfo->qi_dqchunklen, 0);
 	if (!bp) {
 		error = -ENOMEM;
 		goto error1;
@@ -358,8 +350,9 @@ xfs_qm_dqalloc(
 	 * Make a chunk of dquots out of this buffer and log
 	 * the entire thing.
 	 */
-	xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
+	xfs_qm_init_dquot_blk(*tpp, mp, be32_to_cpu(dqp->q_core.d_id),
 			      dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+	xfs_buf_set_ref(bp, XFS_DQUOT_REF);
 
 	/*
 	 * Hold the buffer and join it to the dfops so that we'll still own
@@ -379,7 +372,7 @@ xfs_qm_dqalloc(
 	 * transaction, so we must _buf_relse it.
 	 *
 	 * If everything succeeds, the caller of this function is returned a
-	 * buffer that is locked and joined to the transaction.  The caller
+	 * buffer that is locked and held to the transaction.  The caller
 	 * is responsible for unlocking any buffer passed back, either
 	 * manually or by committing the transaction.
 	 */
@@ -395,8 +388,7 @@ xfs_qm_dqalloc(
 		xfs_buf_relse(bp);
 		goto error1;
 	}
-	xfs_trans_bhold_release(*tpp, bp);
-	*O_bpp = bp;
+	*bpp = bp;
 	return 0;
 
 error1:
@@ -406,32 +398,24 @@ error0:
 }
 
 /*
- * Maps a dquot to the buffer containing its on-disk version.
- * This returns a ptr to the buffer containing the on-disk dquot
- * in the bpp param, and a ptr to the on-disk dquot within that buffer
+ * Read in the in-core dquot's on-disk metadata and return the buffer.
+ * Returns ENOENT to signal a hole.
  */
 STATIC int
-xfs_qm_dqtobp(
-	xfs_trans_t		**tpp,
-	xfs_dquot_t		*dqp,
-	xfs_disk_dquot_t	**O_ddpp,
-	xfs_buf_t		**O_bpp,
-	uint			flags)
+xfs_dquot_disk_read(
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*dqp,
+	struct xfs_buf		**bpp)
 {
 	struct xfs_bmbt_irec	map;
-	int			nmaps = 1, error;
 	struct xfs_buf		*bp;
-	struct xfs_inode	*quotip;
-	struct xfs_mount	*mp = dqp->q_mount;
-	xfs_dqid_t		id = be32_to_cpu(dqp->q_core.d_id);
-	struct xfs_trans	*tp = (tpp ? *tpp : NULL);
+	struct xfs_inode	*quotip = xfs_quota_inode(mp, dqp->dq_flags);
 	uint			lock_mode;
-
-	quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags);
-	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+	int			nmaps = 1;
+	int			error;
 
 	lock_mode = xfs_ilock_data_map_shared(quotip);
-	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+	if (!xfs_this_quota_on(mp, dqp->dq_flags)) {
 		/*
 		 * Return if this type of quotas is turned off while we
 		 * didn't have the quota inode lock.
@@ -444,57 +428,36 @@ xfs_qm_dqtobp(
 	 * Find the block map; no allocations yet
 	 */
 	error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
-			       XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
-
+			XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
 	xfs_iunlock(quotip, lock_mode);
 	if (error)
 		return error;
 
 	ASSERT(nmaps == 1);
-	ASSERT(map.br_blockcount == 1);
+	ASSERT(map.br_blockcount >= 1);
+	ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+	if (map.br_startblock == HOLESTARTBLOCK)
+		return -ENOENT;
+
+	trace_xfs_dqtobp_read(dqp);
 
 	/*
-	 * Offset of dquot in the (fixed sized) dquot chunk.
+	 * store the blkno etc so that we don't have to do the
+	 * mapping all the time
 	 */
-	dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
-		sizeof(xfs_dqblk_t);
+	dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
 
-	ASSERT(map.br_startblock != DELAYSTARTBLOCK);
-	if (map.br_startblock == HOLESTARTBLOCK) {
-		/*
-		 * We don't allocate unless we're asked to
-		 */
-		if (!(flags & XFS_QMOPT_DQALLOC))
-			return -ENOENT;
-
-		ASSERT(tp);
-		error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
-					dqp->q_fileoffset, &bp);
-		if (error)
-			return error;
-		tp = *tpp;
-	} else {
-		trace_xfs_dqtobp_read(dqp);
-
-		/*
-		 * store the blkno etc so that we don't have to do the
-		 * mapping all the time
-		 */
-		dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-
-		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-					   dqp->q_blkno,
-					   mp->m_quotainfo->qi_dqchunklen,
-					   0, &bp, &xfs_dquot_buf_ops);
-		if (error) {
-			ASSERT(bp == NULL);
-			return error;
-		}
+	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+			mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+			&xfs_dquot_buf_ops);
+	if (error) {
+		ASSERT(bp == NULL);
+		return error;
 	}
 
 	ASSERT(xfs_buf_islocked(bp));
-	*O_bpp = bp;
-	*O_ddpp = bp->b_addr + dqp->q_bufoffset;
+	xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+	*bpp = bp;
 
 	return 0;
 }
@@ -516,6 +479,12 @@ xfs_dquot_alloc(
 	INIT_LIST_HEAD(&dqp->q_lru);
 	mutex_init(&dqp->q_qlock);
 	init_waitqueue_head(&dqp->q_pinwait);
+	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+	/*
+	 * Offset of dquot in the (fixed sized) dquot chunk.
+	 */
+	dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+			sizeof(xfs_dqblk_t);
 
 	/*
 	 * Because we want to use a counting completion, complete
@@ -554,8 +523,10 @@ xfs_dquot_alloc(
 STATIC void
 xfs_dquot_from_disk(
 	struct xfs_dquot	*dqp,
-	struct xfs_disk_dquot	*ddqp)
+	struct xfs_buf		*bp)
 {
+	struct xfs_disk_dquot	*ddqp = bp->b_addr + dqp->q_bufoffset;
+
 	/* copy everything from disk dquot to the incore dquot */
 	memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
 
@@ -571,6 +542,44 @@ xfs_dquot_from_disk(
 	xfs_dquot_set_prealloc_limits(dqp);
 }
 
+/* Allocate and initialize the dquot buffer for this in-core dquot. */
+static int
+xfs_qm_dqread_alloc(
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*dqp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_trans	*tp;
+	struct xfs_buf		*bp;
+	int			error;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+			XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
+	if (error)
+		goto err;
+
+	error = xfs_dquot_disk_alloc(&tp, dqp, &bp);
+	if (error)
+		goto err_cancel;
+
+	error = xfs_trans_commit(tp);
+	if (error) {
+		/*
+		 * Buffer was held to the transaction, so we have to unlock it
+		 * manually here because we're not passing it back.
+		 */
+		xfs_buf_relse(bp);
+		goto err;
+	}
+	*bpp = bp;
+	return 0;
+
+err_cancel:
+	xfs_trans_cancel(tp);
+err:
+	return error;
+}
+
 /*
  * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
  * and release the buffer immediately.
@@ -583,74 +592,39 @@ xfs_qm_dqread(
 	xfs_dqid_t		id,
 	uint			type,
 	uint			flags,
-	struct xfs_dquot	**O_dqpp)
+	struct xfs_dquot	**dqpp)
 {
 	struct xfs_dquot	*dqp;
-	struct xfs_disk_dquot	*ddqp;
 	struct xfs_buf		*bp;
-	struct xfs_trans	*tp = NULL;
 	int			error;
 
 	dqp = xfs_dquot_alloc(mp, id, type);
 	trace_xfs_dqread(dqp);
 
-	if (flags & XFS_QMOPT_DQALLOC) {
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
-				XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
-		if (error)
-			goto error0;
-	}
+	/* Try to read the buffer, allocating if necessary. */
+	error = xfs_dquot_disk_read(mp, dqp, &bp);
+	if (error == -ENOENT && (flags & XFS_QMOPT_DQALLOC))
+		error = xfs_qm_dqread_alloc(mp, dqp, &bp);
+	if (error)
+		goto err;
 
 	/*
-	 * get a pointer to the on-disk dquot and the buffer containing it
-	 * dqp already knows its own type (GROUP/USER).
-	 */
-	error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags);
-	if (error) {
-		/*
-		 * This can happen if quotas got turned off (ESRCH),
-		 * or if the dquot didn't exist on disk and we ask to
-		 * allocate (ENOENT).
-		 */
-		trace_xfs_dqread_fail(dqp);
-		goto error1;
-	}
-
-	xfs_dquot_from_disk(dqp, ddqp);
-
-	/* Mark the buf so that this will stay incore a little longer */
-	xfs_buf_set_ref(bp, XFS_DQUOT_REF);
-
-	/*
-	 * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
-	 * So we need to release with xfs_trans_brelse().
-	 * The strategy here is identical to that of inodes; we lock
-	 * the dquot in xfs_qm_dqget() before making it accessible to
-	 * others. This is because dquots, like inodes, need a good level of
-	 * concurrency, and we don't want to take locks on the entire buffers
-	 * for dquot accesses.
-	 * Note also that the dquot buffer may even be dirty at this point, if
-	 * this particular dquot was repaired. We still aren't afraid to
-	 * brelse it because we have the changes incore.
+	 * At this point we should have a clean locked buffer.  Copy the data
+	 * to the incore dquot and release the buffer since the incore dquot
+	 * has its own locking protocol so we needn't tie up the buffer any
+	 * further.
 	 */
 	ASSERT(xfs_buf_islocked(bp));
-	xfs_trans_brelse(tp, bp);
+	xfs_dquot_from_disk(dqp, bp);
 
-	if (tp) {
-		error = xfs_trans_commit(tp);
-		if (error)
-			goto error0;
-	}
-
-	*O_dqpp = dqp;
+	xfs_buf_relse(bp);
+	*dqpp = dqp;
 	return error;
 
-error1:
-	if (tp)
-		xfs_trans_cancel(tp);
-error0:
+err:
+	trace_xfs_dqread_fail(dqp);
 	xfs_qm_dqdestroy(dqp);
-	*O_dqpp = NULL;
+	*dqpp = NULL;
 	return error;
 }
 

From 114e73ccfa2a51b47160f49524aa46f0d70cb8a9 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:23 -0700
Subject: [PATCH 046/121] xfs: remove direct calls to _qm_dqread

The quota initialization code needs an "uncached" variant of _dqget to
read in default quota limits and timers before the dquot cache is fully
set up.  We've already split up _dqget into its component pieces so
create a fourth variant to address this need, and make dqread internal
to xfs_dquot.c again.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_dquot.c | 24 +++++++++++++++++++++++-
 fs/xfs/xfs_dquot.h |  5 +++--
 fs/xfs/xfs_qm.c    | 25 +++++++++++++------------
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 22bb52ee82c8..5593a344732c 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -586,7 +586,7 @@ err:
  *
  * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
  */
-int
+static int
 xfs_qm_dqread(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
@@ -832,6 +832,28 @@ restart:
 	return 0;
 }
 
+/*
+ * Given a dquot id and type, read and initialize a dquot from the on-disk
+ * metadata.  This function is only for use during quota initialization so
+ * it ignores the dquot cache assuming that the dquot shrinker isn't set up.
+ * The caller is responsible for _qm_dqdestroy'ing the returned dquot.
+ */
+int
+xfs_qm_dqget_uncached(
+	struct xfs_mount	*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	struct xfs_dquot	**dqpp)
+{
+	int			error;
+
+	error = xfs_qm_dqget_checks(mp, type);
+	if (error)
+		return error;
+
+	return xfs_qm_dqread(mp, id, type, 0, dqpp);
+}
+
 /* Return the quota id for a given inode and type. */
 xfs_dqid_t
 xfs_qm_id_for_quotatype(
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 8a5b30e952b0..38a42874a98c 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -160,8 +160,6 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
 #define XFS_QM_ISPDQ(dqp)	((dqp)->dq_flags & XFS_DQ_PROJ)
 #define XFS_QM_ISGDQ(dqp)	((dqp)->dq_flags & XFS_DQ_GROUP)
 
-extern int		xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
-					uint, struct xfs_dquot	**);
 extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
 extern int		xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
 extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
@@ -179,6 +177,9 @@ extern int		xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
 					struct xfs_dquot **dqpp);
 extern int		xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
 					uint type, struct xfs_dquot **dqpp);
+extern int		xfs_qm_dqget_uncached(struct xfs_mount *mp,
+					xfs_dqid_t id, uint type,
+					struct xfs_dquot **dqpp);
 extern void		xfs_qm_dqput(xfs_dquot_t *);
 
 extern void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f86b3608567d..fac649518101 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -563,8 +563,7 @@ xfs_qm_set_defquota(
 	struct xfs_def_quota    *defq;
 	int			error;
 
-	error = xfs_qm_dqread(mp, 0, type, 0, &dqp);
-
+	error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
 	if (!error) {
 		xfs_disk_dquot_t        *ddqp = &dqp->q_core;
 
@@ -590,11 +589,12 @@ xfs_qm_set_defquota(
  */
 STATIC int
 xfs_qm_init_quotainfo(
-	xfs_mount_t	*mp)
+	struct xfs_mount	*mp)
 {
-	xfs_quotainfo_t *qinf;
-	int		error;
-	xfs_dquot_t	*dqp;
+	struct xfs_quotainfo	*qinf;
+	struct xfs_dquot	*dqp;
+	uint			type;
+	int			error;
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
@@ -637,12 +637,13 @@ xfs_qm_init_quotainfo(
 	 * user/group/proj quota types, otherwise a default value is used.
 	 * This should be split into different fields per quota type.
 	 */
-	error = xfs_qm_dqread(mp, 0,
-			XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
-			 (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
-			  XFS_DQ_PROJ),
-			0, &dqp);
-
+	if (XFS_IS_UQUOTA_RUNNING(mp))
+		type = XFS_DQ_USER;
+	else if (XFS_IS_GQUOTA_RUNNING(mp))
+		type = XFS_DQ_GROUP;
+	else
+		type = XFS_DQ_PROJ;
+	error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
 	if (!error) {
 		xfs_disk_dquot_t	*ddqp = &dqp->q_core;
 

From 30ab2dcf2c0693e518b1920e6edc4212cba10d10 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:30:24 -0700
Subject: [PATCH 047/121] xfs: replace XFS_QMOPT_DQALLOC with a simple boolean

DQALLOC is only ever used with xfs_qm_dqget*, and the only flag that the
_dqget family of functions cares about is DQALLOC.  Therefore, change
it to a boolean 'can alloc?' flag for the dqget interfaces where that
makes sense.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_quota_defs.h |  1 -
 fs/xfs/xfs_dquot.c             | 21 ++++++++-------------
 fs/xfs/xfs_dquot.h             |  2 +-
 fs/xfs/xfs_qm.c                | 12 +++++-------
 fs/xfs/xfs_qm_bhv.c            |  2 +-
 fs/xfs/xfs_qm_syscalls.c       |  9 ++++-----
 6 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 619905f3bcac..d4af2804b178 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -107,7 +107,6 @@ typedef uint16_t	xfs_qwarncnt_t;
  * to a single function. None of these XFS_QMOPT_* flags are meant to have
  * persistent values (ie. their values can and will change between versions)
  */
-#define XFS_QMOPT_DQALLOC	0x0000002 /* alloc dquot ondisk if needed */
 #define XFS_QMOPT_UQUOTA	0x0000004 /* user dquot requested */
 #define XFS_QMOPT_PQUOTA	0x0000008 /* project dquot requested */
 #define XFS_QMOPT_FORCE_RES	0x0000010 /* ignore quota limits */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 5593a344732c..85f9ffd99998 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -582,16 +582,15 @@ err:
 
 /*
  * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
- * and release the buffer immediately.
- *
- * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
+ * and release the buffer immediately.  If @can_alloc is true, fill any
+ * holes in the on-disk metadata.
  */
 static int
 xfs_qm_dqread(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
 	uint			type,
-	uint			flags,
+	bool			can_alloc,
 	struct xfs_dquot	**dqpp)
 {
 	struct xfs_dquot	*dqp;
@@ -603,7 +602,7 @@ xfs_qm_dqread(
 
 	/* Try to read the buffer, allocating if necessary. */
 	error = xfs_dquot_disk_read(mp, dqp, &bp);
-	if (error == -ENOENT && (flags & XFS_QMOPT_DQALLOC))
+	if (error == -ENOENT && can_alloc)
 		error = xfs_qm_dqread_alloc(mp, dqp, &bp);
 	if (error)
 		goto err;
@@ -793,7 +792,7 @@ xfs_qm_dqget(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
 	uint			type,
-	uint			flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
+	bool			can_alloc,
 	struct xfs_dquot	**O_dqpp)
 {
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
@@ -812,7 +811,7 @@ restart:
 		return 0;
 	}
 
-	error = xfs_qm_dqread(mp, id, type, flags, &dqp);
+	error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
 	if (error)
 		return error;
 
@@ -889,16 +888,12 @@ xfs_qm_dqget_inode(
 	struct radix_tree_root	*tree = xfs_dquot_tree(qi, type);
 	struct xfs_dquot	*dqp;
 	xfs_dqid_t		id;
-	uint			flags = 0;
 	int			error;
 
 	error = xfs_qm_dqget_checks(mp, type);
 	if (error)
 		return error;
 
-	if (can_alloc)
-		flags |= XFS_QMOPT_DQALLOC;
-
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(xfs_inode_dquot(ip, type) == NULL);
 
@@ -919,7 +914,7 @@ restart:
 	 * we re-acquire the lock.
 	 */
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	error = xfs_qm_dqread(mp, id, type, flags, &dqp);
+	error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (error)
 		return error;
@@ -978,7 +973,7 @@ xfs_qm_dqget_next(
 
 	*dqpp = NULL;
 	for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) {
-		error = xfs_qm_dqget(mp, id, type, 0, &dqp);
+		error = xfs_qm_dqget(mp, id, type, false, &dqp);
 		if (error == -ENOENT)
 			continue;
 		else if (error != 0)
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 38a42874a98c..113a16c37122 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -170,7 +170,7 @@ extern void		xfs_qm_adjust_dqlimits(struct xfs_mount *,
 extern xfs_dqid_t	xfs_qm_id_for_quotatype(struct xfs_inode *ip,
 					uint type);
 extern int		xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
-					uint type, uint flags,
+					uint type, bool can_alloc,
 					struct xfs_dquot **dqpp);
 extern int		xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
 					bool can_alloc,
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index fac649518101..50ba0ea6165f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -323,7 +323,7 @@ xfs_qm_need_dqattach(
 /*
  * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
  * into account.
- * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
+ * If @doalloc is true, the dquot(s) will be allocated if needed.
  * Inode may get unlocked and relocked in here, and the caller must deal with
  * the consequences.
  */
@@ -1067,7 +1067,7 @@ xfs_qm_quotacheck_dqadjust(
 	int			error;
 
 	id = xfs_qm_id_for_quotatype(ip, type);
-	error = xfs_qm_dqget(mp, id, type, XFS_QMOPT_DQALLOC, &dqp);
+	error = xfs_qm_dqget(mp, id, type, true, &dqp);
 	if (error) {
 		/*
 		 * Shouldn't be able to turn off quotas here.
@@ -1680,8 +1680,7 @@ xfs_qm_vop_dqalloc(
 			 * holding ilock.
 			 */
 			xfs_iunlock(ip, lockflags);
-			error = xfs_qm_dqget(mp, uid, XFS_DQ_USER,
-					XFS_QMOPT_DQALLOC, &uq);
+			error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq);
 			if (error) {
 				ASSERT(error != -ENOENT);
 				return error;
@@ -1704,8 +1703,7 @@ xfs_qm_vop_dqalloc(
 	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
 		if (ip->i_d.di_gid != gid) {
 			xfs_iunlock(ip, lockflags);
-			error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP,
-					XFS_QMOPT_DQALLOC, &gq);
+			error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq);
 			if (error) {
 				ASSERT(error != -ENOENT);
 				goto error_rele;
@@ -1722,7 +1720,7 @@ xfs_qm_vop_dqalloc(
 		if (xfs_get_projid(ip) != prid) {
 			xfs_iunlock(ip, lockflags);
 			error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
-					XFS_QMOPT_DQALLOC, &pq);
+					true, &pq);
 			if (error) {
 				ASSERT(error != -ENOENT);
 				goto error_rele;
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 531e8224dcb6..36b89e2c5eb9 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -72,7 +72,7 @@ xfs_qm_statvfs(
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_dquot_t		*dqp;
 
-	if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
+	if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) {
 		xfs_fill_statvfs_from_dquot(statp, dqp);
 		xfs_qm_dqput(dqp);
 	}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index b9243f554697..3e05d300b14e 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -425,7 +425,7 @@ xfs_qm_scall_setqlim(
 	 * a reference to the dquot, so it's safe to do this unlock/lock without
 	 * it being reclaimed in the mean time.
 	 */
-	error = xfs_qm_dqget(mp, id, type, XFS_QMOPT_DQALLOC, &dqp);
+	error = xfs_qm_dqget(mp, id, type, true, &dqp);
 	if (error) {
 		ASSERT(error != -ENOENT);
 		goto out_unlock;
@@ -696,11 +696,10 @@ xfs_qm_scall_getquota(
 	int			error;
 
 	/*
-	 * Try to get the dquot. We don't want it allocated on disk, so
-	 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
-	 * exist, we'll get ENOENT back.
+	 * Try to get the dquot. We don't want it allocated on disk, so don't
+	 * set doalloc. If it doesn't exist, we'll get ENOENT back.
 	 */
-	error = xfs_qm_dqget(mp, id, type, 0, &dqp);
+	error = xfs_qm_dqget(mp, id, type, false, &dqp);
 	if (error)
 		return error;
 

From 28b9060bd80851d889152bc17a241fe1eddb1a0b Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:31:20 -0700
Subject: [PATCH 048/121] xfs: rename on-disk dquot counter zap functions

The function 'xfs_qm_dqiterate' doesn't iterate dquots at all, it
iterates all dquot blocks of a quota inode and clears the counters.
Therefore, change the name to something more descriptive so that we can
introduce a real dquot iterator later.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_qm.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 50ba0ea6165f..f927b7d72db1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -884,7 +884,7 @@ xfs_qm_reset_dqcounts(
 }
 
 STATIC int
-xfs_qm_dqiter_bufs(
+xfs_qm_reset_dqcounts_all(
 	struct xfs_mount	*mp,
 	xfs_dqid_t		firstid,
 	xfs_fsblock_t		bno,
@@ -952,11 +952,11 @@ xfs_qm_dqiter_bufs(
 }
 
 /*
- * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
- * caller supplied function for every chunk of dquots that we find.
+ * Iterate over all allocated dquot blocks in this quota inode, zeroing all
+ * counters for every chunk of dquots that we find.
  */
 STATIC int
-xfs_qm_dqiterate(
+xfs_qm_reset_dqcounts_buf(
 	struct xfs_mount	*mp,
 	struct xfs_inode	*qip,
 	uint			flags,
@@ -1032,7 +1032,7 @@ xfs_qm_dqiterate(
 			 * Iterate thru all the blks in the extent and
 			 * reset the counters of all the dquots inside them.
 			 */
-			error = xfs_qm_dqiter_bufs(mp, firstid,
+			error = xfs_qm_reset_dqcounts_all(mp, firstid,
 						   map[i].br_startblock,
 						   map[i].br_blockcount,
 						   flags, buffer_list);
@@ -1293,7 +1293,7 @@ xfs_qm_quotacheck(
 	 * We don't log our changes till later.
 	 */
 	if (uip) {
-		error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
+		error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA,
 					 &buffer_list);
 		if (error)
 			goto error_return;
@@ -1301,7 +1301,7 @@ xfs_qm_quotacheck(
 	}
 
 	if (gip) {
-		error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
+		error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA,
 					 &buffer_list);
 		if (error)
 			goto error_return;
@@ -1309,7 +1309,7 @@ xfs_qm_quotacheck(
 	}
 
 	if (pip) {
-		error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
+		error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA,
 					 &buffer_list);
 		if (error)
 			goto error_return;

From 554ba965407e90fed381a93f230b6ca675cfcd07 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Fri, 4 May 2018 15:31:21 -0700
Subject: [PATCH 049/121] xfs: refactor dquot iteration

Create a helper function to iterate all the dquots of a given type in
the system, and refactor the dquot scrub to use it.  This will get more
use in the quota repair code.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/scrub/quota.c | 56 ++++++++++++++++++++------------------------
 fs/xfs/xfs_dquot.c   | 32 +++++++++++++++++++++++++
 fs/xfs/xfs_dquot.h   |  5 ++++
 3 files changed, 63 insertions(+), 30 deletions(-)

diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 50415e8e5dd1..ba87c3aaa8b7 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -77,14 +77,20 @@ xfs_scrub_setup_quota(
 
 /* Quotas. */
 
+struct xfs_scrub_quota_info {
+	struct xfs_scrub_context	*sc;
+	xfs_dqid_t			last_id;
+};
+
 /* Scrub the fields in an individual quota item. */
-STATIC void
+STATIC int
 xfs_scrub_quota_item(
-	struct xfs_scrub_context	*sc,
-	uint				dqtype,
 	struct xfs_dquot		*dq,
-	xfs_dqid_t			id)
+	uint				dqtype,
+	void				*priv)
 {
+	struct xfs_scrub_quota_info	*sqi = priv;
+	struct xfs_scrub_context	*sc = sqi->sc;
 	struct xfs_mount		*mp = sc->mp;
 	struct xfs_disk_dquot		*d = &dq->q_core;
 	struct xfs_quotainfo		*qi = mp->m_quotainfo;
@@ -99,17 +105,18 @@ xfs_scrub_quota_item(
 	unsigned long long		icount;
 	unsigned long long		rcount;
 	xfs_ino_t			fs_icount;
-
-	offset = id / qi->qi_dqperchunk;
+	xfs_dqid_t			id = be32_to_cpu(d->d_id);
 
 	/*
-	 * We fed $id and DQNEXT into the xfs_qm_dqget call, which means
-	 * that the actual dquot we got must either have the same id or
-	 * the next higher id.
+	 * Except for the root dquot, the actual dquot we got must either have
+	 * the same or higher id as we saw before.
 	 */
-	if (id > be32_to_cpu(d->d_id))
+	offset = id / qi->qi_dqperchunk;
+	if (id && id <= sqi->last_id)
 		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
 
+	sqi->last_id = id;
+
 	/* Did we get the dquot type we wanted? */
 	if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES))
 		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
@@ -183,6 +190,8 @@ xfs_scrub_quota_item(
 		xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
 	if (id != 0 && rhard != 0 && rcount > rhard)
 		xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+
+	return 0;
 }
 
 /* Scrub all of a quota type's items. */
@@ -191,13 +200,12 @@ xfs_scrub_quota(
 	struct xfs_scrub_context	*sc)
 {
 	struct xfs_bmbt_irec		irec = { 0 };
+	struct xfs_scrub_quota_info	sqi;
 	struct xfs_mount		*mp = sc->mp;
 	struct xfs_inode		*ip;
 	struct xfs_quotainfo		*qi = mp->m_quotainfo;
-	struct xfs_dquot		*dq;
 	xfs_fileoff_t			max_dqid_off;
 	xfs_fileoff_t			off = 0;
-	xfs_dqid_t			id = 0;
 	uint				dqtype;
 	int				nimaps;
 	int				error = 0;
@@ -264,24 +272,12 @@ xfs_scrub_quota(
 		goto out;
 
 	/* Check all the quota items. */
-	while (id < ((xfs_dqid_t)-1ULL)) {
-		if (xfs_scrub_should_terminate(sc, &error))
-			break;
-
-		error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
-		if (error == -ENOENT)
-			break;
-		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
-				id * qi->qi_dqperchunk, &error))
-			break;
-
-		xfs_scrub_quota_item(sc, dqtype, dq, id);
-
-		id = be32_to_cpu(dq->q_core.d_id) + 1;
-		xfs_qm_dqput(dq);
-		if (!id)
-			break;
-	}
+	sqi.sc = sc;
+	sqi.last_id = 0;
+	error = xfs_qm_dqiterate(mp, dqtype, xfs_scrub_quota_item, &sqi);
+	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
+			sqi.last_id * qi->qi_dqperchunk, &error))
+		goto out;
 
 out:
 	/* We set sc->ip earlier, so make sure we clear it now. */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 85f9ffd99998..2567391489bd 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1267,3 +1267,35 @@ xfs_qm_exit(void)
 	kmem_zone_destroy(xfs_qm_dqtrxzone);
 	kmem_zone_destroy(xfs_qm_dqzone);
 }
+
+/*
+ * Iterate every dquot of a particular type.  The caller must ensure that the
+ * particular quota type is active.  iter_fn can return negative error codes,
+ * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating.
+ */
+int
+xfs_qm_dqiterate(
+	struct xfs_mount	*mp,
+	uint			dqtype,
+	xfs_qm_dqiterate_fn	iter_fn,
+	void			*priv)
+{
+	struct xfs_dquot	*dq;
+	xfs_dqid_t		id = 0;
+	int			error;
+
+	do {
+		error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
+		if (error == -ENOENT)
+			return 0;
+		if (error)
+			return error;
+
+		error = iter_fn(dq, dqtype, priv);
+		id = be32_to_cpu(dq->q_core.d_id);
+		xfs_qm_dqput(dq);
+		id++;
+	} while (error == 0 && id != 0);
+
+	return error;
+}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 113a16c37122..bdd6bd921528 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -194,4 +194,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
 	return dqp;
 }
 
+typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype,
+		void *priv);
+int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype,
+		xfs_qm_dqiterate_fn iter_fn, void *priv);
+
 #endif /* __XFS_DQUOT_H__ */

From eb41c93fef19ebcafbbe3c180ec41c21032b751e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:02:00 -0700
Subject: [PATCH 050/121] xfs: avoid ilock games in the quota scrubber

Refactor the quota scrubber to take the quotaofflock and grab the quota
inode in the setup function so that we can treat quota in the same
"scrub in the context of this inode" (i.e. sc->ip) manner as we treat
any other inode.  We do have to drop the quota inode's ILOCK_EXCL to use
dqiterate, but since dquots have their own individual locks the ILOCK
wasn't helping us anyway.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/scrub/quota.c | 53 +++++++++++++++++++++-----------------------
 fs/xfs/scrub/scrub.c |  4 ++++
 fs/xfs/scrub/scrub.h |  1 +
 3 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index ba87c3aaa8b7..d3d08978f53a 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -66,12 +66,24 @@ xfs_scrub_setup_quota(
 	struct xfs_inode		*ip)
 {
 	uint				dqtype;
+	int				error;
+
+	if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp))
+		return -ENOENT;
 
 	dqtype = xfs_scrub_quota_to_dqtype(sc);
 	if (dqtype == 0)
 		return -EINVAL;
+	sc->has_quotaofflock = true;
+	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
 	if (!xfs_this_quota_on(sc->mp, dqtype))
 		return -ENOENT;
+	error = xfs_scrub_setup_fs(sc, ip);
+	if (error)
+		return error;
+	sc->ip = xfs_quota_inode(sc->mp, dqtype);
+	xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+	sc->ilock_flags = XFS_ILOCK_EXCL;
 	return 0;
 }
 
@@ -202,7 +214,6 @@ xfs_scrub_quota(
 	struct xfs_bmbt_irec		irec = { 0 };
 	struct xfs_scrub_quota_info	sqi;
 	struct xfs_mount		*mp = sc->mp;
-	struct xfs_inode		*ip;
 	struct xfs_quotainfo		*qi = mp->m_quotainfo;
 	xfs_fileoff_t			max_dqid_off;
 	xfs_fileoff_t			off = 0;
@@ -210,25 +221,12 @@ xfs_scrub_quota(
 	int				nimaps;
 	int				error = 0;
 
-	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
-		return -ENOENT;
-
-	mutex_lock(&qi->qi_quotaofflock);
 	dqtype = xfs_scrub_quota_to_dqtype(sc);
-	if (!xfs_this_quota_on(sc->mp, dqtype)) {
-		error = -ENOENT;
-		goto out_unlock_quota;
-	}
-
-	/* Attach to the quota inode and set sc->ip so that reporting works. */
-	ip = xfs_quota_inode(sc->mp, dqtype);
-	sc->ip = ip;
 
 	/* Look for problem extents. */
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+	if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
 		xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
-		goto out_unlock_inode;
+		goto out;
 	}
 	max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
 	while (1) {
@@ -237,11 +235,11 @@ xfs_scrub_quota(
 
 		off = irec.br_startoff + irec.br_blockcount;
 		nimaps = 1;
-		error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps,
+		error = xfs_bmapi_read(sc->ip, off, -1, &irec, &nimaps,
 				XFS_BMAPI_ENTIRE);
 		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off,
 				&error))
-			goto out_unlock_inode;
+			goto out;
 		if (!nimaps)
 			break;
 		if (irec.br_startblock == HOLESTARTBLOCK)
@@ -267,26 +265,25 @@ xfs_scrub_quota(
 		    irec.br_startoff + irec.br_blockcount > max_dqid_off + 1)
 			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
 	}
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
 		goto out;
 
-	/* Check all the quota items. */
+	/*
+	 * Check all the quota items.  Now that we've checked the quota inode
+	 * data fork we have to drop ILOCK_EXCL to use the regular dquot
+	 * functions.
+	 */
+	xfs_iunlock(sc->ip, sc->ilock_flags);
+	sc->ilock_flags = 0;
 	sqi.sc = sc;
 	sqi.last_id = 0;
 	error = xfs_qm_dqiterate(mp, dqtype, xfs_scrub_quota_item, &sqi);
+	sc->ilock_flags = XFS_ILOCK_EXCL;
+	xfs_ilock(sc->ip, sc->ilock_flags);
 	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
 			sqi.last_id * qi->qi_dqperchunk, &error))
 		goto out;
 
 out:
-	/* We set sc->ip earlier, so make sure we clear it now. */
-	sc->ip = NULL;
-out_unlock_quota:
-	mutex_unlock(&qi->qi_quotaofflock);
 	return error;
-
-out_unlock_inode:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	goto out;
 }
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 26c75967a072..67509d8b8b8e 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -42,6 +42,8 @@
 #include "xfs_refcount_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -166,6 +168,8 @@ xfs_scrub_teardown(
 			iput(VFS_I(sc->ip));
 		sc->ip = NULL;
 	}
+	if (sc->has_quotaofflock)
+		mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
 	if (sc->buf) {
 		kmem_free(sc->buf);
 		sc->buf = NULL;
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 0d92af86f67a..5d797319fc9a 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -73,6 +73,7 @@ struct xfs_scrub_context {
 	void				*buf;
 	uint				ilock_flags;
 	bool				try_harder;
+	bool				has_quotaofflock;
 
 	/* State tracking for single-AG operations. */
 	struct xfs_scrub_ag		sa;

From 631fc955bdc86c3fed5880cba80c663d1b32e0c2 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:02:00 -0700
Subject: [PATCH 051/121] xfs: clean up scrub usage of KM_NOFS

All scrub code runs in transaction context, which means that memory
allocations are automatically run in PF_MEMALLOC_NOFS context.  It's
therefore unnecessary to pass in KM_NOFS to allocation routines, so
clean them all out.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/scrub/agheader.c | 3 ++-
 fs/xfs/scrub/btree.c    | 2 +-
 fs/xfs/scrub/refcount.c | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 018aabbd9394..08a1f013d92c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -797,7 +797,8 @@ xfs_scrub_agfl(
 	}
 	memset(&sai, 0, sizeof(sai));
 	sai.sz_entries = agflcount;
-	sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS);
+	sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount,
+			KM_MAYFAIL);
 	if (!sai.entries) {
 		error = -ENOMEM;
 		goto out;
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 54218168c8f9..ea972dab1485 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -442,7 +442,7 @@ xfs_scrub_btree_check_owner(
 	 */
 	if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
 		co = kmem_alloc(sizeof(struct check_owner),
-				KM_MAYFAIL | KM_NOFS);
+				KM_MAYFAIL);
 		if (!co)
 			return -ENOMEM;
 		co->level = level;
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 400f1561cd3d..d86526d2932c 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -150,7 +150,7 @@ xfs_scrub_refcountbt_rmap_check(
 		 * so we don't need insertion sort here.
 		 */
 		frag = kmem_alloc(sizeof(struct xfs_scrub_refcnt_frag),
-				KM_MAYFAIL | KM_NOFS);
+				KM_MAYFAIL);
 		if (!frag)
 			return -ENOMEM;
 		memcpy(&frag->rm, rec, sizeof(frag->rm));

From 08a3a692ef586cc82648b88856a97e0d173319e2 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:02:00 -0700
Subject: [PATCH 052/121] xfs: btree scrub should check minrecs

Strengthen the btree block header checks to detect the number of records
being less than the btree type's minimum record count.  Certain blocks
are allowed to violate this constraint -- specifically any btree block
at the top of the tree can have fewer than minrecs records.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/scrub/btree.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index ea972dab1485..2d29dceaa00e 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -454,6 +454,44 @@ xfs_scrub_btree_check_owner(
 	return xfs_scrub_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp));
 }
 
+/*
+ * Check that this btree block has at least minrecs records or is one of the
+ * special blocks that don't require that.
+ */
+STATIC void
+xfs_scrub_btree_check_minrecs(
+	struct xfs_scrub_btree	*bs,
+	int			level,
+	struct xfs_btree_block	*block)
+{
+	unsigned int		numrecs;
+	int			ok_level;
+
+	numrecs = be16_to_cpu(block->bb_numrecs);
+
+	/* More records than minrecs means the block is ok. */
+	if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level))
+		return;
+
+	/*
+	 * Certain btree blocks /can/ have fewer than minrecs records.  Any
+	 * level greater than or equal to the level of the highest dedicated
+	 * btree block are allowed to violate this constraint.
+	 *
+	 * For a btree rooted in a block, the btree root can have fewer than
+	 * minrecs records.  If the btree is rooted in an inode and does not
+	 * store records in the root, the direct children of the root and the
+	 * root itself can have fewer than minrecs records.
+	 */
+	ok_level = bs->cur->bc_nlevels - 1;
+	if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+		ok_level--;
+	if (level >= ok_level)
+		return;
+
+	xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level);
+}
+
 /*
  * Grab and scrub a btree block given a btree pointer.  Returns block
  * and buffer pointers (if applicable) if they're ok to use.
@@ -491,6 +529,8 @@ xfs_scrub_btree_get_block(
 	if (*pbp)
 		xfs_scrub_buffer_recheck(bs->sc, *pbp);
 
+	xfs_scrub_btree_check_minrecs(bs, level, *pblock);
+
 	/*
 	 * Check the block's owner; this function absorbs error codes
 	 * for us.

From 9d9c90286a74decf11caa9dd625f862ae0257ce0 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:02:01 -0700
Subject: [PATCH 053/121] xfs: refactor scrub transaction allocation function

Since the transaction allocation helper is about to become more complex,
move it to common.c and remove the redundant parameters.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/scrub/bmap.c   |  3 +--
 fs/xfs/scrub/common.c | 16 +++++++++++++---
 fs/xfs/scrub/common.h | 14 +-------------
 fs/xfs/scrub/inode.c  |  5 ++---
 4 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 639d14b51e90..3f8fd10160f0 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -51,7 +51,6 @@ xfs_scrub_setup_inode_bmap(
 	struct xfs_scrub_context	*sc,
 	struct xfs_inode		*ip)
 {
-	struct xfs_mount		*mp = sc->mp;
 	int				error;
 
 	error = xfs_scrub_get_inode(sc, ip);
@@ -75,7 +74,7 @@ xfs_scrub_setup_inode_bmap(
 	}
 
 	/* Got the inode, lock it and we're ready to go. */
-	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	error = xfs_scrub_trans_alloc(sc);
 	if (error)
 		goto out;
 	sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 8ed91d5c868d..95625aa90c24 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -568,13 +568,24 @@ xfs_scrub_ag_init(
 
 /* Per-scrubber setup functions */
 
+/*
+ * Grab an empty transaction so that we can re-grab locked buffers if
+ * one of our btrees turns out to be cyclic.
+ */
+int
+xfs_scrub_trans_alloc(
+	struct xfs_scrub_context	*sc)
+{
+	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+}
+
 /* Set us up with a transaction and an empty context. */
 int
 xfs_scrub_setup_fs(
 	struct xfs_scrub_context	*sc,
 	struct xfs_inode		*ip)
 {
-	return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
+	return xfs_scrub_trans_alloc(sc);
 }
 
 /* Set us up with AG headers and btree cursors. */
@@ -695,7 +706,6 @@ xfs_scrub_setup_inode_contents(
 	struct xfs_inode		*ip,
 	unsigned int			resblks)
 {
-	struct xfs_mount		*mp = sc->mp;
 	int				error;
 
 	error = xfs_scrub_get_inode(sc, ip);
@@ -705,7 +715,7 @@ xfs_scrub_setup_inode_contents(
 	/* Got the inode, lock it and we're ready to go. */
 	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	xfs_ilock(sc->ip, sc->ilock_flags);
-	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	error = xfs_scrub_trans_alloc(sc);
 	if (error)
 		goto out;
 	sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index deaf60400981..c95c30c986b7 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -38,19 +38,7 @@ xfs_scrub_should_terminate(
 	return false;
 }
 
-/*
- * Grab an empty transaction so that we can re-grab locked buffers if
- * one of our btrees turns out to be cyclic.
- */
-static inline int
-xfs_scrub_trans_alloc(
-	struct xfs_scrub_metadata	*sm,
-	struct xfs_mount		*mp,
-	struct xfs_trans		**tpp)
-{
-	return xfs_trans_alloc_empty(mp, tpp);
-}
-
+int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc);
 bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
 		xfs_agblock_t bno, int *error);
 bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index df14930e4fc5..e15b1bc5053f 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -55,7 +55,6 @@ xfs_scrub_setup_inode(
 	struct xfs_scrub_context	*sc,
 	struct xfs_inode		*ip)
 {
-	struct xfs_mount		*mp = sc->mp;
 	int				error;
 
 	/*
@@ -68,7 +67,7 @@ xfs_scrub_setup_inode(
 		break;
 	case -EFSCORRUPTED:
 	case -EFSBADCRC:
-		return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+		return xfs_scrub_trans_alloc(sc);
 	default:
 		return error;
 	}
@@ -76,7 +75,7 @@ xfs_scrub_setup_inode(
 	/* Got the inode, lock it and we're ready to go. */
 	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	xfs_ilock(sc->ip, sc->ilock_flags);
-	error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+	error = xfs_scrub_trans_alloc(sc);
 	if (error)
 		goto out;
 	sc->ilock_flags |= XFS_ILOCK_EXCL;

From 14861c47400b4a1669956d8b027fe4b7855e39f1 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:02:01 -0700
Subject: [PATCH 054/121] xfs: add helpers to calculate btree size

Add a bunch of helper functions that calculate the sizes of various
btrees.  These will be used to repair btrees and btree headers.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_alloc_btree.c  | 9 +++++++++
 fs/xfs/libxfs/xfs_alloc_btree.h  | 2 ++
 fs/xfs/libxfs/xfs_bmap_btree.c   | 9 +++++++++
 fs/xfs/libxfs/xfs_bmap_btree.h   | 3 +++
 fs/xfs/libxfs/xfs_btree.c        | 4 ++--
 fs/xfs/libxfs/xfs_btree.h        | 2 +-
 fs/xfs/libxfs/xfs_ialloc_btree.c | 9 +++++++++
 fs/xfs/libxfs/xfs_ialloc_btree.h | 2 ++
 8 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index b451649ba176..18aec7a0e599 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -547,3 +547,12 @@ xfs_allocbt_maxrecs(
 		return blocklen / sizeof(xfs_alloc_rec_t);
 	return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
 }
+
+/* Calculate the freespace btree size for some records. */
+xfs_extlen_t
+xfs_allocbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp->m_alloc_mnr, len);
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45e189e7e81c..2fd54728871c 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -61,5 +61,7 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_buf *,
 		xfs_agnumber_t, xfs_btnum_t);
 extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
 
 #endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index d89d06bea6e3..ac9d4aeedb09 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -660,3 +660,12 @@ xfs_bmbt_change_owner(
 	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
 	return error;
 }
+
+/* Calculate the bmap btree size for some records. */
+unsigned long long
+xfs_bmbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp->m_bmap_dmnr, len);
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index e4505746ccaa..fb3cd2d9e0f8 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -118,4 +118,7 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_inode *, int);
 
+extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
+
 #endif	/* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index ac7d66427e42..2a8e01a05631 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4836,14 +4836,14 @@ xfs_btree_query_all(
  * Calculate the number of blocks needed to store a given number of records
  * in a short-format (per-AG metadata) btree.
  */
-xfs_extlen_t
+unsigned long long
 xfs_btree_calc_size(
 	uint			*limits,
 	unsigned long long	len)
 {
 	int			level;
 	int			maxrecs;
-	xfs_extlen_t		rval;
+	unsigned long long	rval;
 
 	maxrecs = limits[0];
 	for (level = 0, rval = 0; len > 1; level++) {
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 9227159a751e..3236b3a8c6bd 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -482,7 +482,7 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
 		unsigned int max_recs);
 
 uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
-xfs_extlen_t xfs_btree_calc_size(uint *limits, unsigned long long len);
+unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
 
 /* return codes */
 #define XFS_BTREE_QUERY_RANGE_CONTINUE	0	/* keep iterating */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 367e9a0726e6..ba053a4c124f 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -608,3 +608,12 @@ xfs_finobt_calc_reserves(
 	*used += tree_len;
 	return 0;
 }
+
+/* Calculate the inobt btree size for some records. */
+xfs_extlen_t
+xfs_iallocbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp->m_inobt_mnr, len);
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index aa81e2e63f3f..4acdd5458d59 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -74,5 +74,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
 
 int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
 		xfs_extlen_t *ask, xfs_extlen_t *used);
+extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
 
 #endif	/* __XFS_IALLOC_BTREE_H__ */

From 7f8f1313d91a7db9546de6e5bfeb1a2eebb1fef5 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:02:02 -0700
Subject: [PATCH 055/121] xfs: expose various functions to repair code

Expose various helpers that the repair code will want to use.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_ialloc.c   | 2 +-
 fs/xfs/libxfs/xfs_ialloc.h   | 3 +++
 fs/xfs/libxfs/xfs_refcount.c | 4 ++--
 fs/xfs/libxfs/xfs_refcount.h | 5 +++++
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index de627fa19168..4ca4ff7a757d 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -148,7 +148,7 @@ xfs_inobt_get_rec(
 /*
  * Insert a single inobt record. Cursor must already point to desired location.
  */
-STATIC int
+int
 xfs_inobt_insert_rec(
 	struct xfs_btree_cur	*cur,
 	uint16_t		holemask,
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index c5402bb4ce0c..77fffced8bac 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -176,6 +176,9 @@ int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low,
 		xfs_agino_t high, bool *exists);
 int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count,
 		xfs_agino_t *freecount);
+int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
+		uint8_t count, int32_t freecount, xfs_inofree_t free,
+		int *stat);
 
 int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
 void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 560e28473024..f94bac7a6555 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -89,7 +89,7 @@ xfs_refcount_lookup_ge(
 }
 
 /* Convert on-disk record to in-core format. */
-static inline void
+void
 xfs_refcount_btrec_to_irec(
 	union xfs_btree_rec		*rec,
 	struct xfs_refcount_irec	*irec)
@@ -149,7 +149,7 @@ xfs_refcount_update(
  * by [bno, len, refcount].
  * This either works (return 0) or gets an EFSCORRUPTED error.
  */
-STATIC int
+int
 xfs_refcount_insert(
 	struct xfs_btree_cur		*cur,
 	struct xfs_refcount_irec	*irec,
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 2a731ac68fe4..5856abb265ec 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -85,5 +85,10 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
 
 extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
 		xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
+union xfs_btree_rec;
+extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec,
+		struct xfs_refcount_irec *irec);
+extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
+		struct xfs_refcount_irec *irec, int *stat);
 
 #endif	/* __XFS_REFCOUNT_H__ */

From 4d4f86b49fd0d88677ce45c9cc544cdf663bf047 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:02:02 -0700
Subject: [PATCH 056/121] xfs: add repair helpers for the reverse mapping btree

Add a couple of functions to the reverse mapping btree that will be used
to repair the rmapbt.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_rmap.c | 81 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_rmap.h |  4 ++
 2 files changed, 85 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index f7769edf5b68..c0644f1be8a8 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2032,6 +2032,34 @@ out_error:
 	return error;
 }
 
+/* Insert a raw rmap into the rmapbt. */
+int
+xfs_rmap_map_raw(
+	struct xfs_btree_cur	*cur,
+	struct xfs_rmap_irec	*rmap)
+{
+	struct xfs_owner_info	oinfo;
+
+	oinfo.oi_owner = rmap->rm_owner;
+	oinfo.oi_offset = rmap->rm_offset;
+	oinfo.oi_flags = 0;
+	if (rmap->rm_flags & XFS_RMAP_ATTR_FORK)
+		oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+	if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK)
+		oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+
+	if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+		return xfs_rmap_map(cur, rmap->rm_startblock,
+				rmap->rm_blockcount,
+				rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+				&oinfo);
+
+	return xfs_rmap_map_shared(cur, rmap->rm_startblock,
+			rmap->rm_blockcount,
+			rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+			&oinfo);
+}
+
 struct xfs_rmap_query_range_info {
 	xfs_rmap_query_range_fn	fn;
 	void				*priv;
@@ -2455,3 +2483,56 @@ xfs_rmap_record_exists(
 		     irec.rm_startblock + irec.rm_blockcount >= bno + len);
 	return 0;
 }
+
+struct xfs_rmap_key_state {
+	uint64_t			owner;
+	uint64_t			offset;
+	unsigned int			flags;
+	bool				has_rmap;
+};
+
+/* For each rmap given, figure out if it doesn't match the key we want. */
+STATIC int
+xfs_rmap_has_other_keys_helper(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_rmap_key_state	*rks = priv;
+
+	if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset &&
+	    ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
+		return 0;
+	rks->has_rmap = true;
+	return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Given an extent and some owner info, can we find records overlapping
+ * the extent whose owner info does not match the given owner?
+ */
+int
+xfs_rmap_has_other_keys(
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	struct xfs_owner_info		*oinfo,
+	bool				*has_rmap)
+{
+	struct xfs_rmap_irec		low = {0};
+	struct xfs_rmap_irec		high;
+	struct xfs_rmap_key_state	rks;
+	int				error;
+
+	xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
+	rks.has_rmap = false;
+
+	low.rm_startblock = bno;
+	memset(&high, 0xFF, sizeof(high));
+	high.rm_startblock = bno + len - 1;
+
+	error = xfs_rmap_query_range(cur, &low, &high,
+			xfs_rmap_has_other_keys_helper, &rks);
+	*has_rmap = rks.has_rmap;
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 380e53be98d5..43e506f67680 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -238,5 +238,9 @@ int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		xfs_extlen_t len, struct xfs_owner_info *oinfo,
 		bool *has_rmap);
+int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+		xfs_extlen_t len, struct xfs_owner_info *oinfo,
+		bool *has_rmap);
+int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
 
 #endif	/* __XFS_RMAP_H__ */

From 08daa3ccf541b8cc59d198daaccefae17fe565ae Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:02:03 -0700
Subject: [PATCH 057/121] xfs: add repair helpers for the reference count btree

Add a couple of functions to the refcount btree and generic btree code
that will be used to repair the refcountbt.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_btree.c    | 21 +++++++++++++++++++++
 fs/xfs/libxfs/xfs_btree.h    |  1 +
 fs/xfs/libxfs/xfs_refcount.c | 17 +++++++++++++++++
 fs/xfs/libxfs/xfs_refcount.h |  2 ++
 4 files changed, 41 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 2a8e01a05631..c825c8182b30 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4919,3 +4919,24 @@ xfs_btree_has_record(
 	*exists = false;
 	return error;
 }
+
+/* Are there more records in this btree? */
+bool
+xfs_btree_has_more_records(
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_btree_block	*block;
+	struct xfs_buf		*bp;
+
+	block = xfs_btree_get_block(cur, 0, &bp);
+
+	/* There are still records in this block. */
+	if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block))
+		return true;
+
+	/* There are more record blocks. */
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
+	else
+		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3236b3a8c6bd..d7911efee6dc 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -528,5 +528,6 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
 		union xfs_btree_key *key);
 int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
 		union xfs_btree_irec *high, bool *exists);
+bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
 
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index f94bac7a6555..0f23371227cc 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -88,6 +88,23 @@ xfs_refcount_lookup_ge(
 	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
 }
 
+/*
+ * Look up the first record equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_eq(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	int			*stat)
+{
+	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+			XFS_LOOKUP_LE);
+	cur->bc_rec.rc.rc_startblock = bno;
+	cur->bc_rec.rc.rc_blockcount = 0;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
 /* Convert on-disk record to in-core format. */
 void
 xfs_refcount_btrec_to_irec(
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 5856abb265ec..a92ad9078bc1 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -24,6 +24,8 @@ extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
 		xfs_agblock_t bno, int *stat);
 extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
 		xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur,
+		xfs_agblock_t bno, int *stat);
 extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
 		struct xfs_refcount_irec *irec, int *stat);
 

From 95eb308caa0ff7c4a0a86053422934737e6e6dc7 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:02:32 -0700
Subject: [PATCH 058/121] xfs: add BMAPI_NORMAP flag to perform block remapping
 without updating rmapbt

Add a new flag, XFS_BMAPI_NORMAP, which will perform file block
remapping without updating the rmapbt.  This will be used by the repair
code to reconstruct bmbts from the rmapbt, in which case we don't want
the rmapbt update.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 29 ++++++++++++++++++-----------
 fs/xfs/libxfs/xfs_bmap.h |  6 +++++-
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 3d7c53db1977..c7ea3d6f89f1 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -2005,10 +2005,13 @@ xfs_bmap_add_extent_delay_real(
 		ASSERT(0);
 	}
 
-	/* add reverse mapping */
-	error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
-	if (error)
-		goto done;
+	/* add reverse mapping unless caller opted out */
+	if (!(bma->flags & XFS_BMAPI_NORMAP)) {
+		error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip,
+				whichfork, new);
+		if (error)
+			goto done;
+	}
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2672,7 +2675,8 @@ xfs_bmap_add_extent_hole_real(
 	struct xfs_bmbt_irec	*new,
 	xfs_fsblock_t		*first,
 	struct xfs_defer_ops	*dfops,
-	int			*logflagsp)
+	int			*logflagsp,
+	int			flags)
 {
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -2849,10 +2853,12 @@ xfs_bmap_add_extent_hole_real(
 		break;
 	}
 
-	/* add reverse mapping */
-	error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
-	if (error)
-		goto done;
+	/* add reverse mapping unless caller opted out */
+	if (!(flags & XFS_BMAPI_NORMAP)) {
+		error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
+		if (error)
+			goto done;
+	}
 
 	/* convert to a btree if necessary */
 	if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -4127,7 +4133,8 @@ xfs_bmapi_allocate(
 	else
 		error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
 				whichfork, &bma->icur, &bma->cur, &bma->got,
-				bma->firstblock, bma->dfops, &bma->logflags);
+				bma->firstblock, bma->dfops, &bma->logflags,
+				bma->flags);
 
 	bma->logflags |= tmp_logflags;
 	if (error)
@@ -4573,7 +4580,7 @@ xfs_bmapi_remap(
 	got.br_state = XFS_EXT_NORM;
 
 	error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur,
-			&cur, &got, &firstblock, dfops, &logflags);
+			&cur, &got, &firstblock, dfops, &logflags, 0);
 	if (error)
 		goto error0;
 
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8d8946bfdd8c..c46b73d1cb5b 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -120,6 +120,9 @@ struct xfs_extent_free_item
 /* Skip online discard of freed extents */
 #define XFS_BMAPI_NODISCARD	0x1000
 
+/* Do not update the rmap btree.  Used for reconstructing bmbt from rmapbt. */
+#define XFS_BMAPI_NORMAP	0x2000
+
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
@@ -133,7 +136,8 @@ struct xfs_extent_free_item
 	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
 	{ XFS_BMAPI_DELALLOC,	"DELALLOC" }, \
 	{ XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
-	{ XFS_BMAPI_NODISCARD,	"NODISCARD" }
+	{ XFS_BMAPI_NODISCARD,	"NODISCARD" }, \
+	{ XFS_BMAPI_NORMAP,	"NORMAP" }
 
 
 static inline int xfs_bmapi_aflag(int w)

From d6b636ebb1c9f412687a7b6ed97c996247cc2380 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 9 May 2018 10:03:56 -0700
Subject: [PATCH 059/121] xfs: halt auto-reclamation activities while
 rebuilding rmap

Rebuilding the reverse-mapping tree requires us to quiesce all inodes in
the filesystem, so we must stop background reclamation of post-EOF and
CoW prealloc blocks.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_icache.c | 18 ++++++++++++++++++
 fs/xfs/xfs_icache.h |  3 +++
 fs/xfs/xfs_mount.c  |  4 +---
 fs/xfs/xfs_super.c  | 18 +++++++++---------
 4 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9deff136c5b9..164350d91efc 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1826,3 +1826,21 @@ xfs_inode_clear_cowblocks_tag(
 	return __xfs_inode_clear_blocks_tag(ip,
 			trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
 }
+
+/* Disable post-EOF and CoW block auto-reclamation. */
+void
+xfs_icache_disable_reclaim(
+	struct xfs_mount	*mp)
+{
+	cancel_delayed_work_sync(&mp->m_eofblocks_work);
+	cancel_delayed_work_sync(&mp->m_cowblocks_work);
+}
+
+/* Enable post-EOF and CoW block auto-reclamation. */
+void
+xfs_icache_enable_reclaim(
+	struct xfs_mount	*mp)
+{
+	xfs_queue_eofblocks(mp);
+	xfs_queue_cowblocks(mp);
+}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index d4a77588eca1..d69a0f5a6a73 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -131,4 +131,7 @@ xfs_fs_eofblocks_from_user(
 int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
 				  xfs_ino_t ino, bool *inuse);
 
+void xfs_icache_disable_reclaim(struct xfs_mount *mp);
+void xfs_icache_enable_reclaim(struct xfs_mount *mp);
+
 #endif
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a901b86772f8..73ed8fec0328 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1072,9 +1072,7 @@ xfs_unmountfs(
 	uint64_t		resblks;
 	int			error;
 
-	cancel_delayed_work_sync(&mp->m_eofblocks_work);
-	cancel_delayed_work_sync(&mp->m_cowblocks_work);
-
+	xfs_icache_disable_reclaim(mp);
 	xfs_fs_unreserve_ag_blocks(mp);
 	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 5726ef496980..84aefc81c18e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1372,7 +1372,6 @@ xfs_fs_remount(
 		 */
 		xfs_restore_resvblks(mp);
 		xfs_log_work_queue(mp);
-		xfs_queue_eofblocks(mp);
 
 		/* Recover any CoW blocks that never got remapped. */
 		error = xfs_reflink_recover_cow(mp);
@@ -1382,7 +1381,7 @@ xfs_fs_remount(
 			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 			return error;
 		}
-		xfs_queue_cowblocks(mp);
+		xfs_icache_enable_reclaim(mp);
 
 		/* Create the per-AG metadata reservation pool .*/
 		error = xfs_fs_reserve_ag_blocks(mp);
@@ -1392,8 +1391,13 @@ xfs_fs_remount(
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) {
+		/*
+		 * Cancel background eofb scanning so it cannot race with the
+		 * final log force+buftarg wait and deadlock the remount.
+		 */
+		xfs_icache_disable_reclaim(mp);
+
 		/* Get rid of any leftover CoW reservations... */
-		cancel_delayed_work_sync(&mp->m_cowblocks_work);
 		error = xfs_icache_free_cowblocks(mp, NULL);
 		if (error) {
 			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1416,12 +1420,6 @@ xfs_fs_remount(
 		 */
 		xfs_save_resvblks(mp);
 
-		/*
-		 * Cancel background eofb scanning so it cannot race with the
-		 * final log force+buftarg wait and deadlock the remount.
-		 */
-		cancel_delayed_work_sync(&mp->m_eofblocks_work);
-
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
 	}
@@ -1441,6 +1439,7 @@ xfs_fs_freeze(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	xfs_icache_disable_reclaim(mp);
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
 	return xfs_sync_sb(mp, true);
@@ -1454,6 +1453,7 @@ xfs_fs_unfreeze(
 
 	xfs_restore_resvblks(mp);
 	xfs_log_work_queue(mp);
+	xfs_icache_enable_reclaim(mp);
 	return 0;
 }
 

From 67482129cdabf7cede1301d2415ef4f0156d35cd Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 10 May 2018 08:38:15 -0700
Subject: [PATCH 060/121] iomap: add a swapfile activation function

Add a new iomap_swapfile_activate function so that filesystems can
activate swap files without having to use the obsolete and slow bmap
function.  This enables XFS to support fallocate'd swap files and
swap files on realtime devices.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/iomap.c            | 162 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_aops.c     |  12 ++++
 include/linux/iomap.h |  11 +++
 3 files changed, 185 insertions(+)

diff --git a/fs/iomap.c b/fs/iomap.c
index bcfd7f3654d4..d193390a1c20 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -27,6 +27,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/dax.h>
 #include <linux/sched/signal.h>
+#include <linux/swap.h>
 
 #include "internal.h"
 
@@ -1139,3 +1140,164 @@ out_free_dio:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+/* Swapfile activation */
+
+#ifdef CONFIG_SWAP
+struct iomap_swapfile_info {
+	struct iomap iomap;		/* accumulated iomap */
+	struct swap_info_struct *sis;
+	uint64_t lowest_ppage;		/* lowest physical addr seen (pages) */
+	uint64_t highest_ppage;		/* highest physical addr seen (pages) */
+	unsigned long nr_pages;		/* number of pages collected */
+	int nr_extents;			/* extent count */
+};
+
+/*
+ * Collect physical extents for this swap file.  Physical extents reported to
+ * the swap code must be trimmed to align to a page boundary.  The logical
+ * offset within the file is irrelevant since the swapfile code maps logical
+ * page numbers of the swap device to the physical page-aligned extents.
+ */
+static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
+{
+	struct iomap *iomap = &isi->iomap;
+	unsigned long nr_pages;
+	uint64_t first_ppage;
+	uint64_t first_ppage_reported;
+	uint64_t next_ppage;
+	int error;
+
+	/*
+	 * Round the start up and the end down so that the physical
+	 * extent aligns to a page boundary.
+	 */
+	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
+	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
+			PAGE_SHIFT;
+
+	/* Skip too-short physical extents. */
+	if (first_ppage >= next_ppage)
+		return 0;
+	nr_pages = next_ppage - first_ppage;
+
+	/*
+	 * Calculate how much swap space we're adding; the first page contains
+	 * the swap header and doesn't count.  The mm still wants that first
+	 * page fed to add_swap_extent, however.
+	 */
+	first_ppage_reported = first_ppage;
+	if (iomap->offset == 0)
+		first_ppage_reported++;
+	if (isi->lowest_ppage > first_ppage_reported)
+		isi->lowest_ppage = first_ppage_reported;
+	if (isi->highest_ppage < (next_ppage - 1))
+		isi->highest_ppage = next_ppage - 1;
+
+	/* Add extent, set up for the next call. */
+	error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
+	if (error < 0)
+		return error;
+	isi->nr_extents += error;
+	isi->nr_pages += nr_pages;
+	return 0;
+}
+
+/*
+ * Accumulate iomaps for this swap file.  We have to accumulate iomaps because
+ * swap only cares about contiguous page-aligned physical extents and makes no
+ * distinction between written and unwritten extents.
+ */
+static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
+		loff_t count, void *data, struct iomap *iomap)
+{
+	struct iomap_swapfile_info *isi = data;
+	int error;
+
+	/* Skip holes. */
+	if (iomap->type == IOMAP_HOLE)
+		goto out;
+
+	/* Only one bdev per swap file. */
+	if (iomap->bdev != isi->sis->bdev)
+		goto err;
+
+	/* Only real or unwritten extents. */
+	if (iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_UNWRITTEN)
+		goto err;
+
+	/* No uncommitted metadata or shared blocks or inline data. */
+	if (iomap->flags & (IOMAP_F_DIRTY | IOMAP_F_SHARED |
+			    IOMAP_F_DATA_INLINE))
+		goto err;
+
+	/* No null physical addresses. */
+	if (iomap->addr == IOMAP_NULL_ADDR)
+		goto err;
+
+	if (isi->iomap.length == 0) {
+		/* No accumulated extent, so just store it. */
+		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+	} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
+		/* Append this to the accumulated extent. */
+		isi->iomap.length += iomap->length;
+	} else {
+		/* Otherwise, add the retained iomap and store this one. */
+		error = iomap_swapfile_add_extent(isi);
+		if (error)
+			return error;
+		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+	}
+out:
+	return count;
+err:
+	pr_err("swapon: file cannot be used for swap\n");
+	return -EINVAL;
+}
+
+/*
+ * Iterate a swap file's iomaps to construct physical extents that can be
+ * passed to the swapfile subsystem.
+ */
+int iomap_swapfile_activate(struct swap_info_struct *sis,
+		struct file *swap_file, sector_t *pagespan,
+		const struct iomap_ops *ops)
+{
+	struct iomap_swapfile_info isi = {
+		.sis = sis,
+		.lowest_ppage = (sector_t)-1ULL,
+	};
+	struct address_space *mapping = swap_file->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos = 0;
+	loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
+	loff_t ret;
+
+	ret = filemap_write_and_wait(inode->i_mapping);
+	if (ret)
+		return ret;
+
+	while (len > 0) {
+		ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
+				ops, &isi, iomap_swapfile_activate_actor);
+		if (ret <= 0)
+			return ret;
+
+		pos += ret;
+		len -= ret;
+	}
+
+	if (isi.iomap.length) {
+		ret = iomap_swapfile_add_extent(&isi);
+		if (ret)
+			return ret;
+	}
+
+	*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
+	sis->max = isi.nr_pages;
+	sis->pages = isi.nr_pages - 1;
+	sis->highest_bit = isi.nr_pages - 1;
+	return isi.nr_extents;
+}
+EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
+#endif /* CONFIG_SWAP */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0ab824f574ed..80de476cecf8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1475,6 +1475,16 @@ xfs_vm_set_page_dirty(
 	return newly_dirty;
 }
 
+static int
+xfs_iomap_swapfile_activate(
+	struct swap_info_struct		*sis,
+	struct file			*swap_file,
+	sector_t			*span)
+{
+	sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
+	return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
+}
+
 const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
@@ -1488,6 +1498,7 @@ const struct address_space_operations xfs_address_space_operations = {
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
+	.swap_activate		= xfs_iomap_swapfile_activate,
 };
 
 const struct address_space_operations xfs_dax_aops = {
@@ -1495,4 +1506,5 @@ const struct address_space_operations xfs_dax_aops = {
 	.direct_IO		= noop_direct_IO,
 	.set_page_dirty		= noop_set_page_dirty,
 	.invalidatepage		= noop_invalidatepage,
+	.swap_activate		= xfs_iomap_swapfile_activate,
 };
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 19a07de28212..4bd87294219a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -106,4 +106,15 @@ typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret,
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, iomap_dio_end_io_t end_io);
 
+#ifdef CONFIG_SWAP
+struct file;
+struct swap_info_struct;
+
+int iomap_swapfile_activate(struct swap_info_struct *sis,
+		struct file *swap_file, sector_t *pagespan,
+		const struct iomap_ops *ops);
+#else
+# define iomap_swapfile_activate(sis, swapfile, pagespan, ops)	(-EIO)
+#endif /* CONFIG_SWAP */
+
 #endif /* LINUX_IOMAP_H */

From 4e529339af15226a30e0ca044aa2d78ba3518494 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Thu, 10 May 2018 09:35:42 -0700
Subject: [PATCH 061/121] xfs: factor out nodiscard helpers

The changes to skip discards of speculative preallocation and
unwritten extents introduced several new wrapper functions through
the bunmapi -> extent free codepath to reduce churn in all of the
associated callers. In several cases, these wrappers simply toggle a
single flag to skip or not skip discards for the resulting blocks.

The explicit _nodiscard() wrappers for such an isolated set of
callers is a bit overkill. Kill off these wrappers and replace with
the calls to the underlying functions in the contexts that need to
control discard behavior. Retain the wrappers that preserve the
original calling conventions to serve the original purpose of
reducing code churn.

This is a refactoring patch and does not change behavior.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_alloc.h  | 11 -----------
 fs/xfs/libxfs/xfs_bmap.c   | 13 ++++---------
 fs/xfs/libxfs/xfs_bmap.h   | 11 -----------
 fs/xfs/xfs_bmap_util.c     |  4 ++--
 fs/xfs/xfs_inode.c         |  9 +++------
 fs/xfs/xfs_inode.h         | 16 +++-------------
 fs/xfs/xfs_trans_extfree.c |  9 ++-------
 7 files changed, 14 insertions(+), 59 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 1dcac78586b4..46d48c6f83b7 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -210,17 +210,6 @@ xfs_free_extent(
 	return __xfs_free_extent(tp, bno, len, oinfo, type, false);
 }
 
-static inline int
-xfs_free_extent_nodiscard(
-	struct xfs_trans	*tp,
-	xfs_fsblock_t		bno,
-	xfs_extlen_t		len,
-	struct xfs_owner_info	*oinfo,
-	enum xfs_ag_resv_type	type)
-{
-	return __xfs_free_extent(tp, bno, len, oinfo, type, true);
-}
-
 int				/* error */
 xfs_alloc_lookup_le(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c7ea3d6f89f1..0fd051064ff0 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5116,15 +5116,10 @@ xfs_bmap_del_extent_real(
 			if (error)
 				goto done;
 		} else {
-			if ((bflags & XFS_BMAPI_NODISCARD) ||
-			    (del->br_state == XFS_EXT_UNWRITTEN)) {
-				xfs_bmap_add_free_nodiscard(mp, dfops,
-					del->br_startblock, del->br_blockcount,
-					NULL);
-			} else {
-				xfs_bmap_add_free(mp, dfops, del->br_startblock,
-					del->br_blockcount, NULL);
-			}
+			__xfs_bmap_add_free(mp, dfops, del->br_startblock,
+					del->br_blockcount, NULL,
+					(bflags & XFS_BMAPI_NODISCARD) ||
+					del->br_state == XFS_EXT_UNWRITTEN);
 		}
 	}
 
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index c46b73d1cb5b..6046012674c8 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -260,17 +260,6 @@ xfs_bmap_add_free(
 	__xfs_bmap_add_free(mp, dfops, bno, len, oinfo, false);
 }
 
-static inline void
-xfs_bmap_add_free_nodiscard(
-	struct xfs_mount		*mp,
-	struct xfs_defer_ops		*dfops,
-	xfs_fsblock_t			bno,
-	xfs_filblks_t			len,
-	struct xfs_owner_info		*oinfo)
-{
-	__xfs_bmap_add_free(mp, dfops, bno, len, oinfo, true);
-}
-
 enum xfs_bmap_intent_type {
 	XFS_BMAP_MAP = 1,
 	XFS_BMAP_UNMAP,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 518627c1b412..06badcbadeb4 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -871,8 +871,8 @@ xfs_free_eofblocks(
 		 * contents of the file are flushed to disk then the files
 		 * may be full of holes (ie NULL files bug).
 		 */
-		error = xfs_itruncate_extents_nodiscard(&tp, ip, XFS_DATA_FORK,
-							XFS_ISIZE(ip));
+		error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
+					XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
 		if (error) {
 			/*
 			 * If we get an error at this point we simply don't
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 74d5cbee8a71..05207a64dd53 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1548,12 +1548,12 @@ xfs_itruncate_clear_reflink_flags(
  * dirty on error so that transactions can be easily aborted if possible.
  */
 int
-__xfs_itruncate_extents(
+xfs_itruncate_extents_flags(
 	struct xfs_trans	**tpp,
 	struct xfs_inode	*ip,
 	int			whichfork,
 	xfs_fsize_t		new_size,
-	bool			skip_discard)
+	int			flags)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp = *tpp;
@@ -1564,7 +1564,6 @@ __xfs_itruncate_extents(
 	xfs_filblks_t		unmap_len;
 	int			error = 0;
 	int			done = 0;
-	int			flags;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
@@ -1577,9 +1576,7 @@ __xfs_itruncate_extents(
 
 	trace_xfs_itruncate_extents_start(ip, new_size);
 
-	flags = xfs_bmapi_aflag(whichfork);
-	if (skip_discard)
-		flags |= XFS_BMAPI_NODISCARD;
+	flags |= xfs_bmapi_aflag(whichfork);
 
 	/*
 	 * Since it is possible for space to become allocated beyond
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 3dd3201f4409..00fee6824745 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -415,8 +415,8 @@ uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
 uint		xfs_ip2xflags(struct xfs_inode *);
 int		xfs_ifree(struct xfs_trans *, xfs_inode_t *,
 			   struct xfs_defer_ops *);
-int		__xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
-				        int, xfs_fsize_t, bool);
+int		xfs_itruncate_extents_flags(struct xfs_trans **,
+				struct xfs_inode *, int, xfs_fsize_t, int);
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
 
 void		xfs_iunpin_wait(xfs_inode_t *);
@@ -440,17 +440,7 @@ xfs_itruncate_extents(
 	int			whichfork,
 	xfs_fsize_t		new_size)
 {
-	return __xfs_itruncate_extents(tpp, ip, whichfork, new_size, false);
-}
-
-static inline int
-xfs_itruncate_extents_nodiscard(
-	struct xfs_trans	**tpp,
-	struct xfs_inode	*ip,
-	int			whichfork,
-	xfs_fsize_t		new_size)
-{
-	return __xfs_itruncate_extents(tpp, ip, whichfork, new_size, true);
+	return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0);
 }
 
 /* from xfs_file.c */
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 28a1d5a0467c..2f44a08bdf65 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -80,13 +80,8 @@ xfs_trans_free_extent(
 
 	trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
 
-	if (skip_discard)
-		error = xfs_free_extent_nodiscard(tp, start_block, ext_len,
-						  oinfo, XFS_AG_RESV_NONE);
-	else
-		error = xfs_free_extent(tp, start_block, ext_len, oinfo,
-					XFS_AG_RESV_NONE);
-
+	error = __xfs_free_extent(tp, start_block, ext_len,
+				  oinfo, XFS_AG_RESV_NONE, skip_discard);
 	/*
 	 * Mark the transaction dirty, even on error. This ensures the
 	 * transaction is aborted, which:

From dae5cd811852c92072f89b36242981e05f856d5b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 10 May 2018 21:50:23 -0700
Subject: [PATCH 062/121] xfs: add mount delay debug option

Similar to log_recovery_delay, this delay occurs between the VFS
superblock being initialised and the xfs_mount being fully
initialised. It also poisons the per-ag radix tree node so that it
can be used for triggering shrinker races during mount
such as the following:

<run memory pressure workload in background>

$ cat dirty-mount.sh
#! /bin/bash

umount -f /dev/pmem0
mkfs.xfs -f /dev/pmem0
mount /dev/pmem0 /mnt/test
rm -f /mnt/test/foo
xfs_io -fxc "pwrite 0 4k" -c fsync -c "shutdown" /mnt/test/foo
umount /dev/pmem0

# let's crash it now!
echo 30 > /sys/fs/xfs/debug/mount_delay
mount /dev/pmem0 /mnt/test
echo 0 > /sys/fs/xfs/debug/mount_delay
umount /dev/pmem0
$ sudo ./dirty-mount.sh
.....
[   60.378118] CPU: 3 PID: 3577 Comm: fs_mark Tainted: G      D W        4.16.0-rc5-dgc #440
[   60.378120] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[   60.378124] RIP: 0010:radix_tree_next_chunk+0x76/0x320
[   60.378127] RSP: 0018:ffffc9000276f4f8 EFLAGS: 00010282
[   60.383670] RAX: a5a5a5a5a5a5a5a4 RBX: 0000000000000010 RCX: 000000000000001a
[   60.385277] RDX: 0000000000000000 RSI: ffffc9000276f540 RDI: 0000000000000000
[   60.386554] RBP: 0000000000000000 R08: 0000000000000000 R09: a5a5a5a5a5a5a5a5
[   60.388194] R10: 0000000000000006 R11: 0000000000000001 R12: ffffc9000276f598
[   60.389288] R13: 0000000000000040 R14: 0000000000000228 R15: ffff880816cd6458
[   60.390827] FS:  00007f5c124b9740(0000) GS:ffff88083fc00000(0000) knlGS:0000000000000000
[   60.392253] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   60.393423] CR2: 00007f5c11bba0b8 CR3: 000000035580e001 CR4: 00000000000606e0
[   60.394519] Call Trace:
[   60.395252]  radix_tree_gang_lookup_tag+0xc4/0x130
[   60.395948]  xfs_perag_get_tag+0x37/0xf0
[   60.396522]  xfs_reclaim_inodes_count+0x32/0x40
[   60.397178]  xfs_fs_nr_cached_objects+0x11/0x20
[   60.397837]  super_cache_count+0x35/0xc0
[   60.399159]  shrink_slab.part.66+0xb1/0x370
[   60.400194]  shrink_node+0x7e/0x1a0
[   60.401058]  try_to_free_pages+0x199/0x470
[   60.402081]  __alloc_pages_slowpath+0x3a1/0xd20
[   60.403729]  __alloc_pages_nodemask+0x1c3/0x200
[   60.404941]  cache_grow_begin+0x20b/0x2e0
[   60.406164]  fallback_alloc+0x160/0x200
[   60.407088]  kmem_cache_alloc+0x111/0x4e0
[   60.408038]  ? xfs_buf_rele+0x61/0x430
[   60.408925]  kmem_zone_alloc+0x61/0xe0
[   60.409965]  xfs_inode_alloc+0x24/0x1d0
.....


Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_globals.c |  1 +
 fs/xfs/xfs_super.c   | 11 +++++++++++
 fs/xfs/xfs_sysctl.h  |  1 +
 fs/xfs/xfs_sysfs.c   | 31 +++++++++++++++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 3e1cc3001bcb..fdde17a2333c 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -47,6 +47,7 @@ xfs_param_t xfs_params = {
 
 struct xfs_globals xfs_globals = {
 	.log_recovery_delay	=	0,	/* no delay by default */
+	.mount_delay		=	0,	/* no delay by default */
 #ifdef XFS_ASSERT_FATAL
 	.bug_on_assert		=	true,	/* assert failures BUG() */
 #else
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 84aefc81c18e..f19fe291356e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1635,6 +1635,17 @@ xfs_fs_fill_super(
 #endif
 	sb->s_op = &xfs_super_operations;
 
+	/*
+	 * Delay mount work if the debug hook is set. This is debug
+	 * instrumention to coordinate simulation of xfs mount failures with
+	 * VFS superblock operations
+	 */
+	if (xfs_globals.mount_delay) {
+		xfs_notice(mp, "Delaying mount for %d seconds.",
+			xfs_globals.mount_delay);
+		msleep(xfs_globals.mount_delay * 1000);
+	}
+
 	if (silent)
 		flags |= XFS_MFSI_QUIET;
 
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 82afee005140..b53a33e69932 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -95,6 +95,7 @@ extern xfs_param_t	xfs_params;
 
 struct xfs_globals {
 	int	log_recovery_delay;	/* log recovery delay (secs) */
+	int	mount_delay;		/* mount setup delay (secs) */
 	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
 };
 extern struct xfs_globals	xfs_globals;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 8b2ccc234f36..2d5cd2529f8e 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -165,9 +165,40 @@ log_recovery_delay_show(
 }
 XFS_SYSFS_ATTR_RW(log_recovery_delay);
 
+STATIC ssize_t
+mount_delay_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < 0 || val > 60)
+		return -EINVAL;
+
+	xfs_globals.mount_delay = val;
+
+	return count;
+}
+
+STATIC ssize_t
+mount_delay_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay);
+}
+XFS_SYSFS_ATTR_RW(mount_delay);
+
 static struct attribute *xfs_dbg_attrs[] = {
 	ATTR_LIST(bug_on_assert),
 	ATTR_LIST(log_recovery_delay),
+	ATTR_LIST(mount_delay),
 	NULL,
 };
 

From c9fbd7bbc23dbdd73364be4d045e5d3612cf6e82 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 10 May 2018 21:50:23 -0700
Subject: [PATCH 063/121] xfs: clear sb->s_fs_info on mount failure

We recently had an oops reported on a 4.14 kernel in
xfs_reclaim_inodes_count() where sb->s_fs_info pointed to garbage
and so the m_perag_tree lookup walked into lala land.

Essentially, the machine was under memory pressure when the mount
was being run, xfs_fs_fill_super() failed after allocating the
xfs_mount and attaching it to sb->s_fs_info. It then cleaned up and
freed the xfs_mount, but the sb->s_fs_info field still pointed to
the freed memory. Hence when the superblock shrinker then ran
it fell off the bad pointer.

With the superblock shrinker problem fixed at teh VFS level, this
stale s_fs_info pointer is still a problem - we use it
unconditionally in ->put_super when the superblock is being torn
down, and hence we can still trip over it after a ->fill_super
call failure. Hence we need to clear s_fs_info if
xfs-fs_fill_super() fails, and we need to check if it's valid in
the places it can potentially be dereferenced after a ->fill_super
failure.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_super.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f19fe291356e..39e5ec3d407f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1772,6 +1772,7 @@ xfs_fs_fill_super(
  out_close_devices:
 	xfs_close_devices(mp);
  out_free_fsname:
+	sb->s_fs_info = NULL;
 	xfs_free_fsname(mp);
 	kfree(mp);
  out:
@@ -1789,6 +1790,10 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	/* if ->fill_super failed, we have no mount to tear down */
+	if (!sb->s_fs_info)
+		return;
+
 	xfs_notice(mp, "Unmounting Filesystem");
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
@@ -1798,6 +1803,8 @@ xfs_fs_put_super(
 	xfs_destroy_percpu_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
 	xfs_close_devices(mp);
+
+	sb->s_fs_info = NULL;
 	xfs_free_fsname(mp);
 	kfree(mp);
 }
@@ -1817,6 +1824,9 @@ xfs_fs_nr_cached_objects(
 	struct super_block	*sb,
 	struct shrink_control	*sc)
 {
+	/* Paranoia: catch incorrect calls during mount setup or teardown */
+	if (WARN_ON_ONCE(!sb->s_fs_info))
+		return 0;
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
 

From 8389f3ffa22a119b37dc7c2217cd2862bb2ed9da Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:31 -0700
Subject: [PATCH 064/121] xfs: skip scrub xref if corruption already noted

Don't bother looking for cross-referencing problems if the metadata is
already corrupt or we've already found a cross-referencing problem.
Since we added a helper function for flags testing, convert existing
users to use it.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/scrub/alloc.c    |  4 ++--
 fs/xfs/scrub/bmap.c     |  2 +-
 fs/xfs/scrub/common.c   |  4 ++++
 fs/xfs/scrub/common.h   | 10 ++++++++++
 fs/xfs/scrub/ialloc.c   |  7 ++++---
 fs/xfs/scrub/inode.c    |  5 ++++-
 fs/xfs/scrub/refcount.c |  8 ++++----
 fs/xfs/scrub/rmap.c     |  6 +++---
 fs/xfs/scrub/rtbitmap.c |  3 +++
 fs/xfs/scrub/scrub.c    |  4 ++--
 10 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 517c079d3f68..941a0a55224e 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -70,7 +70,7 @@ xfs_scrub_allocbt_xref_other(
 		pcur = &sc->sa.cnt_cur;
 	else
 		pcur = &sc->sa.bno_cur;
-	if (!*pcur)
+	if (!*pcur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec);
@@ -172,7 +172,7 @@ xfs_scrub_xref_is_used_space(
 	bool				is_freesp;
 	int				error;
 
-	if (!sc->sa.bno_cur)
+	if (!sc->sa.bno_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp);
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 3f8fd10160f0..e5a4611abf86 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -174,7 +174,7 @@ xfs_scrub_bmap_xref_rmap(
 	unsigned long long		rmap_end;
 	uint64_t			owner;
 
-	if (!info->sc->sa.rmap_cur)
+	if (!info->sc->sa.rmap_cur || xfs_scrub_skip_xref(info->sc->sm))
 		return;
 
 	if (info->whichfork == XFS_COW_FORK)
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 95625aa90c24..2ac8576c4670 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -737,6 +737,10 @@ xfs_scrub_should_check_xref(
 	int				*error,
 	struct xfs_btree_cur		**curpp)
 {
+	/* No point in xref if we already know we're corrupt. */
+	if (xfs_scrub_skip_xref(sc->sm))
+		return false;
+
 	if (*error == 0)
 		return true;
 
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index c95c30c986b7..a23ad7fa2b6c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -145,4 +145,14 @@ int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc,
 				   struct xfs_inode *ip, unsigned int resblks);
 void xfs_scrub_buffer_recheck(struct xfs_scrub_context *sc, struct xfs_buf *bp);
 
+/*
+ * Don't bother cross-referencing if we already found corruption or cross
+ * referencing discrepancies.
+ */
+static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm)
+{
+	return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+			       XFS_SCRUB_OFLAG_XCORRUPT);
+}
+
 #endif	/* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 106ca4bd753f..00a834d3b56d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -387,7 +387,8 @@ xfs_scrub_iallocbt_xref_rmap_btreeblks(
 	int				error;
 
 	if (!sc->sa.ino_cur || !sc->sa.rmap_cur ||
-	    (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur))
+	    (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) ||
+	    xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	/* Check that we saw as many inobt blocks as the rmap says. */
@@ -424,7 +425,7 @@ xfs_scrub_iallocbt_xref_rmap_inodes(
 	xfs_filblks_t			blocks;
 	int				error;
 
-	if (!sc->sa.rmap_cur)
+	if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	/* Check that we saw as many inode blocks as the rmap knows about. */
@@ -496,7 +497,7 @@ xfs_scrub_xref_inode_check(
 	bool				has_inodes;
 	int				error;
 
-	if (!(*icur))
+	if (!(*icur) || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes);
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index e15b1bc5053f..550c0cf70a92 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -448,7 +448,7 @@ xfs_scrub_inode_xref_finobt(
 	int				has_record;
 	int				error;
 
-	if (!sc->sa.fino_cur)
+	if (!sc->sa.fino_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	agino = XFS_INO_TO_AGINO(sc->mp, ino);
@@ -491,6 +491,9 @@ xfs_scrub_inode_xref_bmap(
 	xfs_filblks_t			acount;
 	int				error;
 
+	if (xfs_scrub_skip_xref(sc->sm))
+		return;
+
 	/* Walk all the extents to check nextents/naextents/nblocks. */
 	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
 			&nextents, &count);
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index d86526d2932c..324a5f159145 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -310,7 +310,7 @@ xfs_scrub_refcountbt_xref_rmap(
 	struct xfs_scrub_refcnt_frag	*n;
 	int				error;
 
-	if (!sc->sa.rmap_cur)
+	if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	/* Cross-reference with the rmapbt to confirm the refcount. */
@@ -404,7 +404,7 @@ xfs_scrub_refcount_xref_rmap(
 	xfs_filblks_t			blocks;
 	int				error;
 
-	if (!sc->sa.rmap_cur)
+	if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	/* Check that we saw as many refcbt blocks as the rmap knows about. */
@@ -460,7 +460,7 @@ xfs_scrub_xref_is_cow_staging(
 	int				has_refcount;
 	int				error;
 
-	if (!sc->sa.refc_cur)
+	if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	/* Find the CoW staging extent. */
@@ -504,7 +504,7 @@ xfs_scrub_xref_is_not_shared(
 	bool				shared;
 	int				error;
 
-	if (!sc->sa.refc_cur)
+	if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared);
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 8f2a7c3ff455..b376a9a77c04 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -66,7 +66,7 @@ xfs_scrub_rmapbt_xref_refc(
 	bool				is_unwritten;
 	int				error;
 
-	if (!sc->sa.refc_cur)
+	if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
@@ -207,7 +207,7 @@ xfs_scrub_xref_check_owner(
 	bool				has_rmap;
 	int				error;
 
-	if (!sc->sa.rmap_cur)
+	if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo,
@@ -250,7 +250,7 @@ xfs_scrub_xref_has_no_owner(
 	bool				has_rmap;
 	int				error;
 
-	if (!sc->sa.rmap_cur)
+	if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
 		return;
 
 	error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap);
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 39c41dfe08ee..8b048f107af2 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -110,6 +110,9 @@ xfs_scrub_xref_is_used_rt_space(
 	bool				is_free;
 	int				error;
 
+	if (xfs_scrub_skip_xref(sc->sm))
+		return;
+
 	xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
 	error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len,
 			&is_free);
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 67509d8b8b8e..ad6eda4fa123 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -450,8 +450,8 @@ retry_op:
 	} else if (error)
 		goto out_teardown;
 
-	if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
-			       XFS_SCRUB_OFLAG_XCORRUPT))
+	if (sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+			    XFS_SCRUB_OFLAG_XCORRUPT))
 		xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
 
 out_teardown:

From 689e11c84b15866619e7582486acacaf79d7e3e2 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:31 -0700
Subject: [PATCH 065/121] xfs: superblock scrub should use short-lived buffers

Secondary superblocks are rarely used, so create a helper to read a
given non-primary AG's superblock and ensure that it won't stick around
hogging memory.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_sb.c     | 22 ++++++++++++++++++++++
 fs/xfs/libxfs/xfs_sb.h     |  3 +++
 fs/xfs/libxfs/xfs_shared.h |  1 +
 fs/xfs/scrub/agheader.c    |  4 +---
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index d9b94bd5f689..d9bef41a3f26 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -972,3 +972,25 @@ xfs_fs_geometry(
 
 	return 0;
 }
+
+/* Read a secondary superblock. */
+int
+xfs_sb_read_secondary(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_buf		*bp;
+	int			error;
+
+	ASSERT(agno != 0 && agno != NULLAGNUMBER);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+	if (error)
+		return error;
+	xfs_buf_set_ref(bp, XFS_SSB_REF);
+	*bpp = bp;
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 63dcd2a1a657..5166d78b2c34 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -37,5 +37,8 @@ extern void	xfs_sb_quota_from_disk(struct xfs_sb *sbp);
 #define XFS_FS_GEOM_MAX_STRUCT_VER	(4)
 extern int	xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
 				int struct_version);
+extern int	xfs_sb_read_secondary(struct xfs_mount *mp,
+				struct xfs_trans *tp, xfs_agnumber_t agno,
+				struct xfs_buf **bpp);
 
 #endif	/* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 8efc06e62b13..ae99c260adb1 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -112,6 +112,7 @@ void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
 #define	XFS_ATTR_BTREE_REF	1
 #define	XFS_DQUOT_REF		1
 #define	XFS_REFC_BTREE_REF	1
+#define	XFS_SSB_REF		0
 
 /*
  * Flags for xfs_trans_ichgtime().
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 08a1f013d92c..831acc0a328f 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -157,9 +157,7 @@ xfs_scrub_superblock(
 	if (agno == 0)
 		return 0;
 
-	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
-		  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-		  XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+	error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp);
 	/*
 	 * The superblock verifier can return several different error codes
 	 * if it thinks the superblock doesn't look right.  For a mount these

From eac69e167665d342b741986e9d0b53a0a26a6608 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:32 -0700
Subject: [PATCH 066/121] xfs: refactor quota limits initialization

Replace all the if (!error) weirdness with helper functions that follow
our regular coding practices, and factor out the ternary expression soup.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_qm.c | 142 ++++++++++++++++++++++++++----------------------
 1 file changed, 78 insertions(+), 64 deletions(-)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f927b7d72db1..c3e014bfc848 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -561,26 +561,88 @@ xfs_qm_set_defquota(
 {
 	xfs_dquot_t		*dqp;
 	struct xfs_def_quota    *defq;
+	struct xfs_disk_dquot	*ddqp;
 	int			error;
 
 	error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
-	if (!error) {
-		xfs_disk_dquot_t        *ddqp = &dqp->q_core;
+	if (error)
+		return;
 
-		defq = xfs_get_defquota(dqp, qinf);
+	ddqp = &dqp->q_core;
+	defq = xfs_get_defquota(dqp, qinf);
 
-		/*
-		 * Timers and warnings have been already set, let's just set the
-		 * default limits for this quota type
-		 */
-		defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
-		defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
-		defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
-		defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
-		defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
-		defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
-		xfs_qm_dqdestroy(dqp);
-	}
+	/*
+	 * Timers and warnings have been already set, let's just set the
+	 * default limits for this quota type
+	 */
+	defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+	defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+	defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+	defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+	defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+	defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+	xfs_qm_dqdestroy(dqp);
+}
+
+/* Initialize quota time limits from the root dquot. */
+static void
+xfs_qm_init_timelimits(
+	struct xfs_mount	*mp,
+	struct xfs_quotainfo	*qinf)
+{
+	struct xfs_disk_dquot	*ddqp;
+	struct xfs_dquot	*dqp;
+	uint			type;
+	int			error;
+
+	qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
+	qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
+	qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
+	qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
+	qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
+	qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+
+	/*
+	 * We try to get the limits from the superuser's limits fields.
+	 * This is quite hacky, but it is standard quota practice.
+	 *
+	 * Since we may not have done a quotacheck by this point, just read
+	 * the dquot without attaching it to any hashtables or lists.
+	 *
+	 * Timers and warnings are globally set by the first timer found in
+	 * user/group/proj quota types, otherwise a default value is used.
+	 * This should be split into different fields per quota type.
+	 */
+	if (XFS_IS_UQUOTA_RUNNING(mp))
+		type = XFS_DQ_USER;
+	else if (XFS_IS_GQUOTA_RUNNING(mp))
+		type = XFS_DQ_GROUP;
+	else
+		type = XFS_DQ_PROJ;
+	error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
+	if (error)
+		return;
+
+	ddqp = &dqp->q_core;
+	/*
+	 * The warnings and timers set the grace period given to
+	 * a user or group before he or she can not perform any
+	 * more writing. If it is zero, a default is used.
+	 */
+	if (ddqp->d_btimer)
+		qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer);
+	if (ddqp->d_itimer)
+		qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer);
+	if (ddqp->d_rtbtimer)
+		qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer);
+	if (ddqp->d_bwarns)
+		qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns);
+	if (ddqp->d_iwarns)
+		qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns);
+	if (ddqp->d_rtbwarns)
+		qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns);
+
+	xfs_qm_dqdestroy(dqp);
 }
 
 /*
@@ -592,8 +654,6 @@ xfs_qm_init_quotainfo(
 	struct xfs_mount	*mp)
 {
 	struct xfs_quotainfo	*qinf;
-	struct xfs_dquot	*dqp;
-	uint			type;
 	int			error;
 
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -626,53 +686,7 @@ xfs_qm_init_quotainfo(
 
 	mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
 
-	/*
-	 * We try to get the limits from the superuser's limits fields.
-	 * This is quite hacky, but it is standard quota practice.
-	 *
-	 * Since we may not have done a quotacheck by this point, just read
-	 * the dquot without attaching it to any hashtables or lists.
-	 *
-	 * Timers and warnings are globally set by the first timer found in
-	 * user/group/proj quota types, otherwise a default value is used.
-	 * This should be split into different fields per quota type.
-	 */
-	if (XFS_IS_UQUOTA_RUNNING(mp))
-		type = XFS_DQ_USER;
-	else if (XFS_IS_GQUOTA_RUNNING(mp))
-		type = XFS_DQ_GROUP;
-	else
-		type = XFS_DQ_PROJ;
-	error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
-	if (!error) {
-		xfs_disk_dquot_t	*ddqp = &dqp->q_core;
-
-		/*
-		 * The warnings and timers set the grace period given to
-		 * a user or group before he or she can not perform any
-		 * more writing. If it is zero, a default is used.
-		 */
-		qinf->qi_btimelimit = ddqp->d_btimer ?
-			be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
-		qinf->qi_itimelimit = ddqp->d_itimer ?
-			be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
-		qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
-			be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
-		qinf->qi_bwarnlimit = ddqp->d_bwarns ?
-			be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
-		qinf->qi_iwarnlimit = ddqp->d_iwarns ?
-			be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
-		qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
-			be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
-		xfs_qm_dqdestroy(dqp);
-	} else {
-		qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
-		qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
-		qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
-		qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
-		qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
-		qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
-	}
+	xfs_qm_init_timelimits(mp, qinf);
 
 	if (XFS_IS_UQUOTA_RUNNING(mp))
 		xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);

From 8bc763c24de745608679b128e2e5e25c5070f7d3 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:32 -0700
Subject: [PATCH 067/121] xfs: don't continue scrub if already corrupt

If we've already decided that something is corrupt, we might as well
abort all the loops and exit as quickly as possible.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/scrub/attr.c   |  3 ++-
 fs/xfs/scrub/bmap.c   |  3 ++-
 fs/xfs/scrub/dir.c    | 35 +++++++++++++++++++++++++++--------
 fs/xfs/scrub/parent.c |  3 +++
 4 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 127575f0abfb..84b6d6b66578 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -126,8 +126,9 @@ xfs_scrub_xattr_listent(
 	if (args.valuelen != valuelen)
 		xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
 					     args.blkno);
-
 fail_xref:
+	if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		context->seen_enough = 1;
 	return;
 }
 
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index e5a4611abf86..42a115e83739 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -683,7 +683,8 @@ xfs_scrub_bmap(
 	info.lastoff = 0;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	for_each_xfs_iext(ifp, &icur, &irec) {
-		if (xfs_scrub_should_terminate(sc, &error))
+		if (xfs_scrub_should_terminate(sc, &error) ||
+		    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
 			break;
 		if (isnullstartblock(irec.br_startblock))
 			continue;
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 38f29806eb54..1a4309b3e786 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -172,7 +172,7 @@ xfs_scrub_dir_actor(
 	error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
 	if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
 			&error))
-		goto fail_xref;
+		goto out;
 	if (lookup_ino != ino) {
 		xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
 		goto out;
@@ -183,8 +183,13 @@ xfs_scrub_dir_actor(
 	if (error)
 		goto out;
 out:
-	return error;
-fail_xref:
+	/*
+	 * A negative error code returned here is supposed to cause the
+	 * dir_emit caller (xfs_readdir) to abort the directory iteration
+	 * and return zero to xfs_scrub_directory.
+	 */
+	if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return -EFSCORRUPTED;
 	return error;
 }
 
@@ -240,6 +245,9 @@ xfs_scrub_dir_rec(
 	}
 	xfs_scrub_buffer_recheck(ds->sc, bp);
 
+	if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out_relse;
+
 	dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off);
 
 	/* Make sure we got a real directory entry. */
@@ -357,6 +365,9 @@ xfs_scrub_directory_data_bestfree(
 
 	/* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */
 
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out_buf;
+
 	/* Do the bestfrees correspond to actual free space? */
 	bf = d_ops->data_bestfree_p(bp->b_addr);
 	smallest_bestfree = UINT_MAX;
@@ -413,14 +424,18 @@ xfs_scrub_directory_data_bestfree(
 
 		/* Spot check this free entry */
 		tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
-		if (tag != ((char *)dup - (char *)bp->b_addr))
+		if (tag != ((char *)dup - (char *)bp->b_addr)) {
 			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+			goto out_buf;
+		}
 
 		/*
 		 * Either this entry is a bestfree or it's smaller than
 		 * any of the bestfrees.
 		 */
 		xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup);
+		if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+			goto out_buf;
 
 		/* Move on. */
 		newlen = be16_to_cpu(dup->length);
@@ -546,6 +561,8 @@ xfs_scrub_directory_leaf1_bestfree(
 	}
 	if (leafhdr.stale != stale)
 		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out;
 
 	/* Check all the bestfree entries. */
 	for (i = 0; i < bestcount; i++, bestp++) {
@@ -556,9 +573,11 @@ xfs_scrub_directory_leaf1_bestfree(
 				i * args->geo->fsbcount, -1, &dbp);
 		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
 				&error))
-			continue;
+			break;
 		xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
 		xfs_trans_brelse(sc->tp, dbp);
+		if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+			goto out;
 	}
 out:
 	return error;
@@ -607,7 +626,7 @@ xfs_scrub_directory_free_bestfree(
 				-1, &dbp);
 		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
 				&error))
-			continue;
+			break;
 		xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
 		xfs_trans_brelse(sc->tp, dbp);
 	}
@@ -656,7 +675,7 @@ xfs_scrub_directory_blocks(
 
 	/* Iterate all the data extents in the directory... */
 	found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
-	while (found) {
+	while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
 		/* Block directories only have a single block at offset 0. */
 		if (is_block &&
 		    (got.br_startoff > 0 ||
@@ -719,7 +738,7 @@ xfs_scrub_directory_blocks(
 	/* Scan for free blocks */
 	lblk = free_lblk;
 	found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
-	while (found) {
+	while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
 		/*
 		 * Dirs can't have blocks mapped above 2^32.
 		 * Single-block dirs shouldn't even be here.
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 1fb88c18d455..fc336807e156 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -147,6 +147,9 @@ xfs_scrub_parent_validate(
 
 	*try_again = false;
 
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out;
+
 	/* '..' must not point to ourselves. */
 	if (sc->ip->i_ino == dnum) {
 		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);

From 87d9d609c22dbfe3a97deeaa9665eb7c823fcfc1 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:33 -0700
Subject: [PATCH 068/121] xfs: quota scrub should use bmapbtd scrubber

Replace the quota scrubber's open-coded data fork scrubber with a
redirected call to the bmapbtd scrubber.  This strengthens the quota
scrub to include all the cross-referencing that it does.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/scrub/common.c | 57 +++++++++++++++++++++++++++++
 fs/xfs/scrub/common.h |  2 +
 fs/xfs/scrub/quota.c  | 85 +++++++++++++++++++++----------------------
 3 files changed, 100 insertions(+), 44 deletions(-)

diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 2ac8576c4670..62b33c99efe4 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -44,6 +44,8 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_log.h"
 #include "xfs_trans_priv.h"
+#include "xfs_attr.h"
+#include "xfs_reflink.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -787,3 +789,58 @@ xfs_scrub_buffer_recheck(
 	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 	trace_xfs_scrub_block_error(sc, bp->b_bn, fa);
 }
+
+/*
+ * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
+ * pointed to by sc->ip and the ILOCK must be held.
+ */
+int
+xfs_scrub_metadata_inode_forks(
+	struct xfs_scrub_context	*sc)
+{
+	__u32				smtype;
+	bool				shared;
+	int				error;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
+	/* Metadata inodes don't live on the rt device. */
+	if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+		xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+		return 0;
+	}
+
+	/* They should never participate in reflink. */
+	if (xfs_is_reflink_inode(sc->ip)) {
+		xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+		return 0;
+	}
+
+	/* They also should never have extended attributes. */
+	if (xfs_inode_hasattr(sc->ip)) {
+		xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+		return 0;
+	}
+
+	/* Invoke the data fork scrubber. */
+	smtype = sc->sm->sm_type;
+	sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD;
+	error = xfs_scrub_bmap_data(sc);
+	sc->sm->sm_type = smtype;
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	/* Look for incorrect shared blocks. */
+	if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) {
+		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
+				&shared);
+		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0,
+				&error))
+			return error;
+		if (shared)
+			xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+	}
+
+	return error;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index a23ad7fa2b6c..5d78bb9602ab 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -155,4 +155,6 @@ static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm)
 			       XFS_SCRUB_OFLAG_XCORRUPT);
 }
 
+int xfs_scrub_metadata_inode_forks(struct xfs_scrub_context *sc);
+
 #endif	/* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index d3d08978f53a..15ae4d23d6ac 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -206,65 +206,62 @@ xfs_scrub_quota_item(
 	return 0;
 }
 
+/* Check the quota's data fork. */
+STATIC int
+xfs_scrub_quota_data_fork(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_bmbt_irec		irec = { 0 };
+	struct xfs_iext_cursor		icur;
+	struct xfs_quotainfo		*qi = sc->mp->m_quotainfo;
+	struct xfs_ifork		*ifp;
+	xfs_fileoff_t			max_dqid_off;
+	int				error = 0;
+
+	/* Invoke the fork scrubber. */
+	error = xfs_scrub_metadata_inode_forks(sc);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
+	/* Check for data fork problems that apply only to quota files. */
+	max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
+	ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+	for_each_xfs_iext(ifp, &icur, &irec) {
+		if (xfs_scrub_should_terminate(sc, &error))
+			break;
+		/*
+		 * delalloc extents or blocks mapped above the highest
+		 * quota id shouldn't happen.
+		 */
+		if (isnullstartblock(irec.br_startblock) ||
+		    irec.br_startoff > max_dqid_off ||
+		    irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
+			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+					irec.br_startoff);
+			break;
+		}
+	}
+
+	return error;
+}
+
 /* Scrub all of a quota type's items. */
 int
 xfs_scrub_quota(
 	struct xfs_scrub_context	*sc)
 {
-	struct xfs_bmbt_irec		irec = { 0 };
 	struct xfs_scrub_quota_info	sqi;
 	struct xfs_mount		*mp = sc->mp;
 	struct xfs_quotainfo		*qi = mp->m_quotainfo;
-	xfs_fileoff_t			max_dqid_off;
-	xfs_fileoff_t			off = 0;
 	uint				dqtype;
-	int				nimaps;
 	int				error = 0;
 
 	dqtype = xfs_scrub_quota_to_dqtype(sc);
 
 	/* Look for problem extents. */
-	if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
-		xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+	error = xfs_scrub_quota_data_fork(sc);
+	if (error)
 		goto out;
-	}
-	max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
-	while (1) {
-		if (xfs_scrub_should_terminate(sc, &error))
-			break;
-
-		off = irec.br_startoff + irec.br_blockcount;
-		nimaps = 1;
-		error = xfs_bmapi_read(sc->ip, off, -1, &irec, &nimaps,
-				XFS_BMAPI_ENTIRE);
-		if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off,
-				&error))
-			goto out;
-		if (!nimaps)
-			break;
-		if (irec.br_startblock == HOLESTARTBLOCK)
-			continue;
-
-		/* Check the extent record doesn't point to crap. */
-		if (irec.br_startblock + irec.br_blockcount <=
-		    irec.br_startblock)
-			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
-					irec.br_startoff);
-		if (!xfs_verify_fsbno(mp, irec.br_startblock) ||
-		    !xfs_verify_fsbno(mp, irec.br_startblock +
-					irec.br_blockcount - 1))
-			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
-					irec.br_startoff);
-
-		/*
-		 * Unwritten extents or blocks mapped above the highest
-		 * quota id shouldn't happen.
-		 */
-		if (isnullstartblock(irec.br_startblock) ||
-		    irec.br_startoff > max_dqid_off ||
-		    irec.br_startoff + irec.br_blockcount > max_dqid_off + 1)
-			xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
-	}
 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
 		goto out;
 

From 517b32b7fa0e7d89f644651cc5f048e77fd6e91e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:33 -0700
Subject: [PATCH 069/121] xfs: scrub the data fork of the realtime inodes

The realtime bitmap and summary inodes live on the metadata device, so
we can scrub their data forks with the regular scrubbers.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/scrub/rtbitmap.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 8b048f107af2..0fa3ef5c83b8 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -82,6 +82,11 @@ xfs_scrub_rtbitmap(
 {
 	int				error;
 
+	/* Invoke the fork scrubber. */
+	error = xfs_scrub_metadata_inode_forks(sc);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		return error;
+
 	error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc);
 	if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
 		goto out;
@@ -95,8 +100,35 @@ int
 xfs_scrub_rtsummary(
 	struct xfs_scrub_context	*sc)
 {
+	struct xfs_inode		*rsumip = sc->mp->m_rsumip;
+	struct xfs_inode		*old_ip = sc->ip;
+	uint				old_ilock_flags = sc->ilock_flags;
+	int				error = 0;
+
+	/*
+	 * We ILOCK'd the rt bitmap ip in the setup routine, now lock the
+	 * rt summary ip in compliance with the rt inode locking rules.
+	 *
+	 * Since we switch sc->ip to rsumip we have to save the old ilock
+	 * flags so that we don't mix up the inode state that @sc tracks.
+	 */
+	sc->ip = rsumip;
+	sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM;
+	xfs_ilock(sc->ip, sc->ilock_flags);
+
+	/* Invoke the fork scrubber. */
+	error = xfs_scrub_metadata_inode_forks(sc);
+	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+		goto out;
+
 	/* XXX: implement this some day */
-	return -ENOENT;
+	xfs_scrub_set_incomplete(sc);
+out:
+	/* Switch back to the rtbitmap inode and lock flags. */
+	xfs_iunlock(sc->ip, sc->ilock_flags);
+	sc->ilock_flags = old_ilock_flags;
+	sc->ip = old_ip;
+	return error;
 }
 
 
From ddd10c2fe20e7ca6d11ddf84f905edba080b26a7 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:34 -0700
Subject: [PATCH 070/121] xfs: avoid ABBA deadlock when scrubbing parent
 pointers

In normal operation, the XFS convention is to take an inode's iolock
and then allocate a transaction.  However, when scrubbing parent inodes
this is inverted -- we allocated the transaction to do the scrub, and
now we're trying to grab the parent's iolock.  This can lead to ABBA
deadlocks: some thread grabbed the parent's iolock and is waiting for
space for a transaction while our parent scrubber is sitting on a
transaction trying to get the parent's iolock.

Therefore, convert all iolock attempts to use trylock; if that fails,
they can use the existing mechanisms to back off and try again.

The ABBA deadlock didn't happen with a non-repair scrub because the
transactions don't reserve any space, but repair scrubs require
reservation in order to update metadata.  However, any other concurrent
metadata update (e.g. directory create in the parent) could also induce
this deadlock with the parent scrubber.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/scrub/common.c | 22 ++++++++++++++++++++++
 fs/xfs/scrub/common.h |  1 +
 fs/xfs/scrub/parent.c | 16 ++++++++++++++--
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 62b33c99efe4..518bff2be0c9 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -844,3 +844,25 @@ xfs_scrub_metadata_inode_forks(
 
 	return error;
 }
+
+/*
+ * Try to lock an inode in violation of the usual locking order rules.  For
+ * example, trying to get the IOLOCK while in transaction context, or just
+ * plain breaking AG-order or inode-order inode locking rules.  Either way,
+ * the only way to avoid an ABBA deadlock is to use trylock and back off if
+ * we can't.
+ */
+int
+xfs_scrub_ilock_inverted(
+	struct xfs_inode	*ip,
+	uint			lock_mode)
+{
+	int			i;
+
+	for (i = 0; i < 20; i++) {
+		if (xfs_ilock_nowait(ip, lock_mode))
+			return 0;
+		delay(1);
+	}
+	return -EDEADLOCK;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 5d78bb9602ab..119d9b6db887 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -156,5 +156,6 @@ static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm)
 }
 
 int xfs_scrub_metadata_inode_forks(struct xfs_scrub_context *sc);
+int xfs_scrub_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
 
 #endif	/* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index fc336807e156..77c6b22c6bfd 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -214,7 +214,9 @@ xfs_scrub_parent_validate(
 	 */
 	xfs_iunlock(sc->ip, sc->ilock_flags);
 	sc->ilock_flags = 0;
-	xfs_ilock(dp, XFS_IOLOCK_SHARED);
+	error = xfs_scrub_ilock_inverted(dp, XFS_IOLOCK_SHARED);
+	if (error)
+		goto out_rele;
 
 	/* Go looking for our dentry. */
 	error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink);
@@ -223,8 +225,10 @@ xfs_scrub_parent_validate(
 
 	/* Drop the parent lock, relock this inode. */
 	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+	error = xfs_scrub_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL);
+	if (error)
+		goto out_rele;
 	sc->ilock_flags = XFS_IOLOCK_EXCL;
-	xfs_ilock(sc->ip, sc->ilock_flags);
 
 	/*
 	 * If we're an unlinked directory, the parent /won't/ have a link
@@ -326,5 +330,13 @@ xfs_scrub_parent(
 	if (try_again && tries == 20)
 		xfs_scrub_set_incomplete(sc);
 out:
+	/*
+	 * If we failed to lock the parent inode even after a retry, just mark
+	 * this scrub incomplete and return.
+	 */
+	if (sc->try_harder && error == -EDEADLOCK) {
+		error = 0;
+		xfs_scrub_set_incomplete(sc);
+	}
 	return error;
 }

From 9f3a080ef19b1c182a8fb1edbfb707fdb811437c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:34 -0700
Subject: [PATCH 071/121] xfs: hoist xfs_scrub_agfl_walk to libxfs as
 xfs_agfl_walk

This function is basically a generic AGFL block iterator, so promote it
to libxfs ahead of online repair wanting to use it.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_alloc.c | 37 +++++++++++++++++++
 fs/xfs/libxfs/xfs_alloc.h |  5 +++
 fs/xfs/scrub/agheader.c   | 78 +++++++--------------------------------
 fs/xfs/scrub/common.h     |  4 --
 4 files changed, 55 insertions(+), 69 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 5410635893df..dc9dd3805d97 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -3180,3 +3180,40 @@ xfs_alloc_has_record(
 
 	return xfs_btree_has_record(cur, &low, &high, exists);
 }
+
+/*
+ * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
+ * error code or XFS_BTREE_QUERY_RANGE_ABORT.
+ */
+int
+xfs_agfl_walk(
+	struct xfs_mount	*mp,
+	struct xfs_agf		*agf,
+	struct xfs_buf		*agflbp,
+	xfs_agfl_walk_fn	walk_fn,
+	void			*priv)
+{
+	__be32			*agfl_bno;
+	unsigned int		i;
+	int			error;
+
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	i = be32_to_cpu(agf->agf_flfirst);
+
+	/* Nothing to walk in an empty AGFL. */
+	if (agf->agf_flcount == cpu_to_be32(0))
+		return 0;
+
+	/* Otherwise, walk from first to last, wrapping as needed. */
+	for (;;) {
+		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
+		if (error)
+			return error;
+		if (i == be32_to_cpu(agf->agf_fllast))
+			break;
+		if (++i == xfs_agfl_size(mp))
+			i = 0;
+	}
+
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 46d48c6f83b7..0747adcd57d6 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -262,4 +262,9 @@ bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
 int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		xfs_extlen_t len, bool *exist);
 
+typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
+		void *priv);
+int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf,
+		struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv);
+
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 831acc0a328f..1f71793f7db4 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -38,68 +38,6 @@
 #include "scrub/common.h"
 #include "scrub/trace.h"
 
-/*
- * Walk all the blocks in the AGFL.  The fn function can return any negative
- * error code or XFS_BTREE_QUERY_RANGE_ABORT.
- */
-int
-xfs_scrub_walk_agfl(
-	struct xfs_scrub_context	*sc,
-	int				(*fn)(struct xfs_scrub_context *,
-					      xfs_agblock_t bno, void *),
-	void				*priv)
-{
-	struct xfs_agf			*agf;
-	__be32				*agfl_bno;
-	struct xfs_mount		*mp = sc->mp;
-	unsigned int			flfirst;
-	unsigned int			fllast;
-	int				i;
-	int				error;
-
-	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
-	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp);
-	flfirst = be32_to_cpu(agf->agf_flfirst);
-	fllast = be32_to_cpu(agf->agf_fllast);
-
-	/* Nothing to walk in an empty AGFL. */
-	if (agf->agf_flcount == cpu_to_be32(0))
-		return 0;
-
-	/* first to last is a consecutive list. */
-	if (fllast >= flfirst) {
-		for (i = flfirst; i <= fllast; i++) {
-			error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
-			if (error)
-				return error;
-			if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-				return error;
-		}
-
-		return 0;
-	}
-
-	/* first to the end */
-	for (i = flfirst; i < xfs_agfl_size(mp); i++) {
-		error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
-		if (error)
-			return error;
-		if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-			return error;
-	}
-
-	/* the start to last. */
-	for (i = 0; i <= fllast; i++) {
-		error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
-		if (error)
-			return error;
-		if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-			return error;
-	}
-
-	return 0;
-}
-
 /* Superblock */
 
 /* Cross-reference with the other btrees. */
@@ -678,6 +616,7 @@ struct xfs_scrub_agfl_info {
 	unsigned int			sz_entries;
 	unsigned int			nr_entries;
 	xfs_agblock_t			*entries;
+	struct xfs_scrub_context	*sc;
 };
 
 /* Cross-reference with the other btrees. */
@@ -699,12 +638,12 @@ xfs_scrub_agfl_block_xref(
 /* Scrub an AGFL block. */
 STATIC int
 xfs_scrub_agfl_block(
-	struct xfs_scrub_context	*sc,
+	struct xfs_mount		*mp,
 	xfs_agblock_t			agbno,
 	void				*priv)
 {
-	struct xfs_mount		*mp = sc->mp;
 	struct xfs_scrub_agfl_info	*sai = priv;
+	struct xfs_scrub_context	*sc = sai->sc;
 	xfs_agnumber_t			agno = sc->sa.agno;
 
 	if (xfs_verify_agbno(mp, agno, agbno) &&
@@ -715,6 +654,9 @@ xfs_scrub_agfl_block(
 
 	xfs_scrub_agfl_block_xref(sc, agbno, priv);
 
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return XFS_BTREE_QUERY_RANGE_ABORT;
+
 	return 0;
 }
 
@@ -794,6 +736,7 @@ xfs_scrub_agfl(
 		goto out;
 	}
 	memset(&sai, 0, sizeof(sai));
+	sai.sc = sc;
 	sai.sz_entries = agflcount;
 	sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount,
 			KM_MAYFAIL);
@@ -804,7 +747,12 @@ xfs_scrub_agfl(
 
 	/* Check the blocks in the AGFL. */
 	xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG);
-	error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai);
+	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
+			sc->sa.agfl_bp, xfs_scrub_agfl_block, &sai);
+	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+		error = 0;
+		goto out_free;
+	}
 	if (error)
 		goto out_free;
 
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 119d9b6db887..a660087b606e 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -129,10 +129,6 @@ int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
 void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa);
 int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc,
 			    struct xfs_scrub_ag *sa);
-int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc,
-			int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno,
-				  void *),
-			void *priv);
 int xfs_scrub_count_rmap_ownedby_ag(struct xfs_scrub_context *sc,
 				    struct xfs_btree_cur *cur,
 				    struct xfs_owner_info *oinfo,

From 7cf199ba5a70dbc744276efc94442fb4436dac15 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:34 -0700
Subject: [PATCH 072/121] xfs: make xfs_bmapi_remapi work with attribute forks

Add a new flags argument to xfs_bmapi_remapi so that we can pass BMAPI
flags into the function.  This enables us to pass in BMAPI_ATTRFORK so
that we can remap things into the attribute fork.  Eventually the
online repair code will use this to rebuild attribute forks, so make it
non-static.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 28 ++++++++++++++++------------
 fs/xfs/libxfs/xfs_bmap.h |  4 ++++
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 0fd051064ff0..b63e15a114f3 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4520,30 +4520,34 @@ error0:
 	return error;
 }
 
-static int
+int
 xfs_bmapi_remap(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
 	xfs_fileoff_t		bno,
 	xfs_filblks_t		len,
 	xfs_fsblock_t		startblock,
-	struct xfs_defer_ops	*dfops)
+	struct xfs_defer_ops	*dfops,
+	int			flags)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp;
 	struct xfs_btree_cur	*cur = NULL;
 	xfs_fsblock_t		firstblock = NULLFSBLOCK;
 	struct xfs_bmbt_irec	got;
 	struct xfs_iext_cursor	icur;
+	int			whichfork = xfs_bmapi_whichfork(flags);
 	int			logflags = 0, error;
 
+	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(len > 0);
 	ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK)));
 
 	if (unlikely(XFS_TEST_ERROR(
-	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
-	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
 	     mp, XFS_ERRTAG_BMAPIFORMAT))) {
 		XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
 		return -EFSCORRUPTED;
@@ -4553,7 +4557,7 @@ xfs_bmapi_remap(
 		return -EIO;
 
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		error = xfs_iread_extents(tp, ip, whichfork);
 		if (error)
 			return error;
 	}
@@ -4568,7 +4572,7 @@ xfs_bmapi_remap(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 	if (ifp->if_flags & XFS_IFBROOT) {
-		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = firstblock;
 		cur->bc_private.b.dfops = dfops;
 		cur->bc_private.b.flags = 0;
@@ -4579,16 +4583,16 @@ xfs_bmapi_remap(
 	got.br_blockcount = len;
 	got.br_state = XFS_EXT_NORM;
 
-	error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur,
-			&cur, &got, &firstblock, dfops, &logflags, 0);
+	error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, &icur,
+			&cur, &got, &firstblock, dfops, &logflags, flags);
 	if (error)
 		goto error0;
 
-	if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) {
+	if (xfs_bmap_wants_extents(ip, whichfork)) {
 		int		tmp_logflags = 0;
 
 		error = xfs_bmap_btree_to_extents(tp, ip, cur,
-			&tmp_logflags, XFS_DATA_FORK);
+			&tmp_logflags, whichfork);
 		logflags |= tmp_logflags;
 	}
 
@@ -6162,7 +6166,7 @@ xfs_bmap_finish_one(
 	switch (type) {
 	case XFS_BMAP_MAP:
 		error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
-				startblock, dfops);
+				startblock, dfops, 0);
 		*blockcount = 0;
 		break;
 	case XFS_BMAP_UNMAP:
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 6046012674c8..2c233f9f1a26 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -297,4 +297,8 @@ static inline int xfs_bmap_fork_to_state(int whichfork)
 xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *irec);
 
+int	xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
+		struct xfs_defer_ops *dfops, int flags);
+
 #endif	/* __XFS_BMAP_H__ */

From 7644bd988d911168c80599bc034bb489dc851dcf Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:35 -0700
Subject: [PATCH 073/121] xfs: teach xfs_bmapi_remap to accept some bmapi flags

Teach xfs_bmapi_remap how to map in unwritten extent and to skip rmap
updates.  This enables us to rebuild real and unwritten extents from the
rmapbt.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b63e15a114f3..7b0e2b551e23 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4543,7 +4543,10 @@ xfs_bmapi_remap(
 	ASSERT(len > 0);
 	ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK)));
+	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
+			   XFS_BMAPI_NORMAP)));
+	ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
+			(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
 
 	if (unlikely(XFS_TEST_ERROR(
 	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -4581,7 +4584,10 @@ xfs_bmapi_remap(
 	got.br_startoff = bno;
 	got.br_startblock = startblock;
 	got.br_blockcount = len;
-	got.br_state = XFS_EXT_NORM;
+	if (flags & XFS_BMAPI_PREALLOC)
+		got.br_state = XFS_EXT_UNWRITTEN;
+	else
+		got.br_state = XFS_EXT_NORM;
 
 	error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, &icur,
 			&cur, &got, &firstblock, dfops, &logflags, flags);

From 718fa74b153f26623e5cacfb24522f72b7ae9d80 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:35 -0700
Subject: [PATCH 074/121] xfs: create tracepoints for online repair

These tracepoints will be used to debug the online repair routines.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/scrub/trace.h | 258 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 258 insertions(+)

diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 5d2b1c241be5..794d56bb1af8 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -69,6 +69,8 @@ DEFINE_EVENT(xfs_scrub_class, name, \
 DEFINE_SCRUB_EVENT(xfs_scrub_start);
 DEFINE_SCRUB_EVENT(xfs_scrub_done);
 DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry);
+DEFINE_SCRUB_EVENT(xfs_repair_attempt);
+DEFINE_SCRUB_EVENT(xfs_repair_done);
 
 TRACE_EVENT(xfs_scrub_op_error,
 	TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
@@ -492,6 +494,262 @@ TRACE_EVENT(xfs_scrub_xref_error,
 		  __entry->ret_ip)
 );
 
+/* repair tracepoints */
+#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
+
+DECLARE_EVENT_CLASS(xfs_repair_extent_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len),
+	TP_ARGS(mp, agno, agbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len)
+);
+#define DEFINE_REPAIR_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_repair_extent_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_extlen_t len), \
+	TP_ARGS(mp, agno, agbno, len))
+DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_dispose_btree_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_collect_btree_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_agfl_insert);
+
+DECLARE_EVENT_CLASS(xfs_repair_rmap_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_extlen_t len,
+		 uint64_t owner, uint64_t offset, unsigned int flags),
+	TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(uint64_t, owner)
+		__field(uint64_t, offset)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->owner = owner;
+		__entry->offset = offset;
+		__entry->flags = flags;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->owner,
+		  __entry->offset,
+		  __entry->flags)
+);
+#define DEFINE_REPAIR_RMAP_EVENT(name) \
+DEFINE_EVENT(xfs_repair_rmap_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_extlen_t len, \
+		 uint64_t owner, uint64_t offset, unsigned int flags), \
+	TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_alloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_ialloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_rmap_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_bmap_extent_fn);
+
+TRACE_EVENT(xfs_repair_refcount_extent_fn,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *irec),
+	TP_ARGS(mp, agno, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, startblock)
+		__field(xfs_extlen_t, blockcount)
+		__field(xfs_nlink_t, refcount)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startblock = irec->rc_startblock;
+		__entry->blockcount = irec->rc_blockcount;
+		__entry->refcount = irec->rc_refcount;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->startblock,
+		  __entry->blockcount,
+		  __entry->refcount)
+)
+
+TRACE_EVENT(xfs_repair_init_btblock,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+		 xfs_btnum_t btnum),
+	TP_ARGS(mp, agno, agbno, btnum),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(uint32_t, btnum)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->btnum = btnum;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u btnum %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->btnum)
+)
+TRACE_EVENT(xfs_repair_findroot_block,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+		 uint32_t magic, uint16_t level),
+	TP_ARGS(mp, agno, agbno, magic, level),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(uint32_t, magic)
+		__field(uint16_t, level)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->magic = magic;
+		__entry->level = level;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->magic,
+		  __entry->level)
+)
+TRACE_EVENT(xfs_repair_calc_ag_resblks,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen,
+		 xfs_agblock_t usedlen),
+	TP_ARGS(mp, agno, icount, aglen, freelen, usedlen),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, icount)
+		__field(xfs_agblock_t, aglen)
+		__field(xfs_agblock_t, freelen)
+		__field(xfs_agblock_t, usedlen)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->icount = icount;
+		__entry->aglen = aglen;
+		__entry->freelen = freelen;
+		__entry->usedlen = usedlen;
+	),
+	TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->icount,
+		  __entry->aglen,
+		  __entry->freelen,
+		  __entry->usedlen)
+)
+TRACE_EVENT(xfs_repair_calc_ag_resblks_btsize,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz,
+		 xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz),
+	TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, bnobt_sz)
+		__field(xfs_agblock_t, inobt_sz)
+		__field(xfs_agblock_t, rmapbt_sz)
+		__field(xfs_agblock_t, refcbt_sz)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->bnobt_sz = bnobt_sz;
+		__entry->inobt_sz = inobt_sz;
+		__entry->rmapbt_sz = rmapbt_sz;
+		__entry->refcbt_sz = refcbt_sz;
+	),
+	TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->bnobt_sz,
+		  __entry->inobt_sz,
+		  __entry->rmapbt_sz,
+		  __entry->refcbt_sz)
+)
+TRACE_EVENT(xfs_repair_reset_counters,
+	TP_PROTO(struct xfs_mount *mp),
+	TP_ARGS(mp),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+	),
+	TP_printk("dev %d:%d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev))
+)
+
+TRACE_EVENT(xfs_repair_ialloc_insert,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agino_t startino, uint16_t holemask, uint8_t count,
+		 uint8_t freecount, uint64_t freemask),
+	TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, startino)
+		__field(uint16_t, holemask)
+		__field(uint8_t, count)
+		__field(uint8_t, freecount)
+		__field(uint64_t, freemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startino = startino;
+		__entry->holemask = holemask;
+		__entry->count = count;
+		__entry->freecount = freecount;
+		__entry->freemask = freemask;
+	),
+	TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->startino,
+		  __entry->holemask,
+		  __entry->count,
+		  __entry->freecount,
+		  __entry->freemask)
+)
+
+#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
+
 #endif /* _TRACE_XFS_SCRUB_TRACE_H */
 
 #undef TRACE_INCLUDE_PATH

From 84d42ea6b6269aee7eb3d91a4425a08b8965fd4a Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 14 May 2018 06:34:36 -0700
Subject: [PATCH 075/121] xfs: implement the metadata repair ioctl flag

Plumb in the pieces necessary to make the "scrub" subfunction of
the scrub ioctl actually work.  This means that we make the IFLAG_REPAIR
flag to the scrub ioctl actually do something, and we add an errortag
knob so that xfstests can force the kernel to rebuild a metadata
structure even if there's nothing wrong with it.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/Kconfig               |  18 +++++
 fs/xfs/Makefile              |   7 ++
 fs/xfs/libxfs/xfs_errortag.h |   4 +-
 fs/xfs/libxfs/xfs_fs.h       |   9 ++-
 fs/xfs/scrub/repair.c        | 130 +++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h        |  56 ++++++++++++++
 fs/xfs/scrub/scrub.c         | 138 +++++++++++++++++++++++++++++++++--
 fs/xfs/scrub/scrub.h         |   3 +
 fs/xfs/xfs_error.c           |   3 +
 9 files changed, 359 insertions(+), 9 deletions(-)
 create mode 100644 fs/xfs/scrub/repair.c
 create mode 100644 fs/xfs/scrub/repair.h

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 46bcf0e649f5..457ac9f97377 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -85,6 +85,24 @@ config XFS_ONLINE_SCRUB
 
 	  If unsure, say N.
 
+config XFS_ONLINE_REPAIR
+	bool "XFS online metadata repair support"
+	default n
+	depends on XFS_FS && XFS_ONLINE_SCRUB
+	help
+	  If you say Y here you will be able to repair metadata on a
+	  mounted XFS filesystem.  This feature is intended to reduce
+	  filesystem downtime by fixing minor problems before they cause the
+	  filesystem to go down.  However, it requires that the filesystem be
+	  formatted with secondary metadata, such as reverse mappings and inode
+	  parent pointers.
+
+	  This feature is considered EXPERIMENTAL.  Use with caution!
+
+	  See the xfs_scrub man page in section 8 for additional information.
+
+	  If unsure, say N.
+
 config XFS_WARN
 	bool "XFS Verbose Warnings"
 	depends on XFS_FS && !XFS_DEBUG
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7ceb41a9786a..57de3f4bb2c7 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -163,4 +163,11 @@ xfs-y				+= $(addprefix scrub/, \
 
 xfs-$(CONFIG_XFS_RT)		+= scrub/rtbitmap.o
 xfs-$(CONFIG_XFS_QUOTA)		+= scrub/quota.o
+
+# online repair
+ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
+xfs-y				+= $(addprefix scrub/, \
+				   repair.o \
+				   )
+endif
 endif
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index bc1789d95152..d47b91625945 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -65,7 +65,8 @@
 #define XFS_ERRTAG_LOG_BAD_CRC				29
 #define XFS_ERRTAG_LOG_ITEM_PIN				30
 #define XFS_ERRTAG_BUF_LRU_REF				31
-#define XFS_ERRTAG_MAX					32
+#define XFS_ERRTAG_FORCE_SCRUB_REPAIR			32
+#define XFS_ERRTAG_MAX					33
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -102,5 +103,6 @@
 #define XFS_RANDOM_LOG_BAD_CRC				1
 #define XFS_RANDOM_LOG_ITEM_PIN				1
 #define XFS_RANDOM_BUF_LRU_REF				2
+#define XFS_RANDOM_FORCE_SCRUB_REPAIR			1
 
 #endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index faf1a4edd618..dddc75e4f1f6 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -542,13 +542,20 @@ struct xfs_scrub_metadata {
 /* o: Metadata object looked funny but isn't corrupt. */
 #define XFS_SCRUB_OFLAG_WARNING		(1 << 6)
 
+/*
+ * o: IFLAG_REPAIR was set but metadata object did not need fixing or
+ *    optimization and has therefore not been altered.
+ */
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
+
 #define XFS_SCRUB_FLAGS_IN	(XFS_SCRUB_IFLAG_REPAIR)
 #define XFS_SCRUB_FLAGS_OUT	(XFS_SCRUB_OFLAG_CORRUPT | \
 				 XFS_SCRUB_OFLAG_PREEN | \
 				 XFS_SCRUB_OFLAG_XFAIL | \
 				 XFS_SCRUB_OFLAG_XCORRUPT | \
 				 XFS_SCRUB_OFLAG_INCOMPLETE | \
-				 XFS_SCRUB_OFLAG_WARNING)
+				 XFS_SCRUB_OFLAG_WARNING | \
+				 XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
 #define XFS_SCRUB_FLAGS_ALL	(XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
 
 /*
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
new file mode 100644
index 000000000000..be30825c47c6
--- /dev/null
+++ b/fs/xfs/scrub/repair.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2018 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_extent_busy.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Attempt to repair some metadata, if the metadata is corrupt and userspace
+ * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
+ * and will set *fixed to true if it thinks it repaired anything.
+ */
+int
+xfs_repair_attempt(
+	struct xfs_inode		*ip,
+	struct xfs_scrub_context	*sc,
+	bool				*fixed)
+{
+	int				error = 0;
+
+	trace_xfs_repair_attempt(ip, sc->sm, error);
+
+	xfs_scrub_ag_btcur_free(&sc->sa);
+
+	/* Repair whatever's broken. */
+	ASSERT(sc->ops->repair);
+	error = sc->ops->repair(sc);
+	trace_xfs_repair_done(ip, sc->sm, error);
+	switch (error) {
+	case 0:
+		/*
+		 * Repair succeeded.  Commit the fixes and perform a second
+		 * scrub so that we can tell userspace if we fixed the problem.
+		 */
+		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+		*fixed = true;
+		return -EAGAIN;
+	case -EDEADLOCK:
+	case -EAGAIN:
+		/* Tell the caller to try again having grabbed all the locks. */
+		if (!sc->try_harder) {
+			sc->try_harder = true;
+			return -EAGAIN;
+		}
+		/*
+		 * We tried harder but still couldn't grab all the resources
+		 * we needed to fix it.  The corruption has not been fixed,
+		 * so report back to userspace.
+		 */
+		return -EFSCORRUPTED;
+	default:
+		return error;
+	}
+}
+
+/*
+ * Complain about unfixable problems in the filesystem.  We don't log
+ * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
+ * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
+ * administrator isn't running xfs_scrub in no-repairs mode.
+ *
+ * Use this helper function because _ratelimited silently declares a static
+ * structure to track rate limiting information.
+ */
+void
+xfs_repair_failure(
+	struct xfs_mount		*mp)
+{
+	xfs_alert_ratelimited(mp,
+"Corruption not fixed during online repair.  Unmount and run xfs_repair.");
+}
+
+/*
+ * Repair probe -- userspace uses this to probe if we're willing to repair a
+ * given mountpoint.
+ */
+int
+xfs_repair_probe(
+	struct xfs_scrub_context	*sc)
+{
+	int				error = 0;
+
+	if (xfs_scrub_should_terminate(sc, &error))
+		return error;
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
new file mode 100644
index 000000000000..83170dd3388c
--- /dev/null
+++ b/fs/xfs/scrub/repair.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2018 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_REPAIR_H__
+#define __XFS_SCRUB_REPAIR_H__
+
+static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc)
+{
+	return -EOPNOTSUPP;
+}
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+
+/* Repair helpers */
+
+int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc,
+		bool *fixed);
+void xfs_repair_failure(struct xfs_mount *mp);
+
+/* Metadata repairers */
+
+int xfs_repair_probe(struct xfs_scrub_context *sc);
+
+#else
+
+static inline int xfs_repair_attempt(
+	struct xfs_inode		*ip,
+	struct xfs_scrub_context	*sc,
+	bool				*fixed)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void xfs_repair_failure(struct xfs_mount *mp) {}
+
+#define xfs_repair_probe		xfs_repair_notsupported
+
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif	/* __XFS_SCRUB_REPAIR_H__ */
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index ad6eda4fa123..c5999c28c20c 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -44,11 +44,16 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_quota.h"
 #include "xfs_qm.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
 #include "scrub/btree.h"
+#include "scrub/repair.h"
 
 /*
  * Online Scrub and Repair
@@ -122,6 +127,24 @@
  * XCORRUPT flag; btree query function errors are noted by setting the
  * XFAIL flag and deleting the cursor to prevent further attempts to
  * cross-reference with a defective btree.
+ *
+ * If a piece of metadata proves corrupt or suboptimal, the userspace
+ * program can ask the kernel to apply some tender loving care (TLC) to
+ * the metadata object by setting the REPAIR flag and re-calling the
+ * scrub ioctl.  "Corruption" is defined by metadata violating the
+ * on-disk specification; operations cannot continue if the violation is
+ * left untreated.  It is possible for XFS to continue if an object is
+ * "suboptimal", however performance may be degraded.  Repairs are
+ * usually performed by rebuilding the metadata entirely out of
+ * redundant metadata.  Optimizing, on the other hand, can sometimes be
+ * done without rebuilding entire structures.
+ *
+ * Generally speaking, the repair code has the following code structure:
+ * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
+ * The first check helps us figure out if we need to rebuild or simply
+ * optimize the structure so that the rebuild knows what to do.  The
+ * second check evaluates the completeness of the repair; that is what
+ * is reported to userspace.
  */
 
 /*
@@ -157,7 +180,10 @@ xfs_scrub_teardown(
 {
 	xfs_scrub_ag_free(sc, &sc->sa);
 	if (sc->tp) {
-		xfs_trans_cancel(sc->tp);
+		if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+			error = xfs_trans_commit(sc->tp);
+		else
+			xfs_trans_cancel(sc->tp);
 		sc->tp = NULL;
 	}
 	if (sc->ip) {
@@ -184,126 +210,150 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
 		.type	= ST_NONE,
 		.setup	= xfs_scrub_setup_fs,
 		.scrub	= xfs_scrub_probe,
+		.repair = xfs_repair_probe,
 	},
 	[XFS_SCRUB_TYPE_SB] = {		/* superblock */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_fs,
 		.scrub	= xfs_scrub_superblock,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_AGF] = {	/* agf */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_fs,
 		.scrub	= xfs_scrub_agf,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_AGFL]= {	/* agfl */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_fs,
 		.scrub	= xfs_scrub_agfl,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_AGI] = {	/* agi */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_fs,
 		.scrub	= xfs_scrub_agi,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_BNOBT] = {	/* bnobt */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_ag_allocbt,
 		.scrub	= xfs_scrub_bnobt,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_CNTBT] = {	/* cntbt */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_ag_allocbt,
 		.scrub	= xfs_scrub_cntbt,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_INOBT] = {	/* inobt */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_ag_iallocbt,
 		.scrub	= xfs_scrub_inobt,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_FINOBT] = {	/* finobt */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_ag_iallocbt,
 		.scrub	= xfs_scrub_finobt,
 		.has	= xfs_sb_version_hasfinobt,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_RMAPBT] = {	/* rmapbt */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_ag_rmapbt,
 		.scrub	= xfs_scrub_rmapbt,
 		.has	= xfs_sb_version_hasrmapbt,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_REFCNTBT] = {	/* refcountbt */
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_ag_refcountbt,
 		.scrub	= xfs_scrub_refcountbt,
 		.has	= xfs_sb_version_hasreflink,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_INODE] = {	/* inode record */
 		.type	= ST_INODE,
 		.setup	= xfs_scrub_setup_inode,
 		.scrub	= xfs_scrub_inode,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_BMBTD] = {	/* inode data fork */
 		.type	= ST_INODE,
 		.setup	= xfs_scrub_setup_inode_bmap,
 		.scrub	= xfs_scrub_bmap_data,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_BMBTA] = {	/* inode attr fork */
 		.type	= ST_INODE,
 		.setup	= xfs_scrub_setup_inode_bmap,
 		.scrub	= xfs_scrub_bmap_attr,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_BMBTC] = {	/* inode CoW fork */
 		.type	= ST_INODE,
 		.setup	= xfs_scrub_setup_inode_bmap,
 		.scrub	= xfs_scrub_bmap_cow,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_DIR] = {	/* directory */
 		.type	= ST_INODE,
 		.setup	= xfs_scrub_setup_directory,
 		.scrub	= xfs_scrub_directory,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_XATTR] = {	/* extended attributes */
 		.type	= ST_INODE,
 		.setup	= xfs_scrub_setup_xattr,
 		.scrub	= xfs_scrub_xattr,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_SYMLINK] = {	/* symbolic link */
 		.type	= ST_INODE,
 		.setup	= xfs_scrub_setup_symlink,
 		.scrub	= xfs_scrub_symlink,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_PARENT] = {	/* parent pointers */
 		.type	= ST_INODE,
 		.setup	= xfs_scrub_setup_parent,
 		.scrub	= xfs_scrub_parent,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_RTBITMAP] = {	/* realtime bitmap */
 		.type	= ST_FS,
 		.setup	= xfs_scrub_setup_rt,
 		.scrub	= xfs_scrub_rtbitmap,
 		.has	= xfs_sb_version_hasrealtime,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_RTSUM] = {	/* realtime summary */
 		.type	= ST_FS,
 		.setup	= xfs_scrub_setup_rt,
 		.scrub	= xfs_scrub_rtsummary,
 		.has	= xfs_sb_version_hasrealtime,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_UQUOTA] = {	/* user quota */
 		.type	= ST_FS,
 		.setup	= xfs_scrub_setup_quota,
 		.scrub	= xfs_scrub_quota,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_GQUOTA] = {	/* group quota */
 		.type	= ST_FS,
 		.setup	= xfs_scrub_setup_quota,
 		.scrub	= xfs_scrub_quota,
+		.repair	= xfs_repair_notsupported,
 	},
 	[XFS_SCRUB_TYPE_PQUOTA] = {	/* project quota */
 		.type	= ST_FS,
 		.setup	= xfs_scrub_setup_quota,
 		.scrub	= xfs_scrub_quota,
+		.repair	= xfs_repair_notsupported,
 	},
 };
 
@@ -383,15 +433,54 @@ xfs_scrub_validate_inputs(
 	if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
 		goto out;
 
-	/* We don't know how to repair anything yet. */
-	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
-		goto out;
+	/*
+	 * We only want to repair read-write v5+ filesystems.  Defer the check
+	 * for ops->repair until after our scrub confirms that we need to
+	 * perform repairs so that we avoid failing due to not supporting
+	 * repairing an object that doesn't need repairs.
+	 */
+	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+		error = -EOPNOTSUPP;
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			goto out;
+
+		error = -EROFS;
+		if (mp->m_flags & XFS_MOUNT_RDONLY)
+			goto out;
+	}
 
 	error = 0;
 out:
 	return error;
 }
 
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc)
+{
+	/*
+	 * Userspace asked us to repair something, we repaired it, rescanned
+	 * it, and the rescan says it's still broken.  Scream about this in
+	 * the system logs.
+	 */
+	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+	    (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+				 XFS_SCRUB_OFLAG_XCORRUPT)))
+		xfs_repair_failure(sc->mp);
+}
+#else
+static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc)
+{
+	/*
+	 * Userspace asked us to scrub something, it's broken, and we have no
+	 * way of fixing it.  Scream in the logs.
+	 */
+	if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+				XFS_SCRUB_OFLAG_XCORRUPT))
+		xfs_alert_ratelimited(sc->mp,
+				"Corruption detected during scrub.");
+}
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
 /* Dispatch metadata scrubbing. */
 int
 xfs_scrub_metadata(
@@ -401,6 +490,7 @@ xfs_scrub_metadata(
 	struct xfs_scrub_context	sc;
 	struct xfs_mount		*mp = ip->i_mount;
 	bool				try_harder = false;
+	bool				already_fixed = false;
 	int				error = 0;
 
 	BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
@@ -450,10 +540,44 @@ retry_op:
 	} else if (error)
 		goto out_teardown;
 
-	if (sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
-			    XFS_SCRUB_OFLAG_XCORRUPT))
-		xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
+	if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) {
+		bool needs_fix;
 
+		/* Let debug users force us into the repair routines. */
+		if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+			sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+		needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+						XFS_SCRUB_OFLAG_XCORRUPT |
+						XFS_SCRUB_OFLAG_PREEN));
+		/*
+		 * If userspace asked for a repair but it wasn't necessary,
+		 * report that back to userspace.
+		 */
+		if (!needs_fix) {
+			sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
+			goto out_nofix;
+		}
+
+		/*
+		 * If it's broken, userspace wants us to fix it, and we haven't
+		 * already tried to fix it, then attempt a repair.
+		 */
+		error = xfs_repair_attempt(ip, &sc, &already_fixed);
+		if (error == -EAGAIN) {
+			if (sc.try_harder)
+				try_harder = true;
+			error = xfs_scrub_teardown(&sc, ip, 0);
+			if (error) {
+				xfs_repair_failure(mp);
+				goto out;
+			}
+			goto retry_op;
+		}
+	}
+
+out_nofix:
+	xfs_scrub_postmortem(&sc);
 out_teardown:
 	error = xfs_scrub_teardown(&sc, ip, error);
 out:
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 5d797319fc9a..2f89a84a0e10 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -38,6 +38,9 @@ struct xfs_scrub_meta_ops {
 	/* Examine metadata for errors. */
 	int		(*scrub)(struct xfs_scrub_context *);
 
+	/* Repair or optimize the metadata. */
+	int		(*repair)(struct xfs_scrub_context *);
+
 	/* Decide if we even have this piece of metadata. */
 	bool		(*has)(struct xfs_sb *);
 
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index a63f5083f497..7975634cb8fe 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -61,6 +61,7 @@ static unsigned int xfs_errortag_random_default[] = {
 	XFS_RANDOM_LOG_BAD_CRC,
 	XFS_RANDOM_LOG_ITEM_PIN,
 	XFS_RANDOM_BUF_LRU_REF,
+	XFS_RANDOM_FORCE_SCRUB_REPAIR,
 };
 
 struct xfs_errortag_attr {
@@ -167,6 +168,7 @@ XFS_ERRORTAG_ATTR_RW(drop_writes,	XFS_ERRTAG_DROP_WRITES);
 XFS_ERRORTAG_ATTR_RW(log_bad_crc,	XFS_ERRTAG_LOG_BAD_CRC);
 XFS_ERRORTAG_ATTR_RW(log_item_pin,	XFS_ERRTAG_LOG_ITEM_PIN);
 XFS_ERRORTAG_ATTR_RW(buf_lru_ref,	XFS_ERRTAG_BUF_LRU_REF);
+XFS_ERRORTAG_ATTR_RW(force_repair,	XFS_ERRTAG_FORCE_SCRUB_REPAIR);
 
 static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -201,6 +203,7 @@ static struct attribute *xfs_errortag_attrs[] = {
 	XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
 	XFS_ERRORTAG_ATTR_LIST(log_item_pin),
 	XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
+	XFS_ERRORTAG_ATTR_LIST(force_repair),
 	NULL,
 };
 

From 879de98ead5106ffd5486aa6c11a3fad141049d9 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:05 -0700
Subject: [PATCH 076/121] xfs: one-shot cached buffers

For the new growfs work, we want to ensure that we serialise
secondary superblock updates with other operations (e.g. scrub)
correctly, but we don't want to cache the buffers for long term
reuse. We need cached buffers for serialisation, however.

To solve this, introduce a "oneshot" buffer which will be marshalled
through the cache but then released once the last current reference
goes away. If the buffer is already cached, then we ignore the
"one-shot" behaviour and leave the buffer in the state it was prior
to the one-shot command being run. This means we don't perturb
either the working set or existing cached buffer state by a one-shot
operation.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_buf.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 830e2f6c064a..f5f2b71c2fde 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -347,6 +347,18 @@ extern void xfs_buf_terminate(void);
 
 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref);
 
+/*
+ * If the buffer is already on the LRU, do nothing. Otherwise set the buffer
+ * up with a reference count of 0 so it will be tossed from the cache when
+ * released.
+ */
+static inline void xfs_buf_oneshot(struct xfs_buf *bp)
+{
+	if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1)
+		return;
+	atomic_set(&bp->b_lru_ref, 0);
+}
+
 static inline int xfs_buf_ispinned(struct xfs_buf *bp)
 {
 	return atomic_read(&bp->b_pin_count);

From cce77bcf48f50e40a2c08fb2a102e761464d2a15 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:05 -0700
Subject: [PATCH 077/121] xfs: factor out AG header initialisation from growfs
 core

The intialisation of new AG headers is mostly common with the
userspace mkfs code and growfs in the kernel, so start factoring it
out so we can move it to libxfs and use it in both places.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_fsops.c | 637 +++++++++++++++++++++++----------------------
 1 file changed, 331 insertions(+), 306 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 523792768080..391d1938a6c8 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -71,20 +71,344 @@ xfs_growfs_get_hdr_buf(
 	return bp;
 }
 
+/*
+ * Write new AG headers to disk. Non-transactional, but written
+ * synchronously so they are completed prior to the growfs transaction
+ * being logged.
+ */
+static int
+xfs_grow_ag_headers(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_extlen_t		agsize,
+	xfs_rfsblock_t		*nfree)
+{
+	struct xfs_agf		*agf;
+	struct xfs_agi		*agi;
+	struct xfs_agfl		*agfl;
+	__be32			*agfl_bno;
+	xfs_alloc_rec_t		*arec;
+	struct xfs_buf		*bp;
+	int			bucket;
+	xfs_extlen_t		tmpsize;
+	int			error = 0;
+
+	/*
+	 * AG freespace header block
+	 */
+	bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0,
+			&xfs_agf_buf_ops);
+	if (!bp) {
+		error = -ENOMEM;
+		goto out_error;
+	}
+
+	agf = XFS_BUF_TO_AGF(bp);
+	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+	agf->agf_seqno = cpu_to_be32(agno);
+	agf->agf_length = cpu_to_be32(agsize);
+	agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
+	agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
+	agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+	agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		agf->agf_roots[XFS_BTNUM_RMAPi] =
+					cpu_to_be32(XFS_RMAP_BLOCK(mp));
+		agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+		agf->agf_rmap_blocks = cpu_to_be32(1);
+	}
+
+	agf->agf_flfirst = cpu_to_be32(1);
+	agf->agf_fllast = 0;
+	agf->agf_flcount = 0;
+	tmpsize = agsize - mp->m_ag_prealloc_blocks;
+	agf->agf_freeblks = cpu_to_be32(tmpsize);
+	agf->agf_longest = cpu_to_be32(tmpsize);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		agf->agf_refcount_root = cpu_to_be32(
+				xfs_refc_block(mp));
+		agf->agf_refcount_level = cpu_to_be32(1);
+		agf->agf_refcount_blocks = cpu_to_be32(1);
+	}
+
+	error = xfs_bwrite(bp);
+	xfs_buf_relse(bp);
+	if (error)
+		goto out_error;
+
+	/*
+	 * AG freelist header block
+	 */
+	bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0,
+			&xfs_agfl_buf_ops);
+	if (!bp) {
+		error = -ENOMEM;
+		goto out_error;
+	}
+
+	agfl = XFS_BUF_TO_AGFL(bp);
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+		agfl->agfl_seqno = cpu_to_be32(agno);
+		uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
+	}
+
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+	for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
+		agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+	error = xfs_bwrite(bp);
+	xfs_buf_relse(bp);
+	if (error)
+		goto out_error;
+
+	/*
+	 * AG inode header block
+	 */
+	bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0,
+			&xfs_agi_buf_ops);
+	if (!bp) {
+		error = -ENOMEM;
+		goto out_error;
+	}
+
+	agi = XFS_BUF_TO_AGI(bp);
+	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+	agi->agi_seqno = cpu_to_be32(agno);
+	agi->agi_length = cpu_to_be32(agsize);
+	agi->agi_count = 0;
+	agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
+	agi->agi_level = cpu_to_be32(1);
+	agi->agi_freecount = 0;
+	agi->agi_newino = cpu_to_be32(NULLAGINO);
+	agi->agi_dirino = cpu_to_be32(NULLAGINO);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
+	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+		agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+		agi->agi_free_level = cpu_to_be32(1);
+	}
+	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+		agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+
+	error = xfs_bwrite(bp);
+	xfs_buf_relse(bp);
+	if (error)
+		goto out_error;
+
+	/*
+	 * BNO btree root block
+	 */
+	bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+			BTOBB(mp->m_sb.sb_blocksize), 0,
+			&xfs_allocbt_buf_ops);
+
+	if (!bp) {
+		error = -ENOMEM;
+		goto out_error;
+	}
+
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0);
+
+	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+	arec->ar_blockcount = cpu_to_be32(
+		agsize - be32_to_cpu(arec->ar_startblock));
+
+	error = xfs_bwrite(bp);
+	xfs_buf_relse(bp);
+	if (error)
+		goto out_error;
+
+	/*
+	 * CNT btree root block
+	 */
+	bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+			BTOBB(mp->m_sb.sb_blocksize), 0,
+			&xfs_allocbt_buf_ops);
+	if (!bp) {
+		error = -ENOMEM;
+		goto out_error;
+	}
+
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0);
+
+	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+	arec->ar_blockcount = cpu_to_be32(
+		agsize - be32_to_cpu(arec->ar_startblock));
+	*nfree += be32_to_cpu(arec->ar_blockcount);
+
+	error = xfs_bwrite(bp);
+	xfs_buf_relse(bp);
+	if (error)
+		goto out_error;
+
+	/* RMAP btree root block */
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		struct xfs_rmap_rec	*rrec;
+		struct xfs_btree_block	*block;
+
+		bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)),
+			BTOBB(mp->m_sb.sb_blocksize), 0,
+			&xfs_rmapbt_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto out_error;
+		}
+
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0,
+					agno, 0);
+		block = XFS_BUF_TO_BLOCK(bp);
+
+
+		/*
+		 * mark the AG header regions as static metadata The BNO
+		 * btree block is the first block after the headers, so
+		 * it's location defines the size of region the static
+		 * metadata consumes.
+		 *
+		 * Note: unlike mkfs, we never have to account for log
+		 * space when growing the data regions
+		 */
+		rrec = XFS_RMAP_REC_ADDR(block, 1);
+		rrec->rm_startblock = 0;
+		rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
+		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
+		rrec->rm_offset = 0;
+		be16_add_cpu(&block->bb_numrecs, 1);
+
+		/* account freespace btree root blocks */
+		rrec = XFS_RMAP_REC_ADDR(block, 2);
+		rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
+		rrec->rm_blockcount = cpu_to_be32(2);
+		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+		rrec->rm_offset = 0;
+		be16_add_cpu(&block->bb_numrecs, 1);
+
+		/* account inode btree root blocks */
+		rrec = XFS_RMAP_REC_ADDR(block, 3);
+		rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
+		rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
+						XFS_IBT_BLOCK(mp));
+		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
+		rrec->rm_offset = 0;
+		be16_add_cpu(&block->bb_numrecs, 1);
+
+		/* account for rmap btree root */
+		rrec = XFS_RMAP_REC_ADDR(block, 4);
+		rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+		rrec->rm_blockcount = cpu_to_be32(1);
+		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+		rrec->rm_offset = 0;
+		be16_add_cpu(&block->bb_numrecs, 1);
+
+		/* account for refc btree root */
+		if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+			rrec = XFS_RMAP_REC_ADDR(block, 5);
+			rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
+			rrec->rm_blockcount = cpu_to_be32(1);
+			rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+			rrec->rm_offset = 0;
+			be16_add_cpu(&block->bb_numrecs, 1);
+		}
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto out_error;
+	}
+
+	/*
+	 * INO btree root block
+	 */
+	bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+			BTOBB(mp->m_sb.sb_blocksize), 0,
+			&xfs_inobt_buf_ops);
+	if (!bp) {
+		error = -ENOMEM;
+		goto out_error;
+	}
+
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0);
+
+	error = xfs_bwrite(bp);
+	xfs_buf_relse(bp);
+	if (error)
+		goto out_error;
+
+	/*
+	 * FINO btree root block
+	 */
+	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+		bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
+			BTOBB(mp->m_sb.sb_blocksize), 0,
+			&xfs_inobt_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto out_error;
+		}
+
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO,
+					     0, 0, agno, 0);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto out_error;
+	}
+
+	/*
+	 * refcount btree root block
+	 */
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		bp = xfs_growfs_get_hdr_buf(mp,
+			XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)),
+			BTOBB(mp->m_sb.sb_blocksize), 0,
+			&xfs_refcountbt_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto out_error;
+		}
+
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC,
+				     0, 0, agno, 0);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto out_error;
+	}
+
+out_error:
+	return error;
+}
+
 static int
 xfs_growfs_data_private(
 	xfs_mount_t		*mp,		/* mount point for filesystem */
 	xfs_growfs_data_t	*in)		/* growfs data input struct */
 {
 	xfs_agf_t		*agf;
-	struct xfs_agfl		*agfl;
 	xfs_agi_t		*agi;
 	xfs_agnumber_t		agno;
 	xfs_extlen_t		agsize;
-	xfs_extlen_t		tmpsize;
-	xfs_alloc_rec_t		*arec;
 	xfs_buf_t		*bp;
-	int			bucket;
 	int			dpct;
 	int			error, saved_error = 0;
 	xfs_agnumber_t		nagcount;
@@ -141,318 +465,19 @@ xfs_growfs_data_private(
 	 */
 	nfree = 0;
 	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
-		__be32	*agfl_bno;
 
-		/*
-		 * AG freespace header block
-		 */
-		bp = xfs_growfs_get_hdr_buf(mp,
-				XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-				XFS_FSS_TO_BB(mp, 1), 0,
-				&xfs_agf_buf_ops);
-		if (!bp) {
-			error = -ENOMEM;
-			goto error0;
-		}
-
-		agf = XFS_BUF_TO_AGF(bp);
-		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
-		agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
-		agf->agf_seqno = cpu_to_be32(agno);
 		if (agno == nagcount - 1)
-			agsize =
-				nb -
+			agsize = nb -
 				(agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
 		else
 			agsize = mp->m_sb.sb_agblocks;
-		agf->agf_length = cpu_to_be32(agsize);
-		agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
-		agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
-		agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
-		agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
-		if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
-			agf->agf_roots[XFS_BTNUM_RMAPi] =
-						cpu_to_be32(XFS_RMAP_BLOCK(mp));
-			agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
-			agf->agf_rmap_blocks = cpu_to_be32(1);
-		}
 
-		agf->agf_flfirst = cpu_to_be32(1);
-		agf->agf_fllast = 0;
-		agf->agf_flcount = 0;
-		tmpsize = agsize - mp->m_ag_prealloc_blocks;
-		agf->agf_freeblks = cpu_to_be32(tmpsize);
-		agf->agf_longest = cpu_to_be32(tmpsize);
-		if (xfs_sb_version_hascrc(&mp->m_sb))
-			uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
-		if (xfs_sb_version_hasreflink(&mp->m_sb)) {
-			agf->agf_refcount_root = cpu_to_be32(
-					xfs_refc_block(mp));
-			agf->agf_refcount_level = cpu_to_be32(1);
-			agf->agf_refcount_blocks = cpu_to_be32(1);
-		}
-
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
+		error = xfs_grow_ag_headers(mp, agno, agsize, &nfree);
 		if (error)
 			goto error0;
-
-		/*
-		 * AG freelist header block
-		 */
-		bp = xfs_growfs_get_hdr_buf(mp,
-				XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-				XFS_FSS_TO_BB(mp, 1), 0,
-				&xfs_agfl_buf_ops);
-		if (!bp) {
-			error = -ENOMEM;
-			goto error0;
-		}
-
-		agfl = XFS_BUF_TO_AGFL(bp);
-		if (xfs_sb_version_hascrc(&mp->m_sb)) {
-			agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
-			agfl->agfl_seqno = cpu_to_be32(agno);
-			uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
-		}
-
-		agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
-		for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
-			agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
-
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-		if (error)
-			goto error0;
-
-		/*
-		 * AG inode header block
-		 */
-		bp = xfs_growfs_get_hdr_buf(mp,
-				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-				XFS_FSS_TO_BB(mp, 1), 0,
-				&xfs_agi_buf_ops);
-		if (!bp) {
-			error = -ENOMEM;
-			goto error0;
-		}
-
-		agi = XFS_BUF_TO_AGI(bp);
-		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
-		agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
-		agi->agi_seqno = cpu_to_be32(agno);
-		agi->agi_length = cpu_to_be32(agsize);
-		agi->agi_count = 0;
-		agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
-		agi->agi_level = cpu_to_be32(1);
-		agi->agi_freecount = 0;
-		agi->agi_newino = cpu_to_be32(NULLAGINO);
-		agi->agi_dirino = cpu_to_be32(NULLAGINO);
-		if (xfs_sb_version_hascrc(&mp->m_sb))
-			uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
-		if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
-			agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
-			agi->agi_free_level = cpu_to_be32(1);
-		}
-		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
-			agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
-
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-		if (error)
-			goto error0;
-
-		/*
-		 * BNO btree root block
-		 */
-		bp = xfs_growfs_get_hdr_buf(mp,
-				XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
-				BTOBB(mp->m_sb.sb_blocksize), 0,
-				&xfs_allocbt_buf_ops);
-
-		if (!bp) {
-			error = -ENOMEM;
-			goto error0;
-		}
-
-		xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0);
-
-		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-		arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
-		arec->ar_blockcount = cpu_to_be32(
-			agsize - be32_to_cpu(arec->ar_startblock));
-
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-		if (error)
-			goto error0;
-
-		/*
-		 * CNT btree root block
-		 */
-		bp = xfs_growfs_get_hdr_buf(mp,
-				XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
-				BTOBB(mp->m_sb.sb_blocksize), 0,
-				&xfs_allocbt_buf_ops);
-		if (!bp) {
-			error = -ENOMEM;
-			goto error0;
-		}
-
-		xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0);
-
-		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-		arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
-		arec->ar_blockcount = cpu_to_be32(
-			agsize - be32_to_cpu(arec->ar_startblock));
-		nfree += be32_to_cpu(arec->ar_blockcount);
-
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-		if (error)
-			goto error0;
-
-		/* RMAP btree root block */
-		if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
-			struct xfs_rmap_rec	*rrec;
-			struct xfs_btree_block	*block;
-
-			bp = xfs_growfs_get_hdr_buf(mp,
-				XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)),
-				BTOBB(mp->m_sb.sb_blocksize), 0,
-				&xfs_rmapbt_buf_ops);
-			if (!bp) {
-				error = -ENOMEM;
-				goto error0;
-			}
-
-			xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0,
-						agno, 0);
-			block = XFS_BUF_TO_BLOCK(bp);
-
-
-			/*
-			 * mark the AG header regions as static metadata The BNO
-			 * btree block is the first block after the headers, so
-			 * it's location defines the size of region the static
-			 * metadata consumes.
-			 *
-			 * Note: unlike mkfs, we never have to account for log
-			 * space when growing the data regions
-			 */
-			rrec = XFS_RMAP_REC_ADDR(block, 1);
-			rrec->rm_startblock = 0;
-			rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
-			rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
-			rrec->rm_offset = 0;
-			be16_add_cpu(&block->bb_numrecs, 1);
-
-			/* account freespace btree root blocks */
-			rrec = XFS_RMAP_REC_ADDR(block, 2);
-			rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
-			rrec->rm_blockcount = cpu_to_be32(2);
-			rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
-			rrec->rm_offset = 0;
-			be16_add_cpu(&block->bb_numrecs, 1);
-
-			/* account inode btree root blocks */
-			rrec = XFS_RMAP_REC_ADDR(block, 3);
-			rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
-			rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
-							XFS_IBT_BLOCK(mp));
-			rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
-			rrec->rm_offset = 0;
-			be16_add_cpu(&block->bb_numrecs, 1);
-
-			/* account for rmap btree root */
-			rrec = XFS_RMAP_REC_ADDR(block, 4);
-			rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
-			rrec->rm_blockcount = cpu_to_be32(1);
-			rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
-			rrec->rm_offset = 0;
-			be16_add_cpu(&block->bb_numrecs, 1);
-
-			/* account for refc btree root */
-			if (xfs_sb_version_hasreflink(&mp->m_sb)) {
-				rrec = XFS_RMAP_REC_ADDR(block, 5);
-				rrec->rm_startblock = cpu_to_be32(
-						xfs_refc_block(mp));
-				rrec->rm_blockcount = cpu_to_be32(1);
-				rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
-				rrec->rm_offset = 0;
-				be16_add_cpu(&block->bb_numrecs, 1);
-			}
-
-			error = xfs_bwrite(bp);
-			xfs_buf_relse(bp);
-			if (error)
-				goto error0;
-		}
-
-		/*
-		 * INO btree root block
-		 */
-		bp = xfs_growfs_get_hdr_buf(mp,
-				XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
-				BTOBB(mp->m_sb.sb_blocksize), 0,
-				&xfs_inobt_buf_ops);
-		if (!bp) {
-			error = -ENOMEM;
-			goto error0;
-		}
-
-		xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0);
-
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-		if (error)
-			goto error0;
-
-		/*
-		 * FINO btree root block
-		 */
-		if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
-			bp = xfs_growfs_get_hdr_buf(mp,
-				XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
-				BTOBB(mp->m_sb.sb_blocksize), 0,
-				&xfs_inobt_buf_ops);
-			if (!bp) {
-				error = -ENOMEM;
-				goto error0;
-			}
-
-			xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO,
-						     0, 0, agno, 0);
-
-			error = xfs_bwrite(bp);
-			xfs_buf_relse(bp);
-			if (error)
-				goto error0;
-		}
-
-		/*
-		 * refcount btree root block
-		 */
-		if (xfs_sb_version_hasreflink(&mp->m_sb)) {
-			bp = xfs_growfs_get_hdr_buf(mp,
-				XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)),
-				BTOBB(mp->m_sb.sb_blocksize), 0,
-				&xfs_refcountbt_buf_ops);
-			if (!bp) {
-				error = -ENOMEM;
-				goto error0;
-			}
-
-			xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC,
-					     0, 0, agno, 0);
-
-			error = xfs_bwrite(bp);
-			xfs_buf_relse(bp);
-			if (error)
-				goto error0;
-		}
 	}
 	xfs_trans_agblocks_delta(tp, nfree);
+
 	/*
 	 * There are new blocks in the old last a.g.
 	 */

From 9aebe805a58b70d9b4a91517803bada0288540da Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:06 -0700
Subject: [PATCH 078/121] xfs: convert growfs AG header init to use buffer
 lists

We currently write all new AG headers synchronously, which can be
slow for large grow operations. All we really need to do is ensure
all the headers are on disk before we run the growfs transaction, so
convert this to a buffer list and a delayed write operation. We
block waiting for the delayed write buffer submission to complete,
so this will fulfill the requirement to have all the buffers written
correctly before proceeding.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_fsops.c | 74 +++++++++++++++++++++-------------------------
 1 file changed, 33 insertions(+), 41 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 391d1938a6c8..4b560caaf397 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -81,7 +81,8 @@ xfs_grow_ag_headers(
 	struct xfs_mount	*mp,
 	xfs_agnumber_t		agno,
 	xfs_extlen_t		agsize,
-	xfs_rfsblock_t		*nfree)
+	xfs_rfsblock_t		*nfree,
+	struct list_head	*buffer_list)
 {
 	struct xfs_agf		*agf;
 	struct xfs_agi		*agi;
@@ -135,11 +136,8 @@ xfs_grow_ag_headers(
 		agf->agf_refcount_level = cpu_to_be32(1);
 		agf->agf_refcount_blocks = cpu_to_be32(1);
 	}
-
-	error = xfs_bwrite(bp);
+	xfs_buf_delwri_queue(bp, buffer_list);
 	xfs_buf_relse(bp);
-	if (error)
-		goto out_error;
 
 	/*
 	 * AG freelist header block
@@ -164,10 +162,8 @@ xfs_grow_ag_headers(
 	for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
 		agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
 
-	error = xfs_bwrite(bp);
+	xfs_buf_delwri_queue(bp, buffer_list);
 	xfs_buf_relse(bp);
-	if (error)
-		goto out_error;
 
 	/*
 	 * AG inode header block
@@ -201,10 +197,8 @@ xfs_grow_ag_headers(
 	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
 		agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
 
-	error = xfs_bwrite(bp);
+	xfs_buf_delwri_queue(bp, buffer_list);
 	xfs_buf_relse(bp);
-	if (error)
-		goto out_error;
 
 	/*
 	 * BNO btree root block
@@ -226,10 +220,8 @@ xfs_grow_ag_headers(
 	arec->ar_blockcount = cpu_to_be32(
 		agsize - be32_to_cpu(arec->ar_startblock));
 
-	error = xfs_bwrite(bp);
+	xfs_buf_delwri_queue(bp, buffer_list);
 	xfs_buf_relse(bp);
-	if (error)
-		goto out_error;
 
 	/*
 	 * CNT btree root block
@@ -251,10 +243,8 @@ xfs_grow_ag_headers(
 		agsize - be32_to_cpu(arec->ar_startblock));
 	*nfree += be32_to_cpu(arec->ar_blockcount);
 
-	error = xfs_bwrite(bp);
+	xfs_buf_delwri_queue(bp, buffer_list);
 	xfs_buf_relse(bp);
-	if (error)
-		goto out_error;
 
 	/* RMAP btree root block */
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
@@ -326,10 +316,8 @@ xfs_grow_ag_headers(
 			be16_add_cpu(&block->bb_numrecs, 1);
 		}
 
-		error = xfs_bwrite(bp);
+		xfs_buf_delwri_queue(bp, buffer_list);
 		xfs_buf_relse(bp);
-		if (error)
-			goto out_error;
 	}
 
 	/*
@@ -345,11 +333,8 @@ xfs_grow_ag_headers(
 	}
 
 	xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0);
-
-	error = xfs_bwrite(bp);
+	xfs_buf_delwri_queue(bp, buffer_list);
 	xfs_buf_relse(bp);
-	if (error)
-		goto out_error;
 
 	/*
 	 * FINO btree root block
@@ -364,13 +349,9 @@ xfs_grow_ag_headers(
 			goto out_error;
 		}
 
-		xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO,
-					     0, 0, agno, 0);
-
-		error = xfs_bwrite(bp);
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO, 0, 0, agno, 0);
+		xfs_buf_delwri_queue(bp, buffer_list);
 		xfs_buf_relse(bp);
-		if (error)
-			goto out_error;
 	}
 
 	/*
@@ -386,13 +367,9 @@ xfs_grow_ag_headers(
 			goto out_error;
 		}
 
-		xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC,
-				     0, 0, agno, 0);
-
-		error = xfs_bwrite(bp);
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC, 0, 0, agno, 0);
+		xfs_buf_delwri_queue(bp, buffer_list);
 		xfs_buf_relse(bp);
-		if (error)
-			goto out_error;
 	}
 
 out_error:
@@ -419,6 +396,7 @@ xfs_growfs_data_private(
 	xfs_agnumber_t		oagcount;
 	int			pct;
 	xfs_trans_t		*tp;
+	LIST_HEAD		(buffer_list);
 
 	nb = in->newblocks;
 	pct = in->imaxpct;
@@ -459,9 +437,16 @@ xfs_growfs_data_private(
 		return error;
 
 	/*
-	 * Write new AG headers to disk. Non-transactional, but written
-	 * synchronously so they are completed prior to the growfs transaction
-	 * being logged.
+	 * Write new AG headers to disk. Non-transactional, but need to be
+	 * written and completed prior to the growfs transaction being logged.
+	 * To do this, we use a delayed write buffer list and wait for
+	 * submission and IO completion of the list as a whole. This allows the
+	 * IO subsystem to merge all the AG headers in a single AG into a single
+	 * IO and hide most of the latency of the IO from us.
+	 *
+	 * This also means that if we get an error whilst building the buffer
+	 * list to write, we can cancel the entire list without having written
+	 * anything.
 	 */
 	nfree = 0;
 	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
@@ -472,10 +457,17 @@ xfs_growfs_data_private(
 		else
 			agsize = mp->m_sb.sb_agblocks;
 
-		error = xfs_grow_ag_headers(mp, agno, agsize, &nfree);
-		if (error)
+		error = xfs_grow_ag_headers(mp, agno, agsize, &nfree,
+					    &buffer_list);
+		if (error) {
+			xfs_buf_delwri_cancel(&buffer_list);
 			goto error0;
+		}
 	}
+	error = xfs_buf_delwri_submit(&buffer_list);
+	if (error)
+		goto error0;
+
 	xfs_trans_agblocks_delta(tp, nfree);
 
 	/*

From 0410c3bb2b882cd4476e5ee8d74247ac21f4bfac Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:06 -0700
Subject: [PATCH 079/121] xfs: factor ag btree root block initialisation

Cookie cutter code, easily factored.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_fsops.c | 487 ++++++++++++++++++++++++---------------------
 1 file changed, 262 insertions(+), 225 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 4b560caaf397..ec3c51ee8134 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -71,46 +71,140 @@ xfs_growfs_get_hdr_buf(
 	return bp;
 }
 
+struct aghdr_init_data {
+	/* per ag data */
+	xfs_agnumber_t		agno;		/* ag to init */
+	xfs_extlen_t		agsize;		/* new AG size */
+	struct list_head	buffer_list;	/* buffer writeback list */
+	xfs_rfsblock_t		nfree;		/* cumulative new free space */
+
+	/* per header data */
+	xfs_daddr_t		daddr;		/* header location */
+	size_t			numblks;	/* size of header */
+	xfs_btnum_t		type;		/* type of btree root block */
+};
+
 /*
- * Write new AG headers to disk. Non-transactional, but written
- * synchronously so they are completed prior to the growfs transaction
- * being logged.
+ * Generic btree root block init function
  */
-static int
-xfs_grow_ag_headers(
+static void
+xfs_btroot_init(
 	struct xfs_mount	*mp,
-	xfs_agnumber_t		agno,
-	xfs_extlen_t		agsize,
-	xfs_rfsblock_t		*nfree,
-	struct list_head	*buffer_list)
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
 {
-	struct xfs_agf		*agf;
-	struct xfs_agi		*agi;
-	struct xfs_agfl		*agfl;
-	__be32			*agfl_bno;
-	xfs_alloc_rec_t		*arec;
-	struct xfs_buf		*bp;
-	int			bucket;
-	xfs_extlen_t		tmpsize;
-	int			error = 0;
+	xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0);
+}
+
+/*
+ * Alloc btree root block init functions
+ */
+static void
+xfs_bnoroot_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_alloc_rec	*arec;
+
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0);
+	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+	arec->ar_blockcount = cpu_to_be32(id->agsize -
+					  be32_to_cpu(arec->ar_startblock));
+}
+
+static void
+xfs_cntroot_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_alloc_rec	*arec;
+
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0);
+	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+	arec->ar_blockcount = cpu_to_be32(id->agsize -
+					  be32_to_cpu(arec->ar_startblock));
+}
+
+/*
+ * Reverse map root block init
+ */
+static void
+xfs_rmaproot_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_rmap_rec	*rrec;
+
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0);
 
 	/*
-	 * AG freespace header block
+	 * mark the AG header regions as static metadata The BNO
+	 * btree block is the first block after the headers, so
+	 * it's location defines the size of region the static
+	 * metadata consumes.
+	 *
+	 * Note: unlike mkfs, we never have to account for log
+	 * space when growing the data regions
 	 */
-	bp = xfs_growfs_get_hdr_buf(mp,
-			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0,
-			&xfs_agf_buf_ops);
-	if (!bp) {
-		error = -ENOMEM;
-		goto out_error;
-	}
+	rrec = XFS_RMAP_REC_ADDR(block, 1);
+	rrec->rm_startblock = 0;
+	rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
+	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
+	rrec->rm_offset = 0;
+
+	/* account freespace btree root blocks */
+	rrec = XFS_RMAP_REC_ADDR(block, 2);
+	rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
+	rrec->rm_blockcount = cpu_to_be32(2);
+	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+	rrec->rm_offset = 0;
+
+	/* account inode btree root blocks */
+	rrec = XFS_RMAP_REC_ADDR(block, 3);
+	rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
+	rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
+					  XFS_IBT_BLOCK(mp));
+	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
+	rrec->rm_offset = 0;
+
+	/* account for rmap btree root */
+	rrec = XFS_RMAP_REC_ADDR(block, 4);
+	rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+	rrec->rm_blockcount = cpu_to_be32(1);
+	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+	rrec->rm_offset = 0;
+
+	/* account for refc btree root */
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		rrec = XFS_RMAP_REC_ADDR(block, 5);
+		rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
+		rrec->rm_blockcount = cpu_to_be32(1);
+		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+		rrec->rm_offset = 0;
+		be16_add_cpu(&block->bb_numrecs, 1);
+	}
+}
+
+
+static void
+xfs_agfblock_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(bp);
+	xfs_extlen_t		tmpsize;
 
-	agf = XFS_BUF_TO_AGF(bp);
 	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
 	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
-	agf->agf_seqno = cpu_to_be32(agno);
-	agf->agf_length = cpu_to_be32(agsize);
+	agf->agf_seqno = cpu_to_be32(id->agno);
+	agf->agf_length = cpu_to_be32(id->agsize);
 	agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
 	agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
 	agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
@@ -125,7 +219,7 @@ xfs_grow_ag_headers(
 	agf->agf_flfirst = cpu_to_be32(1);
 	agf->agf_fllast = 0;
 	agf->agf_flcount = 0;
-	tmpsize = agsize - mp->m_ag_prealloc_blocks;
+	tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
 	agf->agf_freeblks = cpu_to_be32(tmpsize);
 	agf->agf_longest = cpu_to_be32(tmpsize);
 	if (xfs_sb_version_hascrc(&mp->m_sb))
@@ -136,52 +230,42 @@ xfs_grow_ag_headers(
 		agf->agf_refcount_level = cpu_to_be32(1);
 		agf->agf_refcount_blocks = cpu_to_be32(1);
 	}
-	xfs_buf_delwri_queue(bp, buffer_list);
-	xfs_buf_relse(bp);
+}
 
-	/*
-	 * AG freelist header block
-	 */
-	bp = xfs_growfs_get_hdr_buf(mp,
-			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0,
-			&xfs_agfl_buf_ops);
-	if (!bp) {
-		error = -ENOMEM;
-		goto out_error;
-	}
+static void
+xfs_agflblock_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_agfl		*agfl = XFS_BUF_TO_AGFL(bp);
+	__be32			*agfl_bno;
+	int			bucket;
 
-	agfl = XFS_BUF_TO_AGFL(bp);
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
-		agfl->agfl_seqno = cpu_to_be32(agno);
+		agfl->agfl_seqno = cpu_to_be32(id->agno);
 		uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
 	}
 
 	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
 	for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
 		agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+}
 
-	xfs_buf_delwri_queue(bp, buffer_list);
-	xfs_buf_relse(bp);
+static void
+xfs_agiblock_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(bp);
+	int			bucket;
 
-	/*
-	 * AG inode header block
-	 */
-	bp = xfs_growfs_get_hdr_buf(mp,
-			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0,
-			&xfs_agi_buf_ops);
-	if (!bp) {
-		error = -ENOMEM;
-		goto out_error;
-	}
-
-	agi = XFS_BUF_TO_AGI(bp);
 	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
 	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
-	agi->agi_seqno = cpu_to_be32(agno);
-	agi->agi_length = cpu_to_be32(agsize);
+	agi->agi_seqno = cpu_to_be32(id->agno);
+	agi->agi_length = cpu_to_be32(id->agsize);
 	agi->agi_count = 0;
 	agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
 	agi->agi_level = cpu_to_be32(1);
@@ -196,180 +280,133 @@ xfs_grow_ag_headers(
 	}
 	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
 		agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+}
 
-	xfs_buf_delwri_queue(bp, buffer_list);
+static int
+xfs_growfs_init_aghdr(
+	struct xfs_mount	*mp,
+	struct aghdr_init_data	*id,
+	void			(*work)(struct xfs_mount *, struct xfs_buf *,
+					struct aghdr_init_data *),
+	const struct xfs_buf_ops *ops)
+
+{
+	struct xfs_buf		*bp;
+
+	bp = xfs_growfs_get_hdr_buf(mp, id->daddr, id->numblks, 0, ops);
+	if (!bp)
+		return -ENOMEM;
+
+	(*work)(mp, bp, id);
+
+	xfs_buf_delwri_queue(bp, &id->buffer_list);
 	xfs_buf_relse(bp);
+	return 0;
+}
 
-	/*
-	 * BNO btree root block
-	 */
-	bp = xfs_growfs_get_hdr_buf(mp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0,
-			&xfs_allocbt_buf_ops);
+/*
+ * Write new AG headers to disk. Non-transactional, but written
+ * synchronously so they are completed prior to the growfs transaction
+ * being logged.
+ */
+static int
+xfs_grow_ag_headers(
+	struct xfs_mount	*mp,
+	struct aghdr_init_data	*id)
 
-	if (!bp) {
-		error = -ENOMEM;
+{
+	int			error = 0;
+
+	/* Account for AG free space in new AG */
+	id->nfree += id->agsize - mp->m_ag_prealloc_blocks;
+
+	/* AG freespace header block */
+	id->daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp));
+	id->numblks = XFS_FSS_TO_BB(mp, 1);
+	error = xfs_growfs_init_aghdr(mp, id, xfs_agfblock_init,
+					&xfs_agf_buf_ops);
+	if (error)
 		goto out_error;
-	}
 
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0);
-
-	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
-	arec->ar_blockcount = cpu_to_be32(
-		agsize - be32_to_cpu(arec->ar_startblock));
-
-	xfs_buf_delwri_queue(bp, buffer_list);
-	xfs_buf_relse(bp);
-
-	/*
-	 * CNT btree root block
-	 */
-	bp = xfs_growfs_get_hdr_buf(mp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0,
-			&xfs_allocbt_buf_ops);
-	if (!bp) {
-		error = -ENOMEM;
+	/* AG freelist header block */
+	id->daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp));
+	id->numblks = XFS_FSS_TO_BB(mp, 1);
+	error = xfs_growfs_init_aghdr(mp, id, xfs_agflblock_init,
+					&xfs_agfl_buf_ops);
+	if (error)
 		goto out_error;
-	}
 
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0);
+	/* AG inode header block */
+	id->daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp));
+	id->numblks = XFS_FSS_TO_BB(mp, 1);
+	error = xfs_growfs_init_aghdr(mp, id, xfs_agiblock_init,
+					&xfs_agi_buf_ops);
+	if (error)
+		goto out_error;
 
-	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
-	arec->ar_blockcount = cpu_to_be32(
-		agsize - be32_to_cpu(arec->ar_startblock));
-	*nfree += be32_to_cpu(arec->ar_blockcount);
 
-	xfs_buf_delwri_queue(bp, buffer_list);
-	xfs_buf_relse(bp);
+	/* BNO btree root block */
+	id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp));
+	id->numblks = BTOBB(mp->m_sb.sb_blocksize);
+	error = xfs_growfs_init_aghdr(mp, id, xfs_bnoroot_init,
+				   &xfs_allocbt_buf_ops);
+	if (error)
+		goto out_error;
+
+
+	/* CNT btree root block */
+	id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp));
+	id->numblks = BTOBB(mp->m_sb.sb_blocksize);
+	error = xfs_growfs_init_aghdr(mp, id, xfs_cntroot_init,
+				   &xfs_allocbt_buf_ops);
+	if (error)
+		goto out_error;
 
 	/* RMAP btree root block */
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
-		struct xfs_rmap_rec	*rrec;
-		struct xfs_btree_block	*block;
-
-		bp = xfs_growfs_get_hdr_buf(mp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0,
-			&xfs_rmapbt_buf_ops);
-		if (!bp) {
-			error = -ENOMEM;
+		id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp));
+		id->numblks = BTOBB(mp->m_sb.sb_blocksize);
+		error = xfs_growfs_init_aghdr(mp, id, xfs_rmaproot_init,
+					   &xfs_rmapbt_buf_ops);
+		if (error)
 			goto out_error;
-		}
 
-		xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0,
-					agno, 0);
-		block = XFS_BUF_TO_BLOCK(bp);
-
-
-		/*
-		 * mark the AG header regions as static metadata The BNO
-		 * btree block is the first block after the headers, so
-		 * it's location defines the size of region the static
-		 * metadata consumes.
-		 *
-		 * Note: unlike mkfs, we never have to account for log
-		 * space when growing the data regions
-		 */
-		rrec = XFS_RMAP_REC_ADDR(block, 1);
-		rrec->rm_startblock = 0;
-		rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
-		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
-		rrec->rm_offset = 0;
-		be16_add_cpu(&block->bb_numrecs, 1);
-
-		/* account freespace btree root blocks */
-		rrec = XFS_RMAP_REC_ADDR(block, 2);
-		rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
-		rrec->rm_blockcount = cpu_to_be32(2);
-		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
-		rrec->rm_offset = 0;
-		be16_add_cpu(&block->bb_numrecs, 1);
-
-		/* account inode btree root blocks */
-		rrec = XFS_RMAP_REC_ADDR(block, 3);
-		rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
-		rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
-						XFS_IBT_BLOCK(mp));
-		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
-		rrec->rm_offset = 0;
-		be16_add_cpu(&block->bb_numrecs, 1);
-
-		/* account for rmap btree root */
-		rrec = XFS_RMAP_REC_ADDR(block, 4);
-		rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
-		rrec->rm_blockcount = cpu_to_be32(1);
-		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
-		rrec->rm_offset = 0;
-		be16_add_cpu(&block->bb_numrecs, 1);
-
-		/* account for refc btree root */
-		if (xfs_sb_version_hasreflink(&mp->m_sb)) {
-			rrec = XFS_RMAP_REC_ADDR(block, 5);
-			rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
-			rrec->rm_blockcount = cpu_to_be32(1);
-			rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
-			rrec->rm_offset = 0;
-			be16_add_cpu(&block->bb_numrecs, 1);
-		}
-
-		xfs_buf_delwri_queue(bp, buffer_list);
-		xfs_buf_relse(bp);
 	}
 
-	/*
-	 * INO btree root block
-	 */
-	bp = xfs_growfs_get_hdr_buf(mp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0,
-			&xfs_inobt_buf_ops);
-	if (!bp) {
-		error = -ENOMEM;
+	/* INO btree root block */
+	id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp));
+	id->numblks = BTOBB(mp->m_sb.sb_blocksize);
+	id->type = XFS_BTNUM_INO;
+	error = xfs_growfs_init_aghdr(mp, id, xfs_btroot_init,
+				   &xfs_inobt_buf_ops);
+	if (error)
 		goto out_error;
-	}
 
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0);
-	xfs_buf_delwri_queue(bp, buffer_list);
-	xfs_buf_relse(bp);
 
 	/*
 	 * FINO btree root block
 	 */
 	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
-		bp = xfs_growfs_get_hdr_buf(mp,
-			XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0,
-			&xfs_inobt_buf_ops);
-		if (!bp) {
-			error = -ENOMEM;
+		id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp));
+		id->numblks = BTOBB(mp->m_sb.sb_blocksize);
+		id->type = XFS_BTNUM_FINO;
+		error = xfs_growfs_init_aghdr(mp, id, xfs_btroot_init,
+					   &xfs_inobt_buf_ops);
+		if (error)
 			goto out_error;
-		}
-
-		xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO, 0, 0, agno, 0);
-		xfs_buf_delwri_queue(bp, buffer_list);
-		xfs_buf_relse(bp);
 	}
 
 	/*
 	 * refcount btree root block
 	 */
 	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
-		bp = xfs_growfs_get_hdr_buf(mp,
-			XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)),
-			BTOBB(mp->m_sb.sb_blocksize), 0,
-			&xfs_refcountbt_buf_ops);
-		if (!bp) {
-			error = -ENOMEM;
+		id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp));
+		id->numblks = BTOBB(mp->m_sb.sb_blocksize);
+		id->type = XFS_BTNUM_REFC;
+		error = xfs_growfs_init_aghdr(mp, id, xfs_btroot_init,
+					   &xfs_refcountbt_buf_ops);
+		if (error)
 			goto out_error;
-		}
-
-		xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC, 0, 0, agno, 0);
-		xfs_buf_delwri_queue(bp, buffer_list);
-		xfs_buf_relse(bp);
 	}
 
 out_error:
@@ -384,7 +421,6 @@ xfs_growfs_data_private(
 	xfs_agf_t		*agf;
 	xfs_agi_t		*agi;
 	xfs_agnumber_t		agno;
-	xfs_extlen_t		agsize;
 	xfs_buf_t		*bp;
 	int			dpct;
 	int			error, saved_error = 0;
@@ -392,11 +428,11 @@ xfs_growfs_data_private(
 	xfs_agnumber_t		nagimax = 0;
 	xfs_rfsblock_t		nb, nb_mod;
 	xfs_rfsblock_t		new;
-	xfs_rfsblock_t		nfree;
 	xfs_agnumber_t		oagcount;
 	int			pct;
 	xfs_trans_t		*tp;
 	LIST_HEAD		(buffer_list);
+	struct aghdr_init_data	id = {};
 
 	nb = in->newblocks;
 	pct = in->imaxpct;
@@ -448,27 +484,28 @@ xfs_growfs_data_private(
 	 * list to write, we can cancel the entire list without having written
 	 * anything.
 	 */
-	nfree = 0;
-	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+	INIT_LIST_HEAD(&id.buffer_list);
+	for (id.agno = nagcount - 1;
+	     id.agno >= oagcount;
+	     id.agno--, new -= id.agsize) {
 
-		if (agno == nagcount - 1)
-			agsize = nb -
-				(agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+		if (id.agno == nagcount - 1)
+			id.agsize = nb -
+				(id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
 		else
-			agsize = mp->m_sb.sb_agblocks;
+			id.agsize = mp->m_sb.sb_agblocks;
 
-		error = xfs_grow_ag_headers(mp, agno, agsize, &nfree,
-					    &buffer_list);
+		error = xfs_grow_ag_headers(mp, &id);
 		if (error) {
-			xfs_buf_delwri_cancel(&buffer_list);
+			xfs_buf_delwri_cancel(&id.buffer_list);
 			goto error0;
 		}
 	}
-	error = xfs_buf_delwri_submit(&buffer_list);
+	error = xfs_buf_delwri_submit(&id.buffer_list);
 	if (error)
 		goto error0;
 
-	xfs_trans_agblocks_delta(tp, nfree);
+	xfs_trans_agblocks_delta(tp, id.nfree);
 
 	/*
 	 * There are new blocks in the old last a.g.
@@ -479,7 +516,7 @@ xfs_growfs_data_private(
 		/*
 		 * Change the agi length.
 		 */
-		error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+		error = xfs_ialloc_read_agi(mp, tp, id.agno, &bp);
 		if (error) {
 			goto error0;
 		}
@@ -492,7 +529,7 @@ xfs_growfs_data_private(
 		/*
 		 * Change agf length.
 		 */
-		error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
+		error = xfs_alloc_read_agf(mp, tp, id.agno, 0, &bp);
 		if (error) {
 			goto error0;
 		}
@@ -511,13 +548,13 @@ xfs_growfs_data_private(
 		 * this doesn't actually exist in the rmap btree.
 		 */
 		xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
-		error = xfs_rmap_free(tp, bp, agno,
+		error = xfs_rmap_free(tp, bp, id.agno,
 				be32_to_cpu(agf->agf_length) - new,
 				new, &oinfo);
 		if (error)
 			goto error0;
 		error = xfs_free_extent(tp,
-				XFS_AGB_TO_FSB(mp, agno,
+				XFS_AGB_TO_FSB(mp, id.agno,
 					be32_to_cpu(agf->agf_length) - new),
 				new, &oinfo, XFS_AG_RESV_NONE);
 		if (error)
@@ -534,8 +571,8 @@ xfs_growfs_data_private(
 	if (nb > mp->m_sb.sb_dblocks)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
 				 nb - mp->m_sb.sb_dblocks);
-	if (nfree)
-		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
+	if (id.nfree)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
 	if (dpct)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
 	xfs_trans_set_sync(tp);
@@ -562,7 +599,7 @@ xfs_growfs_data_private(
 	if (new) {
 		struct xfs_perag	*pag;
 
-		pag = xfs_perag_get(mp, agno);
+		pag = xfs_perag_get(mp, id.agno);
 		error = xfs_ag_resv_free(pag);
 		xfs_perag_put(pag);
 		if (error)

From 532ff647d8303049e3913bf21e467b3bcc7f0bce Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:06 -0700
Subject: [PATCH 080/121] xfs: turn ag header initialisation into a table
 driven operation

There's still more cookie cutter code in setting up each AG header.
Separate all the variables into a simple structure and iterate a
table of header definitions to initialise everything.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_fsops.c | 182 +++++++++++++++++++++++----------------------
 1 file changed, 92 insertions(+), 90 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ec3c51ee8134..fb283d2cf42b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -282,12 +282,13 @@ xfs_agiblock_init(
 		agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
 }
 
+typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
+				  struct aghdr_init_data *id);
 static int
 xfs_growfs_init_aghdr(
 	struct xfs_mount	*mp,
 	struct aghdr_init_data	*id,
-	void			(*work)(struct xfs_mount *, struct xfs_buf *,
-					struct aghdr_init_data *),
+	aghdr_init_work_f	work,
 	const struct xfs_buf_ops *ops)
 
 {
@@ -304,6 +305,15 @@ xfs_growfs_init_aghdr(
 	return 0;
 }
 
+struct xfs_aghdr_grow_data {
+	xfs_daddr_t		daddr;
+	size_t			numblks;
+	const struct xfs_buf_ops *ops;
+	aghdr_init_work_f	work;
+	xfs_btnum_t		type;
+	bool			need_init;
+};
+
 /*
  * Write new AG headers to disk. Non-transactional, but written
  * synchronously so they are completed prior to the growfs transaction
@@ -315,101 +325,93 @@ xfs_grow_ag_headers(
 	struct aghdr_init_data	*id)
 
 {
+	struct xfs_aghdr_grow_data aghdr_data[] = {
+	{ /* AGF */
+		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)),
+		.numblks = XFS_FSS_TO_BB(mp, 1),
+		.ops = &xfs_agf_buf_ops,
+		.work = &xfs_agfblock_init,
+		.need_init = true
+	},
+	{ /* AGFL */
+		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)),
+		.numblks = XFS_FSS_TO_BB(mp, 1),
+		.ops = &xfs_agfl_buf_ops,
+		.work = &xfs_agflblock_init,
+		.need_init = true
+	},
+	{ /* AGI */
+		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)),
+		.numblks = XFS_FSS_TO_BB(mp, 1),
+		.ops = &xfs_agi_buf_ops,
+		.work = &xfs_agiblock_init,
+		.need_init = true
+	},
+	{ /* BNO root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_allocbt_buf_ops,
+		.work = &xfs_bnoroot_init,
+		.need_init = true
+	},
+	{ /* CNT root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_allocbt_buf_ops,
+		.work = &xfs_cntroot_init,
+		.need_init = true
+	},
+	{ /* INO root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_inobt_buf_ops,
+		.work = &xfs_btroot_init,
+		.type = XFS_BTNUM_INO,
+		.need_init = true
+	},
+	{ /* FINO root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_inobt_buf_ops,
+		.work = &xfs_btroot_init,
+		.type = XFS_BTNUM_FINO,
+		.need_init =  xfs_sb_version_hasfinobt(&mp->m_sb)
+	},
+	{ /* RMAP root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_rmapbt_buf_ops,
+		.work = &xfs_rmaproot_init,
+		.need_init = xfs_sb_version_hasrmapbt(&mp->m_sb)
+	},
+	{ /* REFC root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_refcountbt_buf_ops,
+		.work = &xfs_btroot_init,
+		.type = XFS_BTNUM_REFC,
+		.need_init = xfs_sb_version_hasreflink(&mp->m_sb)
+	},
+	{ /* NULL terminating block */
+		.daddr = XFS_BUF_DADDR_NULL,
+	}
+	};
+	struct  xfs_aghdr_grow_data *dp;
 	int			error = 0;
 
 	/* Account for AG free space in new AG */
 	id->nfree += id->agsize - mp->m_ag_prealloc_blocks;
+	for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) {
+		if (!dp->need_init)
+			continue;
 
-	/* AG freespace header block */
-	id->daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp));
-	id->numblks = XFS_FSS_TO_BB(mp, 1);
-	error = xfs_growfs_init_aghdr(mp, id, xfs_agfblock_init,
-					&xfs_agf_buf_ops);
-	if (error)
-		goto out_error;
-
-	/* AG freelist header block */
-	id->daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp));
-	id->numblks = XFS_FSS_TO_BB(mp, 1);
-	error = xfs_growfs_init_aghdr(mp, id, xfs_agflblock_init,
-					&xfs_agfl_buf_ops);
-	if (error)
-		goto out_error;
-
-	/* AG inode header block */
-	id->daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp));
-	id->numblks = XFS_FSS_TO_BB(mp, 1);
-	error = xfs_growfs_init_aghdr(mp, id, xfs_agiblock_init,
-					&xfs_agi_buf_ops);
-	if (error)
-		goto out_error;
-
-
-	/* BNO btree root block */
-	id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp));
-	id->numblks = BTOBB(mp->m_sb.sb_blocksize);
-	error = xfs_growfs_init_aghdr(mp, id, xfs_bnoroot_init,
-				   &xfs_allocbt_buf_ops);
-	if (error)
-		goto out_error;
-
-
-	/* CNT btree root block */
-	id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp));
-	id->numblks = BTOBB(mp->m_sb.sb_blocksize);
-	error = xfs_growfs_init_aghdr(mp, id, xfs_cntroot_init,
-				   &xfs_allocbt_buf_ops);
-	if (error)
-		goto out_error;
-
-	/* RMAP btree root block */
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
-		id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp));
-		id->numblks = BTOBB(mp->m_sb.sb_blocksize);
-		error = xfs_growfs_init_aghdr(mp, id, xfs_rmaproot_init,
-					   &xfs_rmapbt_buf_ops);
+		id->daddr = dp->daddr;
+		id->numblks = dp->numblks;
+		id->type = dp->type;
+		error = xfs_growfs_init_aghdr(mp, id, dp->work, dp->ops);
 		if (error)
-			goto out_error;
-
+			break;
 	}
-
-	/* INO btree root block */
-	id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp));
-	id->numblks = BTOBB(mp->m_sb.sb_blocksize);
-	id->type = XFS_BTNUM_INO;
-	error = xfs_growfs_init_aghdr(mp, id, xfs_btroot_init,
-				   &xfs_inobt_buf_ops);
-	if (error)
-		goto out_error;
-
-
-	/*
-	 * FINO btree root block
-	 */
-	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
-		id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp));
-		id->numblks = BTOBB(mp->m_sb.sb_blocksize);
-		id->type = XFS_BTNUM_FINO;
-		error = xfs_growfs_init_aghdr(mp, id, xfs_btroot_init,
-					   &xfs_inobt_buf_ops);
-		if (error)
-			goto out_error;
-	}
-
-	/*
-	 * refcount btree root block
-	 */
-	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
-		id->daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp));
-		id->numblks = BTOBB(mp->m_sb.sb_blocksize);
-		id->type = XFS_BTNUM_REFC;
-		error = xfs_growfs_init_aghdr(mp, id, xfs_btroot_init,
-					   &xfs_refcountbt_buf_ops);
-		if (error)
-			goto out_error;
-	}
-
-out_error:
 	return error;
 }
 

From 87444b8c267a5c607931eb5b5e65611f4972672b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:07 -0700
Subject: [PATCH 081/121] xfs: make imaxpct changes in growfs separate

When growfs changes the imaxpct value of the filesystem, it runs
through all the "change size" growfs code, whether it needs to or
not. Separate out changing imaxpct into it's own function and
transaction to simplify the rest of the growfs code.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_fsops.c | 69 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 50 insertions(+), 19 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fb283d2cf42b..4e66d390e2a4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -424,25 +424,21 @@ xfs_growfs_data_private(
 	xfs_agi_t		*agi;
 	xfs_agnumber_t		agno;
 	xfs_buf_t		*bp;
-	int			dpct;
 	int			error, saved_error = 0;
 	xfs_agnumber_t		nagcount;
 	xfs_agnumber_t		nagimax = 0;
 	xfs_rfsblock_t		nb, nb_mod;
 	xfs_rfsblock_t		new;
 	xfs_agnumber_t		oagcount;
-	int			pct;
 	xfs_trans_t		*tp;
 	LIST_HEAD		(buffer_list);
 	struct aghdr_init_data	id = {};
 
 	nb = in->newblocks;
-	pct = in->imaxpct;
-	if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+	if (nb < mp->m_sb.sb_dblocks)
 		return -EINVAL;
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
-	dpct = pct - mp->m_sb.sb_imax_pct;
 	error = xfs_buf_read_uncached(mp->m_ddev_targp,
 				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
 				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
@@ -575,8 +571,6 @@ xfs_growfs_data_private(
 				 nb - mp->m_sb.sb_dblocks);
 	if (id.nfree)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
-	if (dpct)
-		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
 	xfs_trans_set_sync(tp);
 	error = xfs_trans_commit(tp);
 	if (error)
@@ -585,12 +579,6 @@ xfs_growfs_data_private(
 	/* New allocation groups fully initialized, so update mount struct */
 	if (nagimax)
 		mp->m_maxagi = nagimax;
-	if (mp->m_sb.sb_imax_pct) {
-		uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
-		do_div(icount, 100);
-		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
-	} else
-		mp->m_maxicount = 0;
 	xfs_set_low_space_thresholds(mp);
 	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 
@@ -694,25 +682,68 @@ xfs_growfs_log_private(
 	return -ENOSYS;
 }
 
+static int
+xfs_growfs_imaxpct(
+	struct xfs_mount	*mp,
+	__u32			imaxpct)
+{
+	struct xfs_trans	*tp;
+	int64_t			dpct;
+	int			error;
+
+	if (imaxpct > 100)
+		return -EINVAL;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
+
+	dpct = (int64_t)imaxpct - mp->m_sb.sb_imax_pct;
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+	xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp);
+}
+
 /*
  * protected versions of growfs function acquire and release locks on the mount
  * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
  * XFS_IOC_FSGROWFSRT
  */
-
-
 int
 xfs_growfs_data(
-	xfs_mount_t		*mp,
-	xfs_growfs_data_t	*in)
+	struct xfs_mount	*mp,
+	struct xfs_growfs_data	*in)
 {
-	int error;
+	int			error = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (!mutex_trylock(&mp->m_growlock))
 		return -EWOULDBLOCK;
-	error = xfs_growfs_data_private(mp, in);
+
+	/* update imaxpct separately to the physical grow of the filesystem */
+	if (in->imaxpct != mp->m_sb.sb_imax_pct) {
+		error = xfs_growfs_imaxpct(mp, in->imaxpct);
+		if (error)
+			goto out_error;
+	}
+
+	if (in->newblocks != mp->m_sb.sb_dblocks) {
+		error = xfs_growfs_data_private(mp, in);
+		if (error)
+			goto out_error;
+	}
+
+	/* Post growfs calculations needed to reflect new state in operations */
+	if (mp->m_sb.sb_imax_pct) {
+		uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+		do_div(icount, 100);
+		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+	} else
+		mp->m_maxicount = 0;
+
+out_error:
 	/*
 	 * Increment the generation unconditionally, the error could be from
 	 * updating the secondary superblocks, in which case the new size

From 83a7f86e39ff5d60ba2ea2e24879e136d8e9a114 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:07 -0700
Subject: [PATCH 082/121] xfs: separate secondary sb update in growfs

This happens after all the transactions to update the superblock
occur, and errors need to be handled slightly differently. Seperate
out the code into it's own function, and clean up the error goto
stack in the core growfs code as it is now much simpler.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_fsops.c | 159 ++++++++++++++++++++++++++-------------------
 1 file changed, 92 insertions(+), 67 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 4e66d390e2a4..a718bb002cf4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -422,9 +422,8 @@ xfs_growfs_data_private(
 {
 	xfs_agf_t		*agf;
 	xfs_agi_t		*agi;
-	xfs_agnumber_t		agno;
 	xfs_buf_t		*bp;
-	int			error, saved_error = 0;
+	int			error;
 	xfs_agnumber_t		nagcount;
 	xfs_agnumber_t		nagimax = 0;
 	xfs_rfsblock_t		nb, nb_mod;
@@ -496,12 +495,12 @@ xfs_growfs_data_private(
 		error = xfs_grow_ag_headers(mp, &id);
 		if (error) {
 			xfs_buf_delwri_cancel(&id.buffer_list);
-			goto error0;
+			goto out_trans_cancel;
 		}
 	}
 	error = xfs_buf_delwri_submit(&id.buffer_list);
 	if (error)
-		goto error0;
+		goto out_trans_cancel;
 
 	xfs_trans_agblocks_delta(tp, id.nfree);
 
@@ -515,22 +514,23 @@ xfs_growfs_data_private(
 		 * Change the agi length.
 		 */
 		error = xfs_ialloc_read_agi(mp, tp, id.agno, &bp);
-		if (error) {
-			goto error0;
-		}
+		if (error)
+			goto out_trans_cancel;
+
 		ASSERT(bp);
 		agi = XFS_BUF_TO_AGI(bp);
 		be32_add_cpu(&agi->agi_length, new);
 		ASSERT(nagcount == oagcount ||
 		       be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
 		xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+
 		/*
 		 * Change agf length.
 		 */
 		error = xfs_alloc_read_agf(mp, tp, id.agno, 0, &bp);
-		if (error) {
-			goto error0;
-		}
+		if (error)
+			goto out_trans_cancel;
+
 		ASSERT(bp);
 		agf = XFS_BUF_TO_AGF(bp);
 		be32_add_cpu(&agf->agf_length, new);
@@ -550,13 +550,13 @@ xfs_growfs_data_private(
 				be32_to_cpu(agf->agf_length) - new,
 				new, &oinfo);
 		if (error)
-			goto error0;
+			goto out_trans_cancel;
 		error = xfs_free_extent(tp,
 				XFS_AGB_TO_FSB(mp, id.agno,
 					be32_to_cpu(agf->agf_length) - new),
 				new, &oinfo, XFS_AG_RESV_NONE);
 		if (error)
-			goto error0;
+			goto out_trans_cancel;
 	}
 
 	/*
@@ -593,16 +593,86 @@ xfs_growfs_data_private(
 		error = xfs_ag_resv_free(pag);
 		xfs_perag_put(pag);
 		if (error)
-			goto out;
+			return error;
 	}
 
-	/* Reserve AG metadata blocks. */
+	/*
+	 * Reserve AG metadata blocks. ENOSPC here does not mean there was a
+	 * growfs failure, just that there still isn't space for new user data
+	 * after the grow has been run.
+	 */
 	error = xfs_fs_reserve_ag_blocks(mp);
-	if (error && error != -ENOSPC)
-		goto out;
+	if (error == -ENOSPC)
+		error = 0;
+	return error;
+
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+	return error;
+}
+
+static int
+xfs_growfs_log_private(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_growfs_log_t	*in)	/* growfs log input struct */
+{
+	xfs_extlen_t		nb;
+
+	nb = in->newblocks;
+	if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
+		return -EINVAL;
+	if (nb == mp->m_sb.sb_logblocks &&
+	    in->isint == (mp->m_sb.sb_logstart != 0))
+		return -EINVAL;
+	/*
+	 * Moving the log is hard, need new interfaces to sync
+	 * the log first, hold off all activity while moving it.
+	 * Can have shorter or longer log in the same space,
+	 * or transform internal to external log or vice versa.
+	 */
+	return -ENOSYS;
+}
+
+static int
+xfs_growfs_imaxpct(
+	struct xfs_mount	*mp,
+	__u32			imaxpct)
+{
+	struct xfs_trans	*tp;
+	int			dpct;
+	int			error;
+
+	if (imaxpct > 100)
+		return -EINVAL;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
+
+	dpct = imaxpct - mp->m_sb.sb_imax_pct;
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+	xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp);
+}
+
+/*
+ * After a grow operation, we need to update all the secondary superblocks
+ * to match the new state of the primary. Read/init the superblocks and update
+ * them appropriately.
+ */
+static int
+xfs_growfs_update_superblocks(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		oagcount)
+{
+	struct xfs_buf		*bp;
+	xfs_agnumber_t		agno;
+	int			saved_error = 0;
+	int			error = 0;
 
 	/* update secondary superblocks. */
-	for (agno = 1; agno < nagcount; agno++) {
+	for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
 		error = 0;
 		/*
 		 * new secondary superblocks need to be zeroed, not read from
@@ -652,57 +722,7 @@ xfs_growfs_data_private(
 		}
 	}
 
- out:
 	return saved_error ? saved_error : error;
-
- error0:
-	xfs_trans_cancel(tp);
-	return error;
-}
-
-static int
-xfs_growfs_log_private(
-	xfs_mount_t		*mp,	/* mount point for filesystem */
-	xfs_growfs_log_t	*in)	/* growfs log input struct */
-{
-	xfs_extlen_t		nb;
-
-	nb = in->newblocks;
-	if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
-		return -EINVAL;
-	if (nb == mp->m_sb.sb_logblocks &&
-	    in->isint == (mp->m_sb.sb_logstart != 0))
-		return -EINVAL;
-	/*
-	 * Moving the log is hard, need new interfaces to sync
-	 * the log first, hold off all activity while moving it.
-	 * Can have shorter or longer log in the same space,
-	 * or transform internal to external log or vice versa.
-	 */
-	return -ENOSYS;
-}
-
-static int
-xfs_growfs_imaxpct(
-	struct xfs_mount	*mp,
-	__u32			imaxpct)
-{
-	struct xfs_trans	*tp;
-	int64_t			dpct;
-	int			error;
-
-	if (imaxpct > 100)
-		return -EINVAL;
-
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
-			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
-	if (error)
-		return error;
-
-	dpct = (int64_t)imaxpct - mp->m_sb.sb_imax_pct;
-	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
-	xfs_trans_set_sync(tp);
-	return xfs_trans_commit(tp);
 }
 
 /*
@@ -715,6 +735,7 @@ xfs_growfs_data(
 	struct xfs_mount	*mp,
 	struct xfs_growfs_data	*in)
 {
+	xfs_agnumber_t		oagcount;
 	int			error = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -729,6 +750,7 @@ xfs_growfs_data(
 			goto out_error;
 	}
 
+	oagcount = mp->m_sb.sb_agcount;
 	if (in->newblocks != mp->m_sb.sb_dblocks) {
 		error = xfs_growfs_data_private(mp, in);
 		if (error)
@@ -743,6 +765,9 @@ xfs_growfs_data(
 	} else
 		mp->m_maxicount = 0;
 
+	/* Update secondary superblocks now the physical grow has completed */
+	error = xfs_growfs_update_superblocks(mp, oagcount);
+
 out_error:
 	/*
 	 * Increment the generation unconditionally, the error could be from

From 8125147288db7008c872a946210028bbf6c0af86 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:08 -0700
Subject: [PATCH 083/121] xfs: rework secondary superblock updates in growfs

Right now we wait until we've committed changes to the primary
superblock before we initialise any of the new secondary
superblocks. This means that if we have any write errors for new
secondary superblocks we end up with garbage in place rather than
zeros or even an "in progress" superblock to indicate a grow
operation is being done.

To ensure we can write the secondary superblocks, initialise them
earlier in the same loop that initialises the AG headers. We stamp
the new secondary superblocks here with the old geometry, but set
the "sb_inprogress" field to indicate that updates are being done to
the superblock so they cannot be used.  This will result in the
secondary superblock fields being updated or triggering errors that
will abort the grow before we commit any permanent changes.

This also means we can change the update mechanism of the secondary
superblocks.  We know that we are going to wholly overwrite the
information in the struct xfs_sb in the buffer, so there's no point
reading it from disk. Just allocate an uncached buffer, zero it in
memory, stamp the new superblock structure in it and write it out.
If we fail to write it out, then we'll leave the existing sb (old or
new w/ inprogress) on disk for repair to deal with later.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_fsops.c | 102 +++++++++++++++++++++++++++++----------------
 1 file changed, 65 insertions(+), 37 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a718bb002cf4..ca3cf3f6239a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -191,6 +191,25 @@ xfs_rmaproot_init(
 	}
 }
 
+/*
+ * Initialise new secondary superblocks with the pre-grow geometry, but mark
+ * them as "in progress" so we know they haven't yet been activated. This will
+ * get cleared when the update with the new geometry information is done after
+ * changes to the primary are committed. This isn't strictly necessary, but we
+ * get it for free with the delayed buffer write lists and it means we can tell
+ * if a grow operation didn't complete properly after the fact.
+ */
+static void
+xfs_sbblock_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_dsb		*dsb = XFS_BUF_TO_SBP(bp);
+
+	xfs_sb_to_disk(dsb, &mp->m_sb);
+	dsb->sb_inprogress = 1;
+}
 
 static void
 xfs_agfblock_init(
@@ -326,6 +345,13 @@ xfs_grow_ag_headers(
 
 {
 	struct xfs_aghdr_grow_data aghdr_data[] = {
+	{ /* SB */
+		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR),
+		.numblks = XFS_FSS_TO_BB(mp, 1),
+		.ops = &xfs_sb_buf_ops,
+		.work = &xfs_sbblock_init,
+		.need_init = true
+	},
 	{ /* AGF */
 		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)),
 		.numblks = XFS_FSS_TO_BB(mp, 1),
@@ -658,43 +684,30 @@ xfs_growfs_imaxpct(
 
 /*
  * After a grow operation, we need to update all the secondary superblocks
- * to match the new state of the primary. Read/init the superblocks and update
- * them appropriately.
+ * to match the new state of the primary. Because we are completely overwriting
+ * all the existing fields in the secondary superblock buffers, there is no need
+ * to read them in from disk. Just get a new buffer, stamp it and write it.
+ *
+ * The sb buffers need to be cached here so that we serialise against scrub
+ * scanning secondary superblocks, but we don't want to keep it in memory once
+ * it is written so we mark it as a one-shot buffer.
  */
 static int
 xfs_growfs_update_superblocks(
-	struct xfs_mount	*mp,
-	xfs_agnumber_t		oagcount)
+	struct xfs_mount	*mp)
 {
-	struct xfs_buf		*bp;
 	xfs_agnumber_t		agno;
 	int			saved_error = 0;
 	int			error = 0;
+	LIST_HEAD		(buffer_list);
 
 	/* update secondary superblocks. */
 	for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
-		error = 0;
-		/*
-		 * new secondary superblocks need to be zeroed, not read from
-		 * disk as the contents of the new area we are growing into is
-		 * completely unknown.
-		 */
-		if (agno < oagcount) {
-			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0, &bp,
-				  &xfs_sb_buf_ops);
-		} else {
-			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
-				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0);
-			if (bp) {
-				bp->b_ops = &xfs_sb_buf_ops;
-				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-			} else
-				error = -ENOMEM;
-		}
+		struct xfs_buf		*bp;
 
+		bp = xfs_buf_get(mp->m_ddev_targp,
+				 XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
+				 XFS_FSS_TO_BB(mp, 1), 0);
 		/*
 		 * If we get an error reading or writing alternate superblocks,
 		 * continue.  xfs_repair chooses the "best" superblock based
@@ -702,25 +715,42 @@ xfs_growfs_update_superblocks(
 		 * superblocks un-updated than updated, and xfs_repair may
 		 * pick them over the properly-updated primary.
 		 */
-		if (error) {
+		if (!bp) {
 			xfs_warn(mp,
-		"error %d reading secondary superblock for ag %d",
-				error, agno);
-			saved_error = error;
+		"error allocating secondary superblock for ag %d",
+				agno);
+			if (!saved_error)
+				saved_error = -ENOMEM;
 			continue;
 		}
-		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
 
-		error = xfs_bwrite(bp);
+		bp->b_ops = &xfs_sb_buf_ops;
+		xfs_buf_oneshot(bp);
+		xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+		xfs_buf_delwri_queue(bp, &buffer_list);
 		xfs_buf_relse(bp);
+
+		/* don't hold too many buffers at once */
+		if (agno % 16)
+			continue;
+
+		error = xfs_buf_delwri_submit(&buffer_list);
 		if (error) {
 			xfs_warn(mp,
-		"write error %d updating secondary superblock for ag %d",
+		"write error %d updating a secondary superblock near ag %d",
 				error, agno);
-			saved_error = error;
+			if (!saved_error)
+				saved_error = error;
 			continue;
 		}
 	}
+	error = xfs_buf_delwri_submit(&buffer_list);
+	if (error) {
+		xfs_warn(mp,
+		"write error %d updating a secondary superblock near ag %d",
+			error, agno);
+	}
 
 	return saved_error ? saved_error : error;
 }
@@ -735,7 +765,6 @@ xfs_growfs_data(
 	struct xfs_mount	*mp,
 	struct xfs_growfs_data	*in)
 {
-	xfs_agnumber_t		oagcount;
 	int			error = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -750,7 +779,6 @@ xfs_growfs_data(
 			goto out_error;
 	}
 
-	oagcount = mp->m_sb.sb_agcount;
 	if (in->newblocks != mp->m_sb.sb_dblocks) {
 		error = xfs_growfs_data_private(mp, in);
 		if (error)
@@ -766,7 +794,7 @@ xfs_growfs_data(
 		mp->m_maxicount = 0;
 
 	/* Update secondary superblocks now the physical grow has completed */
-	error = xfs_growfs_update_superblocks(mp, oagcount);
+	error = xfs_growfs_update_superblocks(mp);
 
 out_error:
 	/*

From b16817b66b6c97d2a812d663d26faed40079892a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:08 -0700
Subject: [PATCH 084/121] xfs: move growfs core to libxfs

So it can be shared with userspace (e.g. mkfs) easily.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/Makefile        |   1 +
 fs/xfs/libxfs/xfs_ag.c | 404 ++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_ag.h |  25 +++
 fs/xfs/libxfs/xfs_sb.c |  73 +++++++
 fs/xfs/libxfs/xfs_sb.h |   9 +
 fs/xfs/xfs_fsops.c     | 479 +----------------------------------------
 6 files changed, 516 insertions(+), 475 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_ag.c
 create mode 100644 fs/xfs/libxfs/xfs_ag.h

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 57de3f4bb2c7..d73444528d9b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -28,6 +28,7 @@ xfs-y				+= xfs_trace.o
 
 # build the libxfs code first
 xfs-y				+= $(addprefix libxfs/, \
+				   xfs_ag.o \
 				   xfs_alloc.o \
 				   xfs_alloc_btree.o \
 				   xfs_attr.o \
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
new file mode 100644
index 000000000000..5baa22c07095
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -0,0 +1,404 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+static struct xfs_buf *
+xfs_get_aghdr_buf(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	int			flags,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+
+	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+	if (!bp)
+		return NULL;
+
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	bp->b_bn = blkno;
+	bp->b_maps[0].bm_bn = blkno;
+	bp->b_ops = ops;
+
+	return bp;
+}
+
+/*
+ * Generic btree root block init function
+ */
+static void
+xfs_btroot_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0);
+}
+
+/*
+ * Alloc btree root block init functions
+ */
+static void
+xfs_bnoroot_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_alloc_rec	*arec;
+
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0);
+	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+	arec->ar_blockcount = cpu_to_be32(id->agsize -
+					  be32_to_cpu(arec->ar_startblock));
+}
+
+static void
+xfs_cntroot_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_alloc_rec	*arec;
+
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0);
+	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+	arec->ar_blockcount = cpu_to_be32(id->agsize -
+					  be32_to_cpu(arec->ar_startblock));
+}
+
+/*
+ * Reverse map root block init
+ */
+static void
+xfs_rmaproot_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_rmap_rec	*rrec;
+
+	xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0);
+
+	/*
+	 * mark the AG header regions as static metadata The BNO
+	 * btree block is the first block after the headers, so
+	 * it's location defines the size of region the static
+	 * metadata consumes.
+	 *
+	 * Note: unlike mkfs, we never have to account for log
+	 * space when growing the data regions
+	 */
+	rrec = XFS_RMAP_REC_ADDR(block, 1);
+	rrec->rm_startblock = 0;
+	rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
+	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
+	rrec->rm_offset = 0;
+
+	/* account freespace btree root blocks */
+	rrec = XFS_RMAP_REC_ADDR(block, 2);
+	rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
+	rrec->rm_blockcount = cpu_to_be32(2);
+	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+	rrec->rm_offset = 0;
+
+	/* account inode btree root blocks */
+	rrec = XFS_RMAP_REC_ADDR(block, 3);
+	rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
+	rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
+					  XFS_IBT_BLOCK(mp));
+	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
+	rrec->rm_offset = 0;
+
+	/* account for rmap btree root */
+	rrec = XFS_RMAP_REC_ADDR(block, 4);
+	rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+	rrec->rm_blockcount = cpu_to_be32(1);
+	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+	rrec->rm_offset = 0;
+
+	/* account for refc btree root */
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		rrec = XFS_RMAP_REC_ADDR(block, 5);
+		rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
+		rrec->rm_blockcount = cpu_to_be32(1);
+		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+		rrec->rm_offset = 0;
+		be16_add_cpu(&block->bb_numrecs, 1);
+	}
+}
+
+/*
+ * Initialise new secondary superblocks with the pre-grow geometry, but mark
+ * them as "in progress" so we know they haven't yet been activated. This will
+ * get cleared when the update with the new geometry information is done after
+ * changes to the primary are committed. This isn't strictly necessary, but we
+ * get it for free with the delayed buffer write lists and it means we can tell
+ * if a grow operation didn't complete properly after the fact.
+ */
+static void
+xfs_sbblock_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_dsb		*dsb = XFS_BUF_TO_SBP(bp);
+
+	xfs_sb_to_disk(dsb, &mp->m_sb);
+	dsb->sb_inprogress = 1;
+}
+
+static void
+xfs_agfblock_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(bp);
+	xfs_extlen_t		tmpsize;
+
+	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+	agf->agf_seqno = cpu_to_be32(id->agno);
+	agf->agf_length = cpu_to_be32(id->agsize);
+	agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
+	agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
+	agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+	agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		agf->agf_roots[XFS_BTNUM_RMAPi] =
+					cpu_to_be32(XFS_RMAP_BLOCK(mp));
+		agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+		agf->agf_rmap_blocks = cpu_to_be32(1);
+	}
+
+	agf->agf_flfirst = cpu_to_be32(1);
+	agf->agf_fllast = 0;
+	agf->agf_flcount = 0;
+	tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
+	agf->agf_freeblks = cpu_to_be32(tmpsize);
+	agf->agf_longest = cpu_to_be32(tmpsize);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+		agf->agf_refcount_root = cpu_to_be32(
+				xfs_refc_block(mp));
+		agf->agf_refcount_level = cpu_to_be32(1);
+		agf->agf_refcount_blocks = cpu_to_be32(1);
+	}
+}
+
+static void
+xfs_agflblock_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_agfl		*agfl = XFS_BUF_TO_AGFL(bp);
+	__be32			*agfl_bno;
+	int			bucket;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+		agfl->agfl_seqno = cpu_to_be32(id->agno);
+		uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
+	}
+
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+	for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
+		agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+}
+
+static void
+xfs_agiblock_init(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct aghdr_init_data	*id)
+{
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(bp);
+	int			bucket;
+
+	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+	agi->agi_seqno = cpu_to_be32(id->agno);
+	agi->agi_length = cpu_to_be32(id->agsize);
+	agi->agi_count = 0;
+	agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
+	agi->agi_level = cpu_to_be32(1);
+	agi->agi_freecount = 0;
+	agi->agi_newino = cpu_to_be32(NULLAGINO);
+	agi->agi_dirino = cpu_to_be32(NULLAGINO);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
+	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+		agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+		agi->agi_free_level = cpu_to_be32(1);
+	}
+	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+		agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+}
+
+typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
+				  struct aghdr_init_data *id);
+static int
+xfs_ag_init_hdr(
+	struct xfs_mount	*mp,
+	struct aghdr_init_data	*id,
+	aghdr_init_work_f	work,
+	const struct xfs_buf_ops *ops)
+
+{
+	struct xfs_buf		*bp;
+
+	bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops);
+	if (!bp)
+		return -ENOMEM;
+
+	(*work)(mp, bp, id);
+
+	xfs_buf_delwri_queue(bp, &id->buffer_list);
+	xfs_buf_relse(bp);
+	return 0;
+}
+
+struct xfs_aghdr_grow_data {
+	xfs_daddr_t		daddr;
+	size_t			numblks;
+	const struct xfs_buf_ops *ops;
+	aghdr_init_work_f	work;
+	xfs_btnum_t		type;
+	bool			need_init;
+};
+
+/*
+ * Prepare new AG headers to be written to disk. We use uncached buffers here,
+ * as it is assumed these new AG headers are currently beyond the currently
+ * valid filesystem address space. Using cached buffers would trip over EOFS
+ * corruption detection alogrithms in the buffer cache lookup routines.
+ *
+ * This is a non-transactional function, but the prepared buffers are added to a
+ * delayed write buffer list supplied by the caller so they can submit them to
+ * disk and wait on them as required.
+ */
+int
+xfs_ag_init_headers(
+	struct xfs_mount	*mp,
+	struct aghdr_init_data	*id)
+
+{
+	struct xfs_aghdr_grow_data aghdr_data[] = {
+	{ /* SB */
+		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR),
+		.numblks = XFS_FSS_TO_BB(mp, 1),
+		.ops = &xfs_sb_buf_ops,
+		.work = &xfs_sbblock_init,
+		.need_init = true
+	},
+	{ /* AGF */
+		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)),
+		.numblks = XFS_FSS_TO_BB(mp, 1),
+		.ops = &xfs_agf_buf_ops,
+		.work = &xfs_agfblock_init,
+		.need_init = true
+	},
+	{ /* AGFL */
+		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)),
+		.numblks = XFS_FSS_TO_BB(mp, 1),
+		.ops = &xfs_agfl_buf_ops,
+		.work = &xfs_agflblock_init,
+		.need_init = true
+	},
+	{ /* AGI */
+		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)),
+		.numblks = XFS_FSS_TO_BB(mp, 1),
+		.ops = &xfs_agi_buf_ops,
+		.work = &xfs_agiblock_init,
+		.need_init = true
+	},
+	{ /* BNO root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_allocbt_buf_ops,
+		.work = &xfs_bnoroot_init,
+		.need_init = true
+	},
+	{ /* CNT root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_allocbt_buf_ops,
+		.work = &xfs_cntroot_init,
+		.need_init = true
+	},
+	{ /* INO root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_inobt_buf_ops,
+		.work = &xfs_btroot_init,
+		.type = XFS_BTNUM_INO,
+		.need_init = true
+	},
+	{ /* FINO root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_inobt_buf_ops,
+		.work = &xfs_btroot_init,
+		.type = XFS_BTNUM_FINO,
+		.need_init =  xfs_sb_version_hasfinobt(&mp->m_sb)
+	},
+	{ /* RMAP root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_rmapbt_buf_ops,
+		.work = &xfs_rmaproot_init,
+		.need_init = xfs_sb_version_hasrmapbt(&mp->m_sb)
+	},
+	{ /* REFC root block */
+		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
+		.numblks = BTOBB(mp->m_sb.sb_blocksize),
+		.ops = &xfs_refcountbt_buf_ops,
+		.work = &xfs_btroot_init,
+		.type = XFS_BTNUM_REFC,
+		.need_init = xfs_sb_version_hasreflink(&mp->m_sb)
+	},
+	{ /* NULL terminating block */
+		.daddr = XFS_BUF_DADDR_NULL,
+	}
+	};
+	struct  xfs_aghdr_grow_data *dp;
+	int			error = 0;
+
+	/* Account for AG free space in new AG */
+	id->nfree += id->agsize - mp->m_ag_prealloc_blocks;
+	for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) {
+		if (!dp->need_init)
+			continue;
+
+		id->daddr = dp->daddr;
+		id->numblks = dp->numblks;
+		id->type = dp->type;
+		error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
+		if (error)
+			break;
+	}
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
new file mode 100644
index 000000000000..69f2fd4be0ea
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __LIBXFS_AG_H
+#define __LIBXFS_AG_H 1
+
+struct aghdr_init_data {
+	/* per ag data */
+	xfs_agblock_t		agno;		/* ag to init */
+	xfs_extlen_t		agsize;		/* new AG size */
+	struct list_head	buffer_list;	/* buffer writeback list */
+	xfs_rfsblock_t		nfree;		/* cumulative new free space */
+
+	/* per header data */
+	xfs_daddr_t		daddr;		/* header location */
+	size_t			numblks;	/* size of header */
+	xfs_btnum_t		type;		/* type of btree root block */
+};
+
+int xfs_ag_init_headers( struct xfs_mount *mp, struct aghdr_init_data *id);
+
+#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index d9bef41a3f26..a35344ae4951 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -888,6 +888,79 @@ xfs_sync_sb(
 	return xfs_trans_commit(tp);
 }
 
+/*
+ * Update all the secondary superblocks to match the new state of the primary.
+ * Because we are completely overwriting all the existing fields in the
+ * secondary superblock buffers, there is no need to read them in from disk.
+ * Just get a new buffer, stamp it and write it.
+ *
+ * The sb buffers need to be cached here so that we serialise against other
+ * operations that access the secondary superblocks, but we don't want to keep
+ * them in memory once it is written so we mark it as a one-shot buffer.
+ */
+int
+xfs_update_secondary_sbs(
+	struct xfs_mount	*mp)
+{
+	xfs_agnumber_t		agno;
+	int			saved_error = 0;
+	int			error = 0;
+	LIST_HEAD		(buffer_list);
+
+	/* update secondary superblocks. */
+	for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
+		struct xfs_buf		*bp;
+
+		bp = xfs_buf_get(mp->m_ddev_targp,
+				 XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
+				 XFS_FSS_TO_BB(mp, 1), 0);
+		/*
+		 * If we get an error reading or writing alternate superblocks,
+		 * continue.  xfs_repair chooses the "best" superblock based
+		 * on most matches; if we break early, we'll leave more
+		 * superblocks un-updated than updated, and xfs_repair may
+		 * pick them over the properly-updated primary.
+		 */
+		if (!bp) {
+			xfs_warn(mp,
+		"error allocating secondary superblock for ag %d",
+				agno);
+			if (!saved_error)
+				saved_error = -ENOMEM;
+			continue;
+		}
+
+		bp->b_ops = &xfs_sb_buf_ops;
+		xfs_buf_oneshot(bp);
+		xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+		xfs_buf_delwri_queue(bp, &buffer_list);
+		xfs_buf_relse(bp);
+
+		/* don't hold too many buffers at once */
+		if (agno % 16)
+			continue;
+
+		error = xfs_buf_delwri_submit(&buffer_list);
+		if (error) {
+			xfs_warn(mp,
+		"write error %d updating a secondary superblock near ag %d",
+				error, agno);
+			if (!saved_error)
+				saved_error = error;
+			continue;
+		}
+	}
+	error = xfs_buf_delwri_submit(&buffer_list);
+	if (error) {
+		xfs_warn(mp,
+		"write error %d updating a secondary superblock near ag %d",
+			error, agno);
+	}
+
+	return saved_error ? saved_error : error;
+}
+
 int
 xfs_fs_geometry(
 	struct xfs_sb		*sbp,
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 5166d78b2c34..fc65d4e1433a 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -18,6 +18,13 @@
 #ifndef __XFS_SB_H__
 #define	__XFS_SB_H__
 
+struct xfs_mount;
+struct xfs_sb;
+struct xfs_dsb;
+struct xfs_trans;
+struct xfs_fsop_geom;
+struct xfs_perag;
+
 /*
  * perag get/put wrappers for ref counting
  */
@@ -34,6 +41,8 @@ extern void	xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
 extern void	xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
 extern void	xfs_sb_quota_from_disk(struct xfs_sb *sbp);
 
+extern int	xfs_update_secondary_sbs(struct xfs_mount *mp);
+
 #define XFS_FS_GEOM_MAX_STRUCT_VER	(4)
 extern int	xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
 				int struct_version);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ca3cf3f6239a..c5087b79bcea 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,11 +24,7 @@
 #include "xfs_sb.h"
 #include "xfs_mount.h"
 #include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
 #include "xfs_trans.h"
-#include "xfs_inode_item.h"
 #include "xfs_error.h"
 #include "xfs_btree.h"
 #include "xfs_alloc_btree.h"
@@ -36,411 +32,17 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_fsops.h"
-#include "xfs_itable.h"
 #include "xfs_trans_space.h"
 #include "xfs_rtalloc.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
-#include "xfs_filestream.h"
 #include "xfs_rmap.h"
+#include "xfs_ag.h"
 #include "xfs_ag_resv.h"
 
 /*
- * File system operations
+ * growfs operations
  */
-
-static struct xfs_buf *
-xfs_growfs_get_hdr_buf(
-	struct xfs_mount	*mp,
-	xfs_daddr_t		blkno,
-	size_t			numblks,
-	int			flags,
-	const struct xfs_buf_ops *ops)
-{
-	struct xfs_buf		*bp;
-
-	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
-	if (!bp)
-		return NULL;
-
-	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-	bp->b_bn = blkno;
-	bp->b_maps[0].bm_bn = blkno;
-	bp->b_ops = ops;
-
-	return bp;
-}
-
-struct aghdr_init_data {
-	/* per ag data */
-	xfs_agnumber_t		agno;		/* ag to init */
-	xfs_extlen_t		agsize;		/* new AG size */
-	struct list_head	buffer_list;	/* buffer writeback list */
-	xfs_rfsblock_t		nfree;		/* cumulative new free space */
-
-	/* per header data */
-	xfs_daddr_t		daddr;		/* header location */
-	size_t			numblks;	/* size of header */
-	xfs_btnum_t		type;		/* type of btree root block */
-};
-
-/*
- * Generic btree root block init function
- */
-static void
-xfs_btroot_init(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	struct aghdr_init_data	*id)
-{
-	xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0);
-}
-
-/*
- * Alloc btree root block init functions
- */
-static void
-xfs_bnoroot_init(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	struct aghdr_init_data	*id)
-{
-	struct xfs_alloc_rec	*arec;
-
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0);
-	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
-	arec->ar_blockcount = cpu_to_be32(id->agsize -
-					  be32_to_cpu(arec->ar_startblock));
-}
-
-static void
-xfs_cntroot_init(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	struct aghdr_init_data	*id)
-{
-	struct xfs_alloc_rec	*arec;
-
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0);
-	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
-	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
-	arec->ar_blockcount = cpu_to_be32(id->agsize -
-					  be32_to_cpu(arec->ar_startblock));
-}
-
-/*
- * Reverse map root block init
- */
-static void
-xfs_rmaproot_init(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	struct aghdr_init_data	*id)
-{
-	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-	struct xfs_rmap_rec	*rrec;
-
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0);
-
-	/*
-	 * mark the AG header regions as static metadata The BNO
-	 * btree block is the first block after the headers, so
-	 * it's location defines the size of region the static
-	 * metadata consumes.
-	 *
-	 * Note: unlike mkfs, we never have to account for log
-	 * space when growing the data regions
-	 */
-	rrec = XFS_RMAP_REC_ADDR(block, 1);
-	rrec->rm_startblock = 0;
-	rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
-	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
-	rrec->rm_offset = 0;
-
-	/* account freespace btree root blocks */
-	rrec = XFS_RMAP_REC_ADDR(block, 2);
-	rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
-	rrec->rm_blockcount = cpu_to_be32(2);
-	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
-	rrec->rm_offset = 0;
-
-	/* account inode btree root blocks */
-	rrec = XFS_RMAP_REC_ADDR(block, 3);
-	rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
-	rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
-					  XFS_IBT_BLOCK(mp));
-	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
-	rrec->rm_offset = 0;
-
-	/* account for rmap btree root */
-	rrec = XFS_RMAP_REC_ADDR(block, 4);
-	rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
-	rrec->rm_blockcount = cpu_to_be32(1);
-	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
-	rrec->rm_offset = 0;
-
-	/* account for refc btree root */
-	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
-		rrec = XFS_RMAP_REC_ADDR(block, 5);
-		rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
-		rrec->rm_blockcount = cpu_to_be32(1);
-		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
-		rrec->rm_offset = 0;
-		be16_add_cpu(&block->bb_numrecs, 1);
-	}
-}
-
-/*
- * Initialise new secondary superblocks with the pre-grow geometry, but mark
- * them as "in progress" so we know they haven't yet been activated. This will
- * get cleared when the update with the new geometry information is done after
- * changes to the primary are committed. This isn't strictly necessary, but we
- * get it for free with the delayed buffer write lists and it means we can tell
- * if a grow operation didn't complete properly after the fact.
- */
-static void
-xfs_sbblock_init(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	struct aghdr_init_data	*id)
-{
-	struct xfs_dsb		*dsb = XFS_BUF_TO_SBP(bp);
-
-	xfs_sb_to_disk(dsb, &mp->m_sb);
-	dsb->sb_inprogress = 1;
-}
-
-static void
-xfs_agfblock_init(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	struct aghdr_init_data	*id)
-{
-	struct xfs_agf		*agf = XFS_BUF_TO_AGF(bp);
-	xfs_extlen_t		tmpsize;
-
-	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
-	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
-	agf->agf_seqno = cpu_to_be32(id->agno);
-	agf->agf_length = cpu_to_be32(id->agsize);
-	agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
-	agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
-	agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
-	agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
-		agf->agf_roots[XFS_BTNUM_RMAPi] =
-					cpu_to_be32(XFS_RMAP_BLOCK(mp));
-		agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
-		agf->agf_rmap_blocks = cpu_to_be32(1);
-	}
-
-	agf->agf_flfirst = cpu_to_be32(1);
-	agf->agf_fllast = 0;
-	agf->agf_flcount = 0;
-	tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
-	agf->agf_freeblks = cpu_to_be32(tmpsize);
-	agf->agf_longest = cpu_to_be32(tmpsize);
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
-	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
-		agf->agf_refcount_root = cpu_to_be32(
-				xfs_refc_block(mp));
-		agf->agf_refcount_level = cpu_to_be32(1);
-		agf->agf_refcount_blocks = cpu_to_be32(1);
-	}
-}
-
-static void
-xfs_agflblock_init(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	struct aghdr_init_data	*id)
-{
-	struct xfs_agfl		*agfl = XFS_BUF_TO_AGFL(bp);
-	__be32			*agfl_bno;
-	int			bucket;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
-		agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
-		agfl->agfl_seqno = cpu_to_be32(id->agno);
-		uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
-	}
-
-	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
-	for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
-		agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
-}
-
-static void
-xfs_agiblock_init(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	struct aghdr_init_data	*id)
-{
-	struct xfs_agi		*agi = XFS_BUF_TO_AGI(bp);
-	int			bucket;
-
-	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
-	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
-	agi->agi_seqno = cpu_to_be32(id->agno);
-	agi->agi_length = cpu_to_be32(id->agsize);
-	agi->agi_count = 0;
-	agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
-	agi->agi_level = cpu_to_be32(1);
-	agi->agi_freecount = 0;
-	agi->agi_newino = cpu_to_be32(NULLAGINO);
-	agi->agi_dirino = cpu_to_be32(NULLAGINO);
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
-	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
-		agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
-		agi->agi_free_level = cpu_to_be32(1);
-	}
-	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
-		agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
-}
-
-typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
-				  struct aghdr_init_data *id);
-static int
-xfs_growfs_init_aghdr(
-	struct xfs_mount	*mp,
-	struct aghdr_init_data	*id,
-	aghdr_init_work_f	work,
-	const struct xfs_buf_ops *ops)
-
-{
-	struct xfs_buf		*bp;
-
-	bp = xfs_growfs_get_hdr_buf(mp, id->daddr, id->numblks, 0, ops);
-	if (!bp)
-		return -ENOMEM;
-
-	(*work)(mp, bp, id);
-
-	xfs_buf_delwri_queue(bp, &id->buffer_list);
-	xfs_buf_relse(bp);
-	return 0;
-}
-
-struct xfs_aghdr_grow_data {
-	xfs_daddr_t		daddr;
-	size_t			numblks;
-	const struct xfs_buf_ops *ops;
-	aghdr_init_work_f	work;
-	xfs_btnum_t		type;
-	bool			need_init;
-};
-
-/*
- * Write new AG headers to disk. Non-transactional, but written
- * synchronously so they are completed prior to the growfs transaction
- * being logged.
- */
-static int
-xfs_grow_ag_headers(
-	struct xfs_mount	*mp,
-	struct aghdr_init_data	*id)
-
-{
-	struct xfs_aghdr_grow_data aghdr_data[] = {
-	{ /* SB */
-		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR),
-		.numblks = XFS_FSS_TO_BB(mp, 1),
-		.ops = &xfs_sb_buf_ops,
-		.work = &xfs_sbblock_init,
-		.need_init = true
-	},
-	{ /* AGF */
-		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)),
-		.numblks = XFS_FSS_TO_BB(mp, 1),
-		.ops = &xfs_agf_buf_ops,
-		.work = &xfs_agfblock_init,
-		.need_init = true
-	},
-	{ /* AGFL */
-		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)),
-		.numblks = XFS_FSS_TO_BB(mp, 1),
-		.ops = &xfs_agfl_buf_ops,
-		.work = &xfs_agflblock_init,
-		.need_init = true
-	},
-	{ /* AGI */
-		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)),
-		.numblks = XFS_FSS_TO_BB(mp, 1),
-		.ops = &xfs_agi_buf_ops,
-		.work = &xfs_agiblock_init,
-		.need_init = true
-	},
-	{ /* BNO root block */
-		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
-		.numblks = BTOBB(mp->m_sb.sb_blocksize),
-		.ops = &xfs_allocbt_buf_ops,
-		.work = &xfs_bnoroot_init,
-		.need_init = true
-	},
-	{ /* CNT root block */
-		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
-		.numblks = BTOBB(mp->m_sb.sb_blocksize),
-		.ops = &xfs_allocbt_buf_ops,
-		.work = &xfs_cntroot_init,
-		.need_init = true
-	},
-	{ /* INO root block */
-		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)),
-		.numblks = BTOBB(mp->m_sb.sb_blocksize),
-		.ops = &xfs_inobt_buf_ops,
-		.work = &xfs_btroot_init,
-		.type = XFS_BTNUM_INO,
-		.need_init = true
-	},
-	{ /* FINO root block */
-		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
-		.numblks = BTOBB(mp->m_sb.sb_blocksize),
-		.ops = &xfs_inobt_buf_ops,
-		.work = &xfs_btroot_init,
-		.type = XFS_BTNUM_FINO,
-		.need_init =  xfs_sb_version_hasfinobt(&mp->m_sb)
-	},
-	{ /* RMAP root block */
-		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
-		.numblks = BTOBB(mp->m_sb.sb_blocksize),
-		.ops = &xfs_rmapbt_buf_ops,
-		.work = &xfs_rmaproot_init,
-		.need_init = xfs_sb_version_hasrmapbt(&mp->m_sb)
-	},
-	{ /* REFC root block */
-		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
-		.numblks = BTOBB(mp->m_sb.sb_blocksize),
-		.ops = &xfs_refcountbt_buf_ops,
-		.work = &xfs_btroot_init,
-		.type = XFS_BTNUM_REFC,
-		.need_init = xfs_sb_version_hasreflink(&mp->m_sb)
-	},
-	{ /* NULL terminating block */
-		.daddr = XFS_BUF_DADDR_NULL,
-	}
-	};
-	struct  xfs_aghdr_grow_data *dp;
-	int			error = 0;
-
-	/* Account for AG free space in new AG */
-	id->nfree += id->agsize - mp->m_ag_prealloc_blocks;
-	for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) {
-		if (!dp->need_init)
-			continue;
-
-		id->daddr = dp->daddr;
-		id->numblks = dp->numblks;
-		id->type = dp->type;
-		error = xfs_growfs_init_aghdr(mp, id, dp->work, dp->ops);
-		if (error)
-			break;
-	}
-	return error;
-}
-
 static int
 xfs_growfs_data_private(
 	xfs_mount_t		*mp,		/* mount point for filesystem */
@@ -518,7 +120,7 @@ xfs_growfs_data_private(
 		else
 			id.agsize = mp->m_sb.sb_agblocks;
 
-		error = xfs_grow_ag_headers(mp, &id);
+		error = xfs_ag_init_headers(mp, &id);
 		if (error) {
 			xfs_buf_delwri_cancel(&id.buffer_list);
 			goto out_trans_cancel;
@@ -682,79 +284,6 @@ xfs_growfs_imaxpct(
 	return xfs_trans_commit(tp);
 }
 
-/*
- * After a grow operation, we need to update all the secondary superblocks
- * to match the new state of the primary. Because we are completely overwriting
- * all the existing fields in the secondary superblock buffers, there is no need
- * to read them in from disk. Just get a new buffer, stamp it and write it.
- *
- * The sb buffers need to be cached here so that we serialise against scrub
- * scanning secondary superblocks, but we don't want to keep it in memory once
- * it is written so we mark it as a one-shot buffer.
- */
-static int
-xfs_growfs_update_superblocks(
-	struct xfs_mount	*mp)
-{
-	xfs_agnumber_t		agno;
-	int			saved_error = 0;
-	int			error = 0;
-	LIST_HEAD		(buffer_list);
-
-	/* update secondary superblocks. */
-	for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
-		struct xfs_buf		*bp;
-
-		bp = xfs_buf_get(mp->m_ddev_targp,
-				 XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
-				 XFS_FSS_TO_BB(mp, 1), 0);
-		/*
-		 * If we get an error reading or writing alternate superblocks,
-		 * continue.  xfs_repair chooses the "best" superblock based
-		 * on most matches; if we break early, we'll leave more
-		 * superblocks un-updated than updated, and xfs_repair may
-		 * pick them over the properly-updated primary.
-		 */
-		if (!bp) {
-			xfs_warn(mp,
-		"error allocating secondary superblock for ag %d",
-				agno);
-			if (!saved_error)
-				saved_error = -ENOMEM;
-			continue;
-		}
-
-		bp->b_ops = &xfs_sb_buf_ops;
-		xfs_buf_oneshot(bp);
-		xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
-		xfs_buf_delwri_queue(bp, &buffer_list);
-		xfs_buf_relse(bp);
-
-		/* don't hold too many buffers at once */
-		if (agno % 16)
-			continue;
-
-		error = xfs_buf_delwri_submit(&buffer_list);
-		if (error) {
-			xfs_warn(mp,
-		"write error %d updating a secondary superblock near ag %d",
-				error, agno);
-			if (!saved_error)
-				saved_error = error;
-			continue;
-		}
-	}
-	error = xfs_buf_delwri_submit(&buffer_list);
-	if (error) {
-		xfs_warn(mp,
-		"write error %d updating a secondary superblock near ag %d",
-			error, agno);
-	}
-
-	return saved_error ? saved_error : error;
-}
-
 /*
  * protected versions of growfs function acquire and release locks on the mount
  * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
@@ -794,7 +323,7 @@ xfs_growfs_data(
 		mp->m_maxicount = 0;
 
 	/* Update secondary superblocks now the physical grow has completed */
-	error = xfs_growfs_update_superblocks(mp);
+	error = xfs_update_secondary_sbs(mp);
 
 out_error:
 	/*

From 49dd56f26ea7d18d4ca71f43a082dbb92798ebd3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Sun, 13 May 2018 23:10:08 -0700
Subject: [PATCH 085/121] xfs: factor the ag length extension code into libxfs

Growfs currently manually codes the extension of the last AG in a
filesytem during the growfs process. Factor that out of the growfs
code and move it into libxfs along with teh rest of the AG header
modification code.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_ag.c | 60 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_ag.h |  7 ++++-
 fs/xfs/xfs_fsops.c     | 58 ++--------------------------------------
 3 files changed, 68 insertions(+), 57 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 5baa22c07095..9345802c99f7 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -16,6 +16,7 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_alloc.h"
+#include "xfs_ialloc.h"
 #include "xfs_rmap.h"
 #include "xfs_ag.h"
 
@@ -402,3 +403,62 @@ xfs_ag_init_headers(
 	}
 	return error;
 }
+
+/*
+ * Extent the AG indicated by the @id by the length passed in
+ */
+int
+xfs_ag_extend_space(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct aghdr_init_data	*id,
+	xfs_extlen_t		len)
+{
+	struct xfs_owner_info	oinfo;
+	struct xfs_buf		*bp;
+	struct xfs_agi		*agi;
+	struct xfs_agf		*agf;
+	int			error;
+
+	/*
+	 * Change the agi length.
+	 */
+	error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);
+	if (error)
+		return error;
+
+	agi = XFS_BUF_TO_AGI(bp);
+	be32_add_cpu(&agi->agi_length, len);
+	ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
+	       be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
+	xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+
+	/*
+	 * Change agf length.
+	 */
+	error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp);
+	if (error)
+		return error;
+
+	agf = XFS_BUF_TO_AGF(bp);
+	be32_add_cpu(&agf->agf_length, len);
+	ASSERT(agf->agf_length == agi->agi_length);
+	xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+
+	/*
+	 * Free the new space.
+	 *
+	 * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
+	 * this doesn't actually exist in the rmap btree.
+	 */
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
+	error = xfs_rmap_free(tp, bp, id->agno,
+				be32_to_cpu(agf->agf_length) - len,
+				len, &oinfo);
+	if (error)
+		return error;
+
+	return  xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
+					be32_to_cpu(agf->agf_length) - len),
+				len, &oinfo, XFS_AG_RESV_NONE);
+}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 69f2fd4be0ea..412702e23f61 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -7,6 +7,9 @@
 #ifndef __LIBXFS_AG_H
 #define __LIBXFS_AG_H 1
 
+struct xfs_mount;
+struct xfs_trans;
+
 struct aghdr_init_data {
 	/* per ag data */
 	xfs_agblock_t		agno;		/* ag to init */
@@ -20,6 +23,8 @@ struct aghdr_init_data {
 	xfs_btnum_t		type;		/* type of btree root block */
 };
 
-int xfs_ag_init_headers( struct xfs_mount *mp, struct aghdr_init_data *id);
+int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
+int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
+			struct aghdr_init_data *id, xfs_extlen_t len);
 
 #endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c5087b79bcea..bc7ef18da243 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -27,16 +27,12 @@
 #include "xfs_trans.h"
 #include "xfs_error.h"
 #include "xfs_btree.h"
-#include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
-#include "xfs_rmap_btree.h"
-#include "xfs_ialloc.h"
 #include "xfs_fsops.h"
 #include "xfs_trans_space.h"
 #include "xfs_rtalloc.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
-#include "xfs_rmap.h"
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
 
@@ -48,8 +44,6 @@ xfs_growfs_data_private(
 	xfs_mount_t		*mp,		/* mount point for filesystem */
 	xfs_growfs_data_t	*in)		/* growfs data input struct */
 {
-	xfs_agf_t		*agf;
-	xfs_agi_t		*agi;
 	xfs_buf_t		*bp;
 	int			error;
 	xfs_agnumber_t		nagcount;
@@ -132,57 +126,9 @@ xfs_growfs_data_private(
 
 	xfs_trans_agblocks_delta(tp, id.nfree);
 
-	/*
-	 * There are new blocks in the old last a.g.
-	 */
+	/* If there are new blocks in the old last AG, extend it. */
 	if (new) {
-		struct xfs_owner_info	oinfo;
-
-		/*
-		 * Change the agi length.
-		 */
-		error = xfs_ialloc_read_agi(mp, tp, id.agno, &bp);
-		if (error)
-			goto out_trans_cancel;
-
-		ASSERT(bp);
-		agi = XFS_BUF_TO_AGI(bp);
-		be32_add_cpu(&agi->agi_length, new);
-		ASSERT(nagcount == oagcount ||
-		       be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
-		xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
-
-		/*
-		 * Change agf length.
-		 */
-		error = xfs_alloc_read_agf(mp, tp, id.agno, 0, &bp);
-		if (error)
-			goto out_trans_cancel;
-
-		ASSERT(bp);
-		agf = XFS_BUF_TO_AGF(bp);
-		be32_add_cpu(&agf->agf_length, new);
-		ASSERT(be32_to_cpu(agf->agf_length) ==
-		       be32_to_cpu(agi->agi_length));
-
-		xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
-
-		/*
-		 * Free the new space.
-		 *
-		 * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
-		 * this doesn't actually exist in the rmap btree.
-		 */
-		xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
-		error = xfs_rmap_free(tp, bp, id.agno,
-				be32_to_cpu(agf->agf_length) - new,
-				new, &oinfo);
-		if (error)
-			goto out_trans_cancel;
-		error = xfs_free_extent(tp,
-				XFS_AGB_TO_FSB(mp, id.agno,
-					be32_to_cpu(agf->agf_length) - new),
-				new, &oinfo, XFS_AG_RESV_NONE);
+		error = xfs_ag_extend_space(mp, tp, &id, new);
 		if (error)
 			goto out_trans_cancel;
 	}

From 62750d040bd137fd6f541e216502d9158e07d348 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 15 May 2018 13:20:03 -0700
Subject: [PATCH 086/121] fs: copy BTRFS_IOC_[SG]ET_FSLABEL to vfs

This retains 256 chars as the maximum size through the interface, which
is the btrfs limit and AFAIK exceeds any other filesystem's maximum
label size.

This just copies the ioctl for now and leaves it in place for btrfs
for the time being.  A later patch will allow btrfs to use the new
common ioctl definition, but it may be sent after this is merged.

(Note, Reviewed-by's were originally given for the combined vfs+btrfs
patch, some license taken here.)

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Reviewed-by: David Sterba <dsterba@suse.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 Documentation/ioctl/ioctl-number.txt | 3 ++-
 include/uapi/linux/fs.h              | 8 ++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 7f7413e597f3..27c1b7b78504 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -296,7 +296,8 @@ Code  Seq#(hex)	Include File		Comments
 0x90	00	drivers/cdrom/sbpcd.h
 0x92	00-0F	drivers/usb/mon/mon_bin.c
 0x93	60-7F	linux/auto_fs.h
-0x94	all	fs/btrfs/ioctl.h
+0x94	all	fs/btrfs/ioctl.h	Btrfs filesystem
+		and linux/fs.h		some lifted to vfs/generic
 0x97	00-7F	fs/ceph/ioctl.h		Ceph file system
 0x99	00-0F				537-Addinboard driver
 					<mailto:buk@buks.ipn.de>
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index d2a8313fabd7..9d132f8f2df8 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -242,6 +242,8 @@ struct fsxattr {
 #define FICLONERANGE	_IOW(0x94, 13, struct file_clone_range)
 #define FIDEDUPERANGE	_IOWR(0x94, 54, struct file_dedupe_range)
 
+#define FSLABEL_MAX 256	/* Max chars for the interface; each fs may differ */
+
 #define	FS_IOC_GETFLAGS			_IOR('f', 1, long)
 #define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
 #define	FS_IOC_GETVERSION		_IOR('v', 1, long)
@@ -251,8 +253,10 @@ struct fsxattr {
 #define FS_IOC32_SETFLAGS		_IOW('f', 2, int)
 #define FS_IOC32_GETVERSION		_IOR('v', 1, int)
 #define FS_IOC32_SETVERSION		_IOW('v', 2, int)
-#define FS_IOC_FSGETXATTR		_IOR ('X', 31, struct fsxattr)
-#define FS_IOC_FSSETXATTR		_IOW ('X', 32, struct fsxattr)
+#define FS_IOC_FSGETXATTR		_IOR('X', 31, struct fsxattr)
+#define FS_IOC_FSSETXATTR		_IOW('X', 32, struct fsxattr)
+#define FS_IOC_GETFSLABEL		_IOR(0x94, 49, char[FSLABEL_MAX])
+#define FS_IOC_SETFSLABEL		_IOW(0x94, 50, char[FSLABEL_MAX])
 
 /*
  * File system encryption support

From f7664b31975bd893190708e76b2c424328f0c49b Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 15 May 2018 13:21:48 -0700
Subject: [PATCH 087/121] xfs: implement online get/set fs label

The GET ioctl is trivial, just return the current label.

The SET ioctl is more involved:
It transactionally modifies the superblock to write a new filesystem
label to the primary super.

A new variant of xfs_sync_sb then writes the superblock buffer
immediately to disk so that the change is visible from userspace.

It then invalidates any page cache that userspace might have previously
read on the block device so that i.e. blkid can see the change
immediately, and updates all secondary superblocks as userspace relable
does.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
[darrick: use dchinner's new xfs_update_secondary_sbs function]
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_format.h |  7 +++-
 fs/xfs/libxfs/xfs_sb.c     | 30 +++++++++++++
 fs/xfs/libxfs/xfs_sb.h     |  1 +
 fs/xfs/xfs_ioctl.c         | 86 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 42956d8d95ed..c1cb29a5f4f6 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -98,6 +98,9 @@ struct xfs_ifork;
 	 XFS_SB_VERSION2_PROJID32BIT	| \
 	 XFS_SB_VERSION2_FTYPE)
 
+/* Maximum size of the xfs filesystem label, no terminating NULL */
+#define XFSLABEL_MAX			12
+
 /*
  * Superblock - in core version.  Must match the ondisk version below.
  * Must be padded to 64 bit alignment.
@@ -122,7 +125,7 @@ typedef struct xfs_sb {
 	uint16_t	sb_sectsize;	/* volume sector size, bytes */
 	uint16_t	sb_inodesize;	/* inode size, bytes */
 	uint16_t	sb_inopblock;	/* inodes per block */
-	char		sb_fname[12];	/* file system name */
+	char		sb_fname[XFSLABEL_MAX]; /* file system name */
 	uint8_t		sb_blocklog;	/* log2 of sb_blocksize */
 	uint8_t		sb_sectlog;	/* log2 of sb_sectsize */
 	uint8_t		sb_inodelog;	/* log2 of sb_inodesize */
@@ -213,7 +216,7 @@ typedef struct xfs_dsb {
 	__be16		sb_sectsize;	/* volume sector size, bytes */
 	__be16		sb_inodesize;	/* inode size, bytes */
 	__be16		sb_inopblock;	/* inodes per block */
-	char		sb_fname[12];	/* file system name */
+	char		sb_fname[XFSLABEL_MAX]; /* file system name */
 	__u8		sb_blocklog;	/* log2 of sb_blocksize */
 	__u8		sb_sectlog;	/* log2 of sb_sectsize */
 	__u8		sb_inodelog;	/* log2 of sb_inodesize */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a35344ae4951..ea6c85a4c27c 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -961,6 +961,36 @@ xfs_update_secondary_sbs(
 	return saved_error ? saved_error : error;
 }
 
+/*
+ * Same behavior as xfs_sync_sb, except that it is always synchronous and it
+ * also writes the superblock buffer to disk sector 0 immediately.
+ */
+int
+xfs_sync_sb_buf(
+	struct xfs_mount	*mp)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
+	if (error)
+		return error;
+
+	xfs_log_sb(tp);
+	xfs_trans_bhold(tp, mp->m_sb_bp);
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out;
+	/*
+	 * write out the sb buffer to get the changes to disk
+	 */
+	error = xfs_bwrite(mp->m_sb_bp);
+out:
+	xfs_buf_relse(mp->m_sb_bp);
+	return error;
+}
+
 int
 xfs_fs_geometry(
 	struct xfs_sb		*sbp,
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index fc65d4e1433a..03e0cc6bf3a6 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -36,6 +36,7 @@ extern int	xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
 
 extern void	xfs_log_sb(struct xfs_trans *tp);
 extern int	xfs_sync_sb(struct xfs_mount *mp, bool wait);
+extern int	xfs_sync_sb_buf(struct xfs_mount *mp);
 extern void	xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
 extern void	xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
 extern void	xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 89fb1eb80aae..f9c97ea52961 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1811,6 +1811,88 @@ xfs_ioc_swapext(
 	return error;
 }
 
+static int
+xfs_ioc_getlabel(
+	struct xfs_mount	*mp,
+	char			__user *user_label)
+{
+	struct xfs_sb		*sbp = &mp->m_sb;
+	char			label[XFSLABEL_MAX + 1];
+
+	/* Paranoia */
+	BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX);
+
+	spin_lock(&mp->m_sb_lock);
+	strncpy(label, sbp->sb_fname, sizeof(sbp->sb_fname));
+	spin_unlock(&mp->m_sb_lock);
+
+	/* xfs on-disk label is 12 chars, be sure we send a null to user */
+	label[XFSLABEL_MAX] = '\0';
+	if (copy_to_user(user_label, label, sizeof(sbp->sb_fname)))
+		return -EFAULT;
+	return 0;
+}
+
+static int
+xfs_ioc_setlabel(
+	struct file		*filp,
+	struct xfs_mount	*mp,
+	char			__user *newlabel)
+{
+	struct xfs_sb		*sbp = &mp->m_sb;
+	char			label[XFSLABEL_MAX + 1];
+	size_t			len;
+	int			error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/*
+	 * The generic ioctl allows up to FSLABEL_MAX chars, but XFS is much
+	 * smaller, at 12 bytes.  We copy one more to be sure we find the
+	 * (required) NULL character to test the incoming label length.
+	 * NB: The on disk label doesn't need to be null terminated.
+	 */
+	if (copy_from_user(label, newlabel, XFSLABEL_MAX + 1))
+		return -EFAULT;
+	len = strnlen(label, XFSLABEL_MAX + 1);
+	if (len > sizeof(sbp->sb_fname))
+		return -EINVAL;
+
+	error = mnt_want_write_file(filp);
+	if (error)
+		return error;
+
+	spin_lock(&mp->m_sb_lock);
+	memset(sbp->sb_fname, 0, sizeof(sbp->sb_fname));
+	strncpy(sbp->sb_fname, label, sizeof(sbp->sb_fname));
+	spin_unlock(&mp->m_sb_lock);
+
+	/*
+	 * Now we do several things to satisfy userspace.
+	 * In addition to normal logging of the primary superblock, we also
+	 * immediately write these changes to sector zero for the primary, then
+	 * update all backup supers (as xfs_db does for a label change), then
+	 * invalidate the block device page cache.  This is so that any prior
+	 * buffered reads from userspace (i.e. from blkid) are invalidated,
+	 * and userspace will see the newly-written label.
+	 */
+	error = xfs_sync_sb_buf(mp);
+	if (error)
+		goto out;
+	/*
+	 * growfs also updates backup supers so lock against that.
+	 */
+	mutex_lock(&mp->m_growlock);
+	error = xfs_update_secondary_sbs(mp);
+	mutex_unlock(&mp->m_growlock);
+
+	invalidate_bdev(mp->m_ddev_targp->bt_bdev);
+
+out:
+	mnt_drop_write_file(filp);
+	return error;
+}
+
 /*
  * Note: some of the ioctl's return positive numbers as a
  * byte count indicating success, such as readlink_by_handle.
@@ -1834,6 +1916,10 @@ xfs_file_ioctl(
 	switch (cmd) {
 	case FITRIM:
 		return xfs_ioc_trim(mp, arg);
+	case FS_IOC_GETFSLABEL:
+		return xfs_ioc_getlabel(mp, arg);
+	case FS_IOC_SETFSLABEL:
+		return xfs_ioc_setlabel(filp, mp, arg);
 	case XFS_IOC_ALLOCSP:
 	case XFS_IOC_FREESP:
 	case XFS_IOC_RESVSP:

From ec601924df0f8fc4f66ac04eaa6bccde87b2f569 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 16 May 2018 11:13:34 -0700
Subject: [PATCH 088/121] iomap: provide more useful errors for invalid swap
 files

Currently, for an invalid swap file, we print the same error message
regardless of the reason. This isn't very useful for an admin, who will
likely want to know why exactly they can't use their swap file. So,
let's add specific error messages for each reason, and also move the
bdev check after the flags checks, since the latter are more
fundamental.

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index d193390a1c20..89517442e296 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1214,26 +1214,37 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
 	struct iomap_swapfile_info *isi = data;
 	int error;
 
+	/* No inline data. */
+	if (iomap->flags & IOMAP_F_DATA_INLINE) {
+		pr_err("swapon: file is inline\n");
+		return -EINVAL;
+	}
+
 	/* Skip holes. */
 	if (iomap->type == IOMAP_HOLE)
 		goto out;
 
-	/* Only one bdev per swap file. */
-	if (iomap->bdev != isi->sis->bdev)
-		goto err;
-
 	/* Only real or unwritten extents. */
-	if (iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_UNWRITTEN)
-		goto err;
+	if (iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_UNWRITTEN) {
+		pr_err("swapon: file has unallocated extents\n");
+		return -EINVAL;
+	}
 
-	/* No uncommitted metadata or shared blocks or inline data. */
-	if (iomap->flags & (IOMAP_F_DIRTY | IOMAP_F_SHARED |
-			    IOMAP_F_DATA_INLINE))
-		goto err;
+	/* No uncommitted metadata or shared blocks. */
+	if (iomap->flags & IOMAP_F_DIRTY) {
+		pr_err("swapon: file is not committed\n");
+		return -EINVAL;
+	}
+	if (iomap->flags & IOMAP_F_SHARED) {
+		pr_err("swapon: file has shared extents\n");
+		return -EINVAL;
+	}
 
-	/* No null physical addresses. */
-	if (iomap->addr == IOMAP_NULL_ADDR)
-		goto err;
+	/* Only one bdev per swap file. */
+	if (iomap->bdev != isi->sis->bdev) {
+		pr_err("swapon: file is on multiple devices\n");
+		return -EINVAL;
+	}
 
 	if (isi->iomap.length == 0) {
 		/* No accumulated extent, so just store it. */
@@ -1250,9 +1261,6 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
 	}
 out:
 	return count;
-err:
-	pr_err("swapon: file cannot be used for swap\n");
-	return -EINVAL;
 }
 
 /*

From 19e129618d47987618c71c4f129ba8dfa2669569 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 16 May 2018 11:13:34 -0700
Subject: [PATCH 089/121] iomap: don't allow holes in swapfiles

generic_swapfile_activate() doesn't allow holes, so we should be
consistent here. This is also a bit safer: if the user creates a
swapfile with, say, truncate -s $SIZE followed by mkswap, they should
really get an error and not much less swap space than they expected.
swapon(8) will error out before calling swapon(2) if the file has holes,
anyways.

Fixes: 9d93388b0afe ("iomap: add a swapfile activation function")
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 89517442e296..f2456d0d8ddd 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1220,10 +1220,6 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
 		return -EINVAL;
 	}
 
-	/* Skip holes. */
-	if (iomap->type == IOMAP_HOLE)
-		goto out;
-
 	/* Only real or unwritten extents. */
 	if (iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_UNWRITTEN) {
 		pr_err("swapon: file has unallocated extents\n");
@@ -1259,7 +1255,6 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
 			return error;
 		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
 	}
-out:
 	return count;
 }
 

From aee9a4a555b3f1440cc3943d7b9ab5fa072ec62e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 22 May 2018 11:48:08 -0700
Subject: [PATCH 090/121] fs: clear writeback errors in inode_init_always

In inode_init_always(), we clear the inode mapping flags, which clears
any retained error (AS_EIO, AS_ENOSPC) bits.  Unfortunately, we do not
also clear wb_err, which means that old mapping errors can leak through
to new inodes.

This is crucial for the XFS inode allocation path because we recycle old
in-core inodes and we do not want error state from an old file to leak
into the new file.  This bug was discovered by running generic/036 and
generic/047 in a loop and noticing that the EIOs generated by the
collision of direct and buffered writes in generic/036 would survive the
remount between 036 and 047, and get reported to the fsyncs (on
different files!) in generic/047.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/inode.c b/fs/inode.c
index 13ceb98c3bd3..3b55391072f3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -178,6 +178,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	mapping->a_ops = &empty_aops;
 	mapping->host = inode;
 	mapping->flags = 0;
+	mapping->wb_err = 0;
 	atomic_set(&mapping->i_mmap_writable, 0);
 	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
 	mapping->private_data = NULL;

From 2e050e648ad6c74a2f0a28f645155128be0626ca Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 24 May 2018 08:54:59 -0700
Subject: [PATCH 091/121] xfs: fix inobt magic number check

In commit a6a781a58befcbd467c ("xfs: have buffer verifier functions
report failing address") the bad magic number return was ported
incorrectly.

Fixes: a6a781a58befcbd467ce843af4eaca3906aa1f08
Reported-by: syzbot+08ab33be0178b76851c8@syzkaller.appspotmail.com
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index ba053a4c124f..b04c55512159 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -296,7 +296,7 @@ xfs_inobt_verify(
 	case cpu_to_be32(XFS_FIBT_MAGIC):
 		break;
 	default:
-		return NULL;
+		return __this_address;
 	}
 
 	/* level verification */

From 05edd888d1b21b7c6af10fa3e0cdc1e5b4578493 Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux@gmail.com>
Date: Tue, 29 May 2018 10:39:03 -0700
Subject: [PATCH 092/121] fs: xfs: Change return type to vm_fault_t

Use new return type vm_fault_t for fault handlers.

Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_file.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 28be51908254..0e3fb8978344 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1020,7 +1020,7 @@ xfs_file_llseek(
  *       page_lock (MM)
  *         i_lock (XFS - extent map serialisation)
  */
-static int
+static vm_fault_t
 __xfs_filemap_fault(
 	struct vm_fault		*vmf,
 	enum page_entry_size	pe_size,
@@ -1028,7 +1028,7 @@ __xfs_filemap_fault(
 {
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
-	int			ret;
+	vm_fault_t		ret;
 
 	trace_xfs_filemap_fault(ip, pe_size, write_fault);
 
@@ -1057,7 +1057,7 @@ __xfs_filemap_fault(
 	return ret;
 }
 
-static int
+static vm_fault_t
 xfs_filemap_fault(
 	struct vm_fault		*vmf)
 {
@@ -1067,7 +1067,7 @@ xfs_filemap_fault(
 			(vmf->flags & FAULT_FLAG_WRITE));
 }
 
-static int
+static vm_fault_t
 xfs_filemap_huge_fault(
 	struct vm_fault		*vmf,
 	enum page_entry_size	pe_size)
@@ -1080,7 +1080,7 @@ xfs_filemap_huge_fault(
 			(vmf->flags & FAULT_FLAG_WRITE));
 }
 
-static int
+static vm_fault_t
 xfs_filemap_page_mkwrite(
 	struct vm_fault		*vmf)
 {
@@ -1092,7 +1092,7 @@ xfs_filemap_page_mkwrite(
  * on write faults. In reality, it needs to serialise against truncate and
  * prepare memory for writing so handle is as standard write fault.
  */
-static int
+static vm_fault_t
 xfs_filemap_pfn_mkwrite(
 	struct vm_fault		*vmf)
 {

From 51863d7dd77dd27a35b12b37c7caf8679903b6ae Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 29 May 2018 22:24:44 -0700
Subject: [PATCH 093/121] xfs: grab the per-ag structure whenever relevant

Grab and hold the per-AG data across a scrub run whenever relevant.
This helps us avoid repeated trips through rcu and the radix tree
in the repair code.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/scrub/common.c | 17 +++++++++++++++++
 fs/xfs/scrub/common.h |  1 +
 fs/xfs/scrub/scrub.h  |  1 +
 3 files changed, 19 insertions(+)

diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 518bff2be0c9..d3e5adc96411 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -541,6 +541,10 @@ xfs_scrub_ag_free(
 		xfs_trans_brelse(sc->tp, sa->agi_bp);
 		sa->agi_bp = NULL;
 	}
+	if (sa->pag) {
+		xfs_perag_put(sa->pag);
+		sa->pag = NULL;
+	}
 	sa->agno = NULLAGNUMBER;
 }
 
@@ -568,6 +572,19 @@ xfs_scrub_ag_init(
 	return xfs_scrub_ag_btcur_init(sc, sa);
 }
 
+/*
+ * Grab the per-ag structure if we haven't already gotten it.  Teardown of the
+ * xfs_scrub_ag will release it for us.
+ */
+void
+xfs_scrub_perag_get(
+	struct xfs_mount	*mp,
+	struct xfs_scrub_ag	*sa)
+{
+	if (!sa->pag)
+		sa->pag = xfs_perag_get(mp, sa->agno);
+}
+
 /* Per-scrubber setup functions */
 
 /*
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index a660087b606e..fbb91a7144fd 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -123,6 +123,7 @@ xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip)
 void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa);
 int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
 		      struct xfs_scrub_ag *sa);
+void xfs_scrub_perag_get(struct xfs_mount *mp, struct xfs_scrub_ag *sa);
 int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
 			      struct xfs_buf **agi, struct xfs_buf **agf,
 			      struct xfs_buf **agfl);
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 2f89a84a0e10..636424d5e2ee 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -51,6 +51,7 @@ struct xfs_scrub_meta_ops {
 /* Buffer pointers and btree cursors for an entire AG. */
 struct xfs_scrub_ag {
 	xfs_agnumber_t			agno;
+	struct xfs_perag		*pag;
 
 	/* AG btree roots */
 	struct xfs_buf			*agf_bp;

From 0a9633fa2f9a7ae52e2068fd706e2dee10be94a0 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 29 May 2018 22:18:08 -0700
Subject: [PATCH 094/121] xfs: add helpers to deal with transaction allocation
 and rolling

For repairs, we need to reserve at least as many blocks as we think
we're going to need to rebuild the data structure, and we're going to
need some helpers to roll transactions while maintaining locks on the AG
headers so that other threads cannot wander into the middle of a repair.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
---
 fs/xfs/scrub/bmap.c   |   2 +-
 fs/xfs/scrub/common.c |  21 +++++-
 fs/xfs/scrub/common.h |   2 +-
 fs/xfs/scrub/inode.c  |   4 +-
 fs/xfs/scrub/repair.c | 160 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |  12 ++++
 6 files changed, 194 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 42a115e83739..eeadb33a701c 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -74,7 +74,7 @@ xfs_scrub_setup_inode_bmap(
 	}
 
 	/* Got the inode, lock it and we're ready to go. */
-	error = xfs_scrub_trans_alloc(sc);
+	error = xfs_scrub_trans_alloc(sc, 0);
 	if (error)
 		goto out;
 	sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index d3e5adc96411..41198a5f872c 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -51,6 +51,7 @@
 #include "scrub/common.h"
 #include "scrub/trace.h"
 #include "scrub/btree.h"
+#include "scrub/repair.h"
 
 /* Common code for the metadata scrubbers. */
 
@@ -590,11 +591,22 @@ xfs_scrub_perag_get(
 /*
  * Grab an empty transaction so that we can re-grab locked buffers if
  * one of our btrees turns out to be cyclic.
+ *
+ * If we're going to repair something, we need to ask for the largest possible
+ * log reservation so that we can handle the worst case scenario for metadata
+ * updates while rebuilding a metadata item.  We also need to reserve as many
+ * blocks in the head transaction as we think we're going to need to rebuild
+ * the metadata object.
  */
 int
 xfs_scrub_trans_alloc(
-	struct xfs_scrub_context	*sc)
+	struct xfs_scrub_context	*sc,
+	uint				resblks)
 {
+	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+				resblks, 0, 0, &sc->tp);
+
 	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
 }
 
@@ -604,7 +616,10 @@ xfs_scrub_setup_fs(
 	struct xfs_scrub_context	*sc,
 	struct xfs_inode		*ip)
 {
-	return xfs_scrub_trans_alloc(sc);
+	uint				resblks;
+
+	resblks = xfs_repair_calc_ag_resblks(sc);
+	return xfs_scrub_trans_alloc(sc, resblks);
 }
 
 /* Set us up with AG headers and btree cursors. */
@@ -734,7 +749,7 @@ xfs_scrub_setup_inode_contents(
 	/* Got the inode, lock it and we're ready to go. */
 	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	xfs_ilock(sc->ip, sc->ilock_flags);
-	error = xfs_scrub_trans_alloc(sc);
+	error = xfs_scrub_trans_alloc(sc, resblks);
 	if (error)
 		goto out;
 	sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index fbb91a7144fd..76bb2d1d808c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -38,7 +38,7 @@ xfs_scrub_should_terminate(
 	return false;
 }
 
-int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc);
+int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc, uint resblks);
 bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
 		xfs_agblock_t bno, int *error);
 bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 550c0cf70a92..0c696f7018de 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -67,7 +67,7 @@ xfs_scrub_setup_inode(
 		break;
 	case -EFSCORRUPTED:
 	case -EFSBADCRC:
-		return xfs_scrub_trans_alloc(sc);
+		return xfs_scrub_trans_alloc(sc, 0);
 	default:
 		return error;
 	}
@@ -75,7 +75,7 @@ xfs_scrub_setup_inode(
 	/* Got the inode, lock it and we're ready to go. */
 	sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	xfs_ilock(sc->ip, sc->ilock_flags);
-	error = xfs_scrub_trans_alloc(sc);
+	error = xfs_scrub_trans_alloc(sc, 0);
 	if (error)
 		goto out;
 	sc->ilock_flags |= XFS_ILOCK_EXCL;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index be30825c47c6..d86f8731a78f 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -128,3 +128,163 @@ xfs_repair_probe(
 
 	return 0;
 }
+
+/*
+ * Roll a transaction, keeping the AG headers locked and reinitializing
+ * the btree cursors.
+ */
+int
+xfs_repair_roll_ag_trans(
+	struct xfs_scrub_context	*sc)
+{
+	int				error;
+
+	/* Keep the AG header buffers locked so we can keep going. */
+	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+
+	/* Roll the transaction. */
+	error = xfs_trans_roll(&sc->tp);
+	if (error)
+		goto out_release;
+
+	/* Join AG headers to the new transaction. */
+	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+
+	return 0;
+
+out_release:
+	/*
+	 * Rolling failed, so release the hold on the buffers.  The
+	 * buffers will be released during teardown on our way out
+	 * of the kernel.
+	 */
+	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
+
+	return error;
+}
+
+/*
+ * Does the given AG have enough space to rebuild a btree?  Neither AG
+ * reservation can be critical, and we must have enough space (factoring
+ * in AG reservations) to construct a whole btree.
+ */
+bool
+xfs_repair_ag_has_space(
+	struct xfs_perag		*pag,
+	xfs_extlen_t			nr_blocks,
+	enum xfs_ag_resv_type		type)
+{
+	return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
+		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
+		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
+}
+
+/*
+ * Figure out how many blocks to reserve for an AG repair.  We calculate the
+ * worst case estimate for the number of blocks we'd need to rebuild one of
+ * any type of per-AG btree.
+ */
+xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_scrub_metadata	*sm = sc->sm;
+	struct xfs_perag		*pag;
+	struct xfs_buf			*bp;
+	xfs_agino_t			icount = 0;
+	xfs_extlen_t			aglen = 0;
+	xfs_extlen_t			usedlen;
+	xfs_extlen_t			freelen;
+	xfs_extlen_t			bnobt_sz;
+	xfs_extlen_t			inobt_sz;
+	xfs_extlen_t			rmapbt_sz;
+	xfs_extlen_t			refcbt_sz;
+	int				error;
+
+	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+		return 0;
+
+	/* Use in-core counters if possible. */
+	pag = xfs_perag_get(mp, sm->sm_agno);
+	if (pag->pagi_init)
+		icount = pag->pagi_count;
+
+	/*
+	 * Otherwise try to get the actual counters from disk; if not, make
+	 * some worst case assumptions.
+	 */
+	if (icount == 0) {
+		error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+		if (error) {
+			icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
+		} else {
+			icount = pag->pagi_count;
+			xfs_buf_relse(bp);
+		}
+	}
+
+	/* Now grab the block counters from the AGF. */
+	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+	if (error) {
+		aglen = mp->m_sb.sb_agblocks;
+		freelen = aglen;
+		usedlen = aglen;
+	} else {
+		aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
+		freelen = pag->pagf_freeblks;
+		usedlen = aglen - freelen;
+		xfs_buf_relse(bp);
+	}
+	xfs_perag_put(pag);
+
+	trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
+			freelen, usedlen);
+
+	/*
+	 * Figure out how many blocks we'd need worst case to rebuild
+	 * each type of btree.  Note that we can only rebuild the
+	 * bnobt/cntbt or inobt/finobt as pairs.
+	 */
+	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
+	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+				XFS_INODES_PER_HOLEMASK_BIT);
+	else
+		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+				XFS_INODES_PER_CHUNK);
+	if (xfs_sb_version_hasfinobt(&mp->m_sb))
+		inobt_sz *= 2;
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
+	else
+		refcbt_sz = 0;
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		/*
+		 * Guess how many blocks we need to rebuild the rmapbt.
+		 * For non-reflink filesystems we can't have more records than
+		 * used blocks.  However, with reflink it's possible to have
+		 * more than one rmap record per AG block.  We don't know how
+		 * many rmaps there could be in the AG, so we start off with
+		 * what we hope is an generous over-estimation.
+		 */
+		if (xfs_sb_version_hasreflink(&mp->m_sb))
+			rmapbt_sz = xfs_rmapbt_calc_size(mp,
+					(unsigned long long)aglen * 2);
+		else
+			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
+	} else {
+		rmapbt_sz = 0;
+	}
+
+	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
+			inobt_sz, rmapbt_sz, refcbt_sz);
+
+	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 83170dd3388c..8d181dce6171 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -32,6 +32,10 @@ static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc)
 int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc,
 		bool *fixed);
 void xfs_repair_failure(struct xfs_mount *mp);
+int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
+bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
+		enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
 
 /* Metadata repairers */
 
@@ -49,6 +53,14 @@ static inline int xfs_repair_attempt(
 
 static inline void xfs_repair_failure(struct xfs_mount *mp) {}
 
+static inline xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+	struct xfs_scrub_context	*sc)
+{
+	ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR));
+	return 0;
+}
+
 #define xfs_repair_probe		xfs_repair_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */

From 73d6b42aa4dcdd7cdc07a945f035b0c1ca48f891 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 29 May 2018 22:18:09 -0700
Subject: [PATCH 095/121] xfs: add helpers to allocate and initialize fresh
 btree roots

Add a pair of helper functions to allocate and initialize fresh btree
roots.  The repair functions will use these as part of recreating
corrupted metadata.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
---
 fs/xfs/scrub/repair.c | 80 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |  6 ++++
 2 files changed, 86 insertions(+)

diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index d86f8731a78f..be21a2984001 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -288,3 +288,83 @@ xfs_repair_calc_ag_resblks(
 
 	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
 }
+
+/* Allocate a block in an AG. */
+int
+xfs_repair_alloc_ag_block(
+	struct xfs_scrub_context	*sc,
+	struct xfs_owner_info		*oinfo,
+	xfs_fsblock_t			*fsbno,
+	enum xfs_ag_resv_type		resv)
+{
+	struct xfs_alloc_arg		args = {0};
+	xfs_agblock_t			bno;
+	int				error;
+
+	switch (resv) {
+	case XFS_AG_RESV_AGFL:
+	case XFS_AG_RESV_RMAPBT:
+		error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
+		if (error)
+			return error;
+		if (bno == NULLAGBLOCK)
+			return -ENOSPC;
+		xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
+				1, false);
+		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
+		if (resv == XFS_AG_RESV_RMAPBT)
+			xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
+		return 0;
+	default:
+		break;
+	}
+
+	args.tp = sc->tp;
+	args.mp = sc->mp;
+	args.oinfo = *oinfo;
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
+	args.minlen = 1;
+	args.maxlen = 1;
+	args.prod = 1;
+	args.type = XFS_ALLOCTYPE_THIS_AG;
+	args.resv = resv;
+
+	error = xfs_alloc_vextent(&args);
+	if (error)
+		return error;
+	if (args.fsbno == NULLFSBLOCK)
+		return -ENOSPC;
+	ASSERT(args.len == 1);
+	*fsbno = args.fsbno;
+
+	return 0;
+}
+
+/* Initialize a new AG btree root block with zero entries. */
+int
+xfs_repair_init_btblock(
+	struct xfs_scrub_context	*sc,
+	xfs_fsblock_t			fsb,
+	struct xfs_buf			**bpp,
+	xfs_btnum_t			btnum,
+	const struct xfs_buf_ops	*ops)
+{
+	struct xfs_trans		*tp = sc->tp;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*bp;
+
+	trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
+			XFS_FSB_TO_AGBNO(mp, fsb), btnum);
+
+	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
+			XFS_FSB_TO_BB(mp, 1), 0);
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
+	xfs_trans_log_buf(tp, bp, 0, bp->b_length);
+	bp->b_ops = ops;
+	*bpp = bp;
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 8d181dce6171..40990fa5f381 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -36,6 +36,12 @@ int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
 bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
 		enum xfs_ag_resv_type type);
 xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
+int xfs_repair_alloc_ag_block(struct xfs_scrub_context *sc,
+		struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
+		enum xfs_ag_resv_type resv);
+int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb,
+		struct xfs_buf **bpp, xfs_btnum_t btnum,
+		const struct xfs_buf_ops *ops);
 
 /* Metadata repairers */
 

From 64a39d876e77264dbd06ec27564d9e4eed1c6786 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 29 May 2018 22:18:09 -0700
Subject: [PATCH 096/121] xfs: add helpers to collect and sift btree block
 pointers during repair

Add some helpers to assemble a list of fs block extents.  Generally,
repair functions will iterate the rmapbt to make a list (1) of all
extents owned by the nominal owner of the metadata structure; then they
will iterate all other structures with the same rmap owner to make a
list (2) of active blocks; and finally we have a subtraction function to
subtract all the blocks in (2) from (1), with the result that (1) is now
a list of blocks that were owned by the old btree and must be disposed.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/scrub/repair.c | 217 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |  28 ++++++
 2 files changed, 245 insertions(+)

diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index be21a2984001..fca8e3c7887d 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -368,3 +368,220 @@ xfs_repair_init_btblock(
 
 	return 0;
 }
+
+/*
+ * Reconstructing per-AG Btrees
+ *
+ * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
+ * we scan secondary space metadata to derive the records that should be in
+ * the damaged btree, initialize a fresh btree root, and insert the records.
+ * Note that for rebuilding the rmapbt we scan all the primary data to
+ * generate the new records.
+ *
+ * However, that leaves the matter of removing all the metadata describing the
+ * old broken structure.  For primary metadata we use the rmap data to collect
+ * every extent with a matching rmap owner (exlist); we then iterate all other
+ * metadata structures with the same rmap owner to collect the extents that
+ * cannot be removed (sublist).  We then subtract sublist from exlist to
+ * derive the blocks that were used by the old btree.  These blocks can be
+ * reaped.
+ *
+ * For rmapbt reconstructions we must use different tactics for extent
+ * collection.  First we iterate all primary metadata (this excludes the old
+ * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
+ * records are collected as exlist.  The bnobt records are collected as
+ * sublist.  As with the other btrees we subtract sublist from exlist, and the
+ * result (since the rmapbt lives in the free space) are the blocks from the
+ * old rmapbt.
+ */
+
+/* Collect a dead btree extent for later disposal. */
+int
+xfs_repair_collect_btree_extent(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len)
+{
+	struct xfs_repair_extent	*rex;
+
+	trace_xfs_repair_collect_btree_extent(sc->mp,
+			XFS_FSB_TO_AGNO(sc->mp, fsbno),
+			XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
+
+	rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
+	if (!rex)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&rex->list);
+	rex->fsbno = fsbno;
+	rex->len = len;
+	list_add_tail(&rex->list, &exlist->list);
+
+	return 0;
+}
+
+/*
+ * An error happened during the rebuild so the transaction will be cancelled.
+ * The fs will shut down, and the administrator has to unmount and run repair.
+ * Therefore, free all the memory associated with the list so we can die.
+ */
+void
+xfs_repair_cancel_btree_extents(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist)
+{
+	struct xfs_repair_extent	*rex;
+	struct xfs_repair_extent	*n;
+
+	for_each_xfs_repair_extent_safe(rex, n, exlist) {
+		list_del(&rex->list);
+		kmem_free(rex);
+	}
+}
+
+/* Compare two btree extents. */
+static int
+xfs_repair_btree_extent_cmp(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_repair_extent	*ap;
+	struct xfs_repair_extent	*bp;
+
+	ap = container_of(a, struct xfs_repair_extent, list);
+	bp = container_of(b, struct xfs_repair_extent, list);
+
+	if (ap->fsbno > bp->fsbno)
+		return 1;
+	if (ap->fsbno < bp->fsbno)
+		return -1;
+	return 0;
+}
+
+/*
+ * Remove all the blocks mentioned in @sublist from the extents in @exlist.
+ *
+ * The intent is that callers will iterate the rmapbt for all of its records
+ * for a given owner to generate @exlist; and iterate all the blocks of the
+ * metadata structures that are not being rebuilt and have the same rmapbt
+ * owner to generate @sublist.  This routine subtracts all the extents
+ * mentioned in sublist from all the extents linked in @exlist, which leaves
+ * @exlist as the list of blocks that are not accounted for, which we assume
+ * are the dead blocks of the old metadata structure.  The blocks mentioned in
+ * @exlist can be reaped.
+ */
+#define LEFT_ALIGNED	(1 << 0)
+#define RIGHT_ALIGNED	(1 << 1)
+int
+xfs_repair_subtract_extents(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist,
+	struct xfs_repair_extent_list	*sublist)
+{
+	struct list_head		*lp;
+	struct xfs_repair_extent	*ex;
+	struct xfs_repair_extent	*newex;
+	struct xfs_repair_extent	*subex;
+	xfs_fsblock_t			sub_fsb;
+	xfs_extlen_t			sub_len;
+	int				state;
+	int				error = 0;
+
+	if (list_empty(&exlist->list) || list_empty(&sublist->list))
+		return 0;
+	ASSERT(!list_empty(&sublist->list));
+
+	list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
+	list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
+
+	/*
+	 * Now that we've sorted both lists, we iterate exlist once, rolling
+	 * forward through sublist and/or exlist as necessary until we find an
+	 * overlap or reach the end of either list.  We do not reset lp to the
+	 * head of exlist nor do we reset subex to the head of sublist.  The
+	 * list traversal is similar to merge sort, but we're deleting
+	 * instead.  In this manner we avoid O(n^2) operations.
+	 */
+	subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
+			list);
+	lp = exlist->list.next;
+	while (lp != &exlist->list) {
+		ex = list_entry(lp, struct xfs_repair_extent, list);
+
+		/*
+		 * Advance subex and/or ex until we find a pair that
+		 * intersect or we run out of extents.
+		 */
+		while (subex->fsbno + subex->len <= ex->fsbno) {
+			if (list_is_last(&subex->list, &sublist->list))
+				goto out;
+			subex = list_next_entry(subex, list);
+		}
+		if (subex->fsbno >= ex->fsbno + ex->len) {
+			lp = lp->next;
+			continue;
+		}
+
+		/* trim subex to fit the extent we have */
+		sub_fsb = subex->fsbno;
+		sub_len = subex->len;
+		if (subex->fsbno < ex->fsbno) {
+			sub_len -= ex->fsbno - subex->fsbno;
+			sub_fsb = ex->fsbno;
+		}
+		if (sub_len > ex->len)
+			sub_len = ex->len;
+
+		state = 0;
+		if (sub_fsb == ex->fsbno)
+			state |= LEFT_ALIGNED;
+		if (sub_fsb + sub_len == ex->fsbno + ex->len)
+			state |= RIGHT_ALIGNED;
+		switch (state) {
+		case LEFT_ALIGNED:
+			/* Coincides with only the left. */
+			ex->fsbno += sub_len;
+			ex->len -= sub_len;
+			break;
+		case RIGHT_ALIGNED:
+			/* Coincides with only the right. */
+			ex->len -= sub_len;
+			lp = lp->next;
+			break;
+		case LEFT_ALIGNED | RIGHT_ALIGNED:
+			/* Total overlap, just delete ex. */
+			lp = lp->next;
+			list_del(&ex->list);
+			kmem_free(ex);
+			break;
+		case 0:
+			/*
+			 * Deleting from the middle: add the new right extent
+			 * and then shrink the left extent.
+			 */
+			newex = kmem_alloc(sizeof(struct xfs_repair_extent),
+					KM_MAYFAIL);
+			if (!newex) {
+				error = -ENOMEM;
+				goto out;
+			}
+			INIT_LIST_HEAD(&newex->list);
+			newex->fsbno = sub_fsb + sub_len;
+			newex->len = ex->fsbno + ex->len - newex->fsbno;
+			list_add(&newex->list, &ex->list);
+			ex->len = sub_fsb - ex->fsbno;
+			lp = lp->next;
+			break;
+		default:
+			ASSERT(0);
+			break;
+		}
+	}
+
+out:
+	return error;
+}
+#undef LEFT_ALIGNED
+#undef RIGHT_ALIGNED
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 40990fa5f381..ba1fdd7b9a79 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -43,6 +43,34 @@ int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb,
 		struct xfs_buf **bpp, xfs_btnum_t btnum,
 		const struct xfs_buf_ops *ops);
 
+struct xfs_repair_extent {
+	struct list_head		list;
+	xfs_fsblock_t			fsbno;
+	xfs_extlen_t			len;
+};
+
+struct xfs_repair_extent_list {
+	struct list_head		list;
+};
+
+static inline void
+xfs_repair_init_extent_list(
+	struct xfs_repair_extent_list	*exlist)
+{
+	INIT_LIST_HEAD(&exlist->list);
+}
+
+#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \
+	list_for_each_entry_safe((rbe), (n), &(exlist)->list, list)
+int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno,
+		xfs_extlen_t len);
+void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *btlist);
+int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *exlist,
+		struct xfs_repair_extent_list *sublist);
+
 /* Metadata repairers */
 
 int xfs_repair_probe(struct xfs_scrub_context *sc);

From 12c6510e2ff17cf94cae08ba7b6d2355760dfd1d Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 29 May 2018 22:18:10 -0700
Subject: [PATCH 097/121] xfs: add helpers to dispose of old btree blocks after
 a repair

Now that we've plumbed in the ability to construct a list of dead btree
blocks following a repair, add more helpers to dispose of them.  This is
done by examining the rmapbt -- if the btree was the only owner we can
free the block, otherwise it's crosslinked and we can only remove the
rmapbt record.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/scrub/repair.c | 251 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |   6 +
 2 files changed, 257 insertions(+)

diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index fca8e3c7887d..7daf0120d1bf 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -585,3 +585,254 @@ out:
 }
 #undef LEFT_ALIGNED
 #undef RIGHT_ALIGNED
+
+/*
+ * Disposal of Blocks from Old per-AG Btrees
+ *
+ * Now that we've constructed a new btree to replace the damaged one, we want
+ * to dispose of the blocks that (we think) the old btree was using.
+ * Previously, we used the rmapbt to collect the extents (exlist) with the
+ * rmap owner corresponding to the tree we rebuilt, collected extents for any
+ * blocks with the same rmap owner that are owned by another data structure
+ * (sublist), and subtracted sublist from exlist.  In theory the extents
+ * remaining in exlist are the old btree's blocks.
+ *
+ * Unfortunately, it's possible that the btree was crosslinked with other
+ * blocks on disk.  The rmap data can tell us if there are multiple owners, so
+ * if the rmapbt says there is an owner of this block other than @oinfo, then
+ * the block is crosslinked.  Remove the reverse mapping and continue.
+ *
+ * If there is one rmap record, we can free the block, which removes the
+ * reverse mapping but doesn't add the block to the free space.  Our repair
+ * strategy is to hope the other metadata objects crosslinked on this block
+ * will be rebuilt (atop different blocks), thereby removing all the cross
+ * links.
+ *
+ * If there are no rmap records at all, we also free the block.  If the btree
+ * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
+ * supposed to be a rmap record and everything is ok.  For other btrees there
+ * had to have been an rmap entry for the block to have ended up on @exlist,
+ * so if it's gone now there's something wrong and the fs will shut down.
+ *
+ * Note: If there are multiple rmap records with only the same rmap owner as
+ * the btree we're trying to rebuild and the block is indeed owned by another
+ * data structure with the same rmap owner, then the block will be in sublist
+ * and therefore doesn't need disposal.  If there are multiple rmap records
+ * with only the same rmap owner but the block is not owned by something with
+ * the same rmap owner, the block will be freed.
+ *
+ * The caller is responsible for locking the AG headers for the entire rebuild
+ * operation so that nothing else can sneak in and change the AG state while
+ * we're not looking.  We also assume that the caller already invalidated any
+ * buffers associated with @exlist.
+ */
+
+/*
+ * Invalidate buffers for per-AG btree blocks we're dumping.  This function
+ * is not intended for use with file data repairs; we have bunmapi for that.
+ */
+int
+xfs_repair_invalidate_blocks(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist)
+{
+	struct xfs_repair_extent	*rex;
+	struct xfs_repair_extent	*n;
+	struct xfs_buf			*bp;
+	xfs_fsblock_t			fsbno;
+	xfs_agblock_t			i;
+
+	/*
+	 * For each block in each extent, see if there's an incore buffer for
+	 * exactly that block; if so, invalidate it.  The buffer cache only
+	 * lets us look for one buffer at a time, so we have to look one block
+	 * at a time.  Avoid invalidating AG headers and post-EOFS blocks
+	 * because we never own those; and if we can't TRYLOCK the buffer we
+	 * assume it's owned by someone else.
+	 */
+	for_each_xfs_repair_extent_safe(rex, n, exlist) {
+		for (fsbno = rex->fsbno, i = rex->len; i > 0; fsbno++, i--) {
+			/* Skip AG headers and post-EOFS blocks */
+			if (!xfs_verify_fsbno(sc->mp, fsbno))
+				continue;
+			bp = xfs_buf_incore(sc->mp->m_ddev_targp,
+					XFS_FSB_TO_DADDR(sc->mp, fsbno),
+					XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
+			if (bp) {
+				xfs_trans_bjoin(sc->tp, bp);
+				xfs_trans_binval(sc->tp, bp);
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* Ensure the freelist is the correct size. */
+int
+xfs_repair_fix_freelist(
+	struct xfs_scrub_context	*sc,
+	bool				can_shrink)
+{
+	struct xfs_alloc_arg		args = {0};
+
+	args.mp = sc->mp;
+	args.tp = sc->tp;
+	args.agno = sc->sa.agno;
+	args.alignment = 1;
+	args.pag = sc->sa.pag;
+
+	return xfs_alloc_fix_freelist(&args,
+			can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+}
+
+/*
+ * Put a block back on the AGFL.
+ */
+STATIC int
+xfs_repair_put_freelist(
+	struct xfs_scrub_context	*sc,
+	xfs_agblock_t			agbno)
+{
+	struct xfs_owner_info		oinfo;
+	int				error;
+
+	/* Make sure there's space on the freelist. */
+	error = xfs_repair_fix_freelist(sc, true);
+	if (error)
+		return error;
+
+	/*
+	 * Since we're "freeing" a lost block onto the AGFL, we have to
+	 * create an rmap for the block prior to merging it or else other
+	 * parts will break.
+	 */
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
+			&oinfo);
+	if (error)
+		return error;
+
+	/* Put the block on the AGFL. */
+	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
+			agbno, 0);
+	if (error)
+		return error;
+	xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
+			XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+	return 0;
+}
+
+/* Dispose of a single metadata block. */
+STATIC int
+xfs_repair_dispose_btree_block(
+	struct xfs_scrub_context	*sc,
+	xfs_fsblock_t			fsbno,
+	struct xfs_owner_info		*oinfo,
+	enum xfs_ag_resv_type		resv)
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_buf			*agf_bp = NULL;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	bool				has_other_rmap;
+	int				error;
+
+	agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+	agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+
+	/*
+	 * If we are repairing per-inode metadata, we need to read in the AGF
+	 * buffer.  Otherwise, we're repairing a per-AG structure, so reuse
+	 * the AGF buffer that the setup functions already grabbed.
+	 */
+	if (sc->ip) {
+		error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
+		if (error)
+			return error;
+		if (!agf_bp)
+			return -ENOMEM;
+	} else {
+		agf_bp = sc->sa.agf_bp;
+	}
+	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
+
+	/* Can we find any other rmappings? */
+	error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
+	if (error)
+		goto out_cur;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+	/*
+	 * If there are other rmappings, this block is cross linked and must
+	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
+	 * we were the only owner of the block, so free the extent, which will
+	 * also remove the rmap.
+	 *
+	 * XXX: XFS doesn't support detecting the case where a single block
+	 * metadata structure is crosslinked with a multi-block structure
+	 * because the buffer cache doesn't detect aliasing problems, so we
+	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
+	 * blow on writeout, the filesystem will shut down, and the admin gets
+	 * to run xfs_repair.
+	 */
+	if (has_other_rmap)
+		error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
+	else if (resv == XFS_AG_RESV_AGFL)
+		error = xfs_repair_put_freelist(sc, agbno);
+	else
+		error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+	if (agf_bp != sc->sa.agf_bp)
+		xfs_trans_brelse(sc->tp, agf_bp);
+	if (error)
+		return error;
+
+	if (sc->ip)
+		return xfs_trans_roll_inode(&sc->tp, sc->ip);
+	return xfs_repair_roll_ag_trans(sc);
+
+out_cur:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	if (agf_bp != sc->sa.agf_bp)
+		xfs_trans_brelse(sc->tp, agf_bp);
+	return error;
+}
+
+/* Dispose of btree blocks from an old per-AG btree. */
+int
+xfs_repair_reap_btree_extents(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist,
+	struct xfs_owner_info		*oinfo,
+	enum xfs_ag_resv_type		type)
+{
+	struct xfs_repair_extent	*rex;
+	struct xfs_repair_extent	*n;
+	int				error = 0;
+
+	ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
+
+	/* Dispose of every block from the old btree. */
+	for_each_xfs_repair_extent_safe(rex, n, exlist) {
+		ASSERT(sc->ip != NULL ||
+		       XFS_FSB_TO_AGNO(sc->mp, rex->fsbno) == sc->sa.agno);
+
+		trace_xfs_repair_dispose_btree_extent(sc->mp,
+				XFS_FSB_TO_AGNO(sc->mp, rex->fsbno),
+				XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno), rex->len);
+
+		for (; rex->len > 0; rex->len--, rex->fsbno++) {
+			error = xfs_repair_dispose_btree_block(sc, rex->fsbno,
+					oinfo, type);
+			if (error)
+				goto out;
+		}
+		list_del(&rex->list);
+		kmem_free(rex);
+	}
+
+out:
+	xfs_repair_cancel_btree_extents(sc, exlist);
+	return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index ba1fdd7b9a79..f14aaab7df9e 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -70,6 +70,12 @@ void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
 int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
 		struct xfs_repair_extent_list *exlist,
 		struct xfs_repair_extent_list *sublist);
+int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink);
+int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *btlist);
+int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc,
+		struct xfs_repair_extent_list *exlist,
+		struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
 
 /* Metadata repairers */
 

From 04a2b7b254f9c72bdcb819a99d95df8504c9b639 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 29 May 2018 22:18:10 -0700
Subject: [PATCH 098/121] xfs: recover AG btree roots from rmap data

Add a helper function to help us recover btree roots from the rmap data.
Callers pass in a list of rmap owner codes, buffer ops, and magic
numbers.  We iterate the rmap records looking for owner matches, and
then read the matching blocks to see if the magic number & uuid match.
If so, we then read-verify the block, and if that passes then we retain
a pointer to the block with the highest level, assuming that by the end
of the call we will have found the root.  This will be used to reset the
AGF/AGI btree root fields during their rebuild procedures.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/scrub/repair.c | 190 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |  20 +++++
 2 files changed, 210 insertions(+)

diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 7daf0120d1bf..877488ce4bc8 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -836,3 +836,193 @@ out:
 	xfs_repair_cancel_btree_extents(sc, exlist);
 	return error;
 }
+
+/*
+ * Finding per-AG Btree Roots for AGF/AGI Reconstruction
+ *
+ * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
+ * the AG headers by using the rmap data to rummage through the AG looking for
+ * btree roots.  This is not guaranteed to work if the AG is heavily damaged
+ * or the rmap data are corrupt.
+ *
+ * Callers of xfs_repair_find_ag_btree_roots must lock the AGF and AGFL
+ * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
+ * AGI is being rebuilt.  It must maintain these locks until it's safe for
+ * other threads to change the btrees' shapes.  The caller provides
+ * information about the btrees to look for by passing in an array of
+ * xfs_repair_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
+ * The (root, height) fields will be set on return if anything is found.  The
+ * last element of the array should have a NULL buf_ops to mark the end of the
+ * array.
+ *
+ * For every rmapbt record matching any of the rmap owners in btree_info,
+ * read each block referenced by the rmap record.  If the block is a btree
+ * block from this filesystem matching any of the magic numbers and has a
+ * level higher than what we've already seen, remember the block and the
+ * height of the tree required to have such a block.  When the call completes,
+ * we return the highest block we've found for each btree description; those
+ * should be the roots.
+ */
+
+struct xfs_repair_findroot {
+	struct xfs_scrub_context	*sc;
+	struct xfs_buf			*agfl_bp;
+	struct xfs_agf			*agf;
+	struct xfs_repair_find_ag_btree	*btree_info;
+};
+
+/* See if our block is in the AGFL. */
+STATIC int
+xfs_repair_findroot_agfl_walk(
+	struct xfs_mount		*mp,
+	xfs_agblock_t			bno,
+	void				*priv)
+{
+	xfs_agblock_t			*agbno = priv;
+
+	return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
+}
+
+/* Does this block match the btree information passed in? */
+STATIC int
+xfs_repair_findroot_block(
+	struct xfs_repair_findroot	*ri,
+	struct xfs_repair_find_ag_btree	*fab,
+	uint64_t			owner,
+	xfs_agblock_t			agbno,
+	bool				*found_it)
+{
+	struct xfs_mount		*mp = ri->sc->mp;
+	struct xfs_buf			*bp;
+	struct xfs_btree_block		*btblock;
+	xfs_daddr_t			daddr;
+	int				error;
+
+	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
+
+	/*
+	 * Blocks in the AGFL have stale contents that might just happen to
+	 * have a matching magic and uuid.  We don't want to pull these blocks
+	 * in as part of a tree root, so we have to filter out the AGFL stuff
+	 * here.  If the AGFL looks insane we'll just refuse to repair.
+	 */
+	if (owner == XFS_RMAP_OWN_AG) {
+		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
+				xfs_repair_findroot_agfl_walk, &agbno);
+		if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+			return 0;
+		if (error)
+			return error;
+	}
+
+	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
+			mp->m_bsize, 0, &bp, NULL);
+	if (error)
+		return error;
+
+	/*
+	 * Does this look like a block matching our fs and higher than any
+	 * other block we've found so far?  If so, reattach buffer verifiers
+	 * so the AIL won't complain if the buffer is also dirty.
+	 */
+	btblock = XFS_BUF_TO_BLOCK(bp);
+	if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+		goto out;
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+		goto out;
+	bp->b_ops = fab->buf_ops;
+
+	/* Ignore this block if it's lower in the tree than we've seen. */
+	if (fab->root != NULLAGBLOCK &&
+	    xfs_btree_get_level(btblock) < fab->height)
+		goto out;
+
+	/* Make sure we pass the verifiers. */
+	bp->b_ops->verify_read(bp);
+	if (bp->b_error)
+		goto out;
+	fab->root = agbno;
+	fab->height = xfs_btree_get_level(btblock) + 1;
+	*found_it = true;
+
+	trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
+			be32_to_cpu(btblock->bb_magic), fab->height - 1);
+out:
+	xfs_trans_brelse(ri->sc->tp, bp);
+	return error;
+}
+
+/*
+ * Do any of the blocks in this rmap record match one of the btrees we're
+ * looking for?
+ */
+STATIC int
+xfs_repair_findroot_rmap(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_repair_findroot	*ri = priv;
+	struct xfs_repair_find_ag_btree	*fab;
+	xfs_agblock_t			b;
+	bool				found_it;
+	int				error = 0;
+
+	/* Ignore anything that isn't AG metadata. */
+	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+		return 0;
+
+	/* Otherwise scan each block + btree type. */
+	for (b = 0; b < rec->rm_blockcount; b++) {
+		found_it = false;
+		for (fab = ri->btree_info; fab->buf_ops; fab++) {
+			if (rec->rm_owner != fab->rmap_owner)
+				continue;
+			error = xfs_repair_findroot_block(ri, fab,
+					rec->rm_owner, rec->rm_startblock + b,
+					&found_it);
+			if (error)
+				return error;
+			if (found_it)
+				break;
+		}
+	}
+
+	return 0;
+}
+
+/* Find the roots of the per-AG btrees described in btree_info. */
+int
+xfs_repair_find_ag_btree_roots(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*agf_bp,
+	struct xfs_repair_find_ag_btree	*btree_info,
+	struct xfs_buf			*agfl_bp)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_repair_findroot	ri;
+	struct xfs_repair_find_ag_btree	*fab;
+	struct xfs_btree_cur		*cur;
+	int				error;
+
+	ASSERT(xfs_buf_islocked(agf_bp));
+	ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
+
+	ri.sc = sc;
+	ri.btree_info = btree_info;
+	ri.agf = XFS_BUF_TO_AGF(agf_bp);
+	ri.agfl_bp = agfl_bp;
+	for (fab = btree_info; fab->buf_ops; fab++) {
+		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
+		ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
+		fab->root = NULLAGBLOCK;
+		fab->height = 0;
+	}
+
+	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
+	error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index f14aaab7df9e..c922ef06b894 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -77,6 +77,26 @@ int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc,
 		struct xfs_repair_extent_list *exlist,
 		struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
 
+struct xfs_repair_find_ag_btree {
+	/* in: rmap owner of the btree we're looking for */
+	uint64_t			rmap_owner;
+
+	/* in: buffer ops */
+	const struct xfs_buf_ops	*buf_ops;
+
+	/* in: magic number of the btree */
+	uint32_t			magic;
+
+	/* out: the highest btree block found and the tree height */
+	xfs_agblock_t			root;
+	unsigned int			height;
+};
+
+int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
+		struct xfs_buf *agf_bp,
+		struct xfs_repair_find_ag_btree *btree_info,
+		struct xfs_buf *agfl_bp);
+
 /* Metadata repairers */
 
 int xfs_repair_probe(struct xfs_scrub_context *sc);

From 7e85bc6c873215f19a0b750febfb097d1f3c3292 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 29 May 2018 22:18:11 -0700
Subject: [PATCH 099/121] xfs: add helpers to attach quotas to inodes

Add a helper routine to attach quota information to inodes that are
about to undergo repair.  If that fails, we need to schedule a
quotacheck for the next mount but allow the corrupted metadata repair to
continue.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/scrub/repair.c | 61 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h |  2 ++
 fs/xfs/xfs_quota.h    | 16 ++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 877488ce4bc8..e3e8fba1c99c 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -42,6 +42,7 @@
 #include "xfs_extent_busy.h"
 #include "xfs_ag_resv.h"
 #include "xfs_trans_space.h"
+#include "xfs_quota.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -1026,3 +1027,63 @@ xfs_repair_find_ag_btree_roots(
 
 	return error;
 }
+
+/* Force a quotacheck the next time we mount. */
+void
+xfs_repair_force_quotacheck(
+	struct xfs_scrub_context	*sc,
+	uint				dqtype)
+{
+	uint				flag;
+
+	flag = xfs_quota_chkd_flag(dqtype);
+	if (!(flag & sc->mp->m_qflags))
+		return;
+
+	sc->mp->m_qflags &= ~flag;
+	spin_lock(&sc->mp->m_sb_lock);
+	sc->mp->m_sb.sb_qflags &= ~flag;
+	spin_unlock(&sc->mp->m_sb_lock);
+	xfs_log_sb(sc->tp);
+}
+
+/*
+ * Attach dquots to this inode, or schedule quotacheck to fix them.
+ *
+ * This function ensures that the appropriate dquots are attached to an inode.
+ * We cannot allow the dquot code to allocate an on-disk dquot block here
+ * because we're already in transaction context with the inode locked.  The
+ * on-disk dquot should already exist anyway.  If the quota code signals
+ * corruption or missing quota information, schedule quotacheck, which will
+ * repair corruptions in the quota metadata.
+ */
+int
+xfs_repair_ino_dqattach(
+	struct xfs_scrub_context	*sc)
+{
+	int				error;
+
+	error = xfs_qm_dqattach_locked(sc->ip, false);
+	switch (error) {
+	case -EFSBADCRC:
+	case -EFSCORRUPTED:
+	case -ENOENT:
+		xfs_err_ratelimited(sc->mp,
+"inode %llu repair encountered quota error %d, quotacheck forced.",
+				(unsigned long long)sc->ip->i_ino, error);
+		if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
+			xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
+		if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
+			xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
+		if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
+			xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ);
+		/* fall through */
+	case -ESRCH:
+		error = 0;
+		break;
+	default:
+		break;
+	}
+
+	return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index c922ef06b894..e9213e7d653a 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -96,6 +96,8 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
 		struct xfs_buf *agf_bp,
 		struct xfs_repair_find_ag_btree *btree_info,
 		struct xfs_buf *agfl_bp);
+void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype);
+int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
 
 /* Metadata repairers */
 
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 1c79ebbe5236..3edf52b14919 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -48,6 +48,22 @@ struct xfs_trans;
 	 (XFS_IS_PQUOTA_ON(mp) && \
 		(mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
 
+static inline uint
+xfs_quota_chkd_flag(
+	uint		dqtype)
+{
+	switch (dqtype) {
+	case XFS_DQ_USER:
+		return XFS_UQUOTA_CHKD;
+	case XFS_DQ_GROUP:
+		return XFS_GQUOTA_CHKD;
+	case XFS_DQ_PROJ:
+		return XFS_PQUOTA_CHKD;
+	default:
+		return 0;
+	}
+}
+
 /*
  * The structure kept inside the xfs_trans_t keep track of dquot changes
  * within a transaction and apply them later.

From d25522f10cfa342a20867086a516c45c87b0877c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 29 May 2018 22:18:12 -0700
Subject: [PATCH 100/121] xfs: repair superblocks

If one of the backup superblocks is found to differ seriously from
superblock 0, write out a fresh copy from the in-core sb.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/Makefile                |  1 +
 fs/xfs/libxfs/xfs_sb.c         | 22 +++++++++++
 fs/xfs/libxfs/xfs_sb.h         |  3 ++
 fs/xfs/scrub/agheader_repair.c | 70 ++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.h          |  2 +
 fs/xfs/scrub/scrub.c           |  2 +-
 6 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 fs/xfs/scrub/agheader_repair.c

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d73444528d9b..e8d67a443bd7 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -168,6 +168,7 @@ xfs-$(CONFIG_XFS_QUOTA)		+= scrub/quota.o
 # online repair
 ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
 xfs-y				+= $(addprefix scrub/, \
+				   agheader_repair.o \
 				   repair.o \
 				   )
 endif
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index ea6c85a4c27c..d485e14313c6 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1097,3 +1097,25 @@ xfs_sb_read_secondary(
 	*bpp = bp;
 	return 0;
 }
+
+/* Get an uninitialised secondary superblock buffer. */
+int
+xfs_sb_get_secondary(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_buf		*bp;
+
+	ASSERT(agno != 0 && agno != NULLAGNUMBER);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0);
+	if (!bp)
+		return -ENOMEM;
+	bp->b_ops = &xfs_sb_buf_ops;
+	xfs_buf_oneshot(bp);
+	*bpp = bp;
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 03e0cc6bf3a6..244e0162c49e 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -50,5 +50,8 @@ extern int	xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
 extern int	xfs_sb_read_secondary(struct xfs_mount *mp,
 				struct xfs_trans *tp, xfs_agnumber_t agno,
 				struct xfs_buf **bpp);
+extern int	xfs_sb_get_secondary(struct xfs_mount *mp,
+				struct xfs_trans *tp, xfs_agnumber_t agno,
+				struct xfs_buf **bpp);
 
 #endif	/* __XFS_SB_H__ */
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
new file mode 100644
index 000000000000..8b91e9ebe1e7
--- /dev/null
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2018 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Superblock */
+
+/* Repair the superblock. */
+int
+xfs_repair_superblock(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*bp;
+	xfs_agnumber_t			agno;
+	int				error;
+
+	/* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
+	agno = sc->sm->sm_agno;
+	if (agno == 0)
+		return -EOPNOTSUPP;
+
+	error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
+	if (error)
+		return error;
+
+	/* Copy AG 0's superblock to this one. */
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+
+	/* Write this to disk. */
+	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
+	return error;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index e9213e7d653a..f2b0895294db 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -102,6 +102,7 @@ int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
 /* Metadata repairers */
 
 int xfs_repair_probe(struct xfs_scrub_context *sc);
+int xfs_repair_superblock(struct xfs_scrub_context *sc);
 
 #else
 
@@ -124,6 +125,7 @@ xfs_repair_calc_ag_resblks(
 }
 
 #define xfs_repair_probe		xfs_repair_notsupported
+#define xfs_repair_superblock		xfs_repair_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index c5999c28c20c..36db098ba583 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -216,7 +216,7 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
 		.type	= ST_PERAG,
 		.setup	= xfs_scrub_setup_fs,
 		.scrub	= xfs_scrub_superblock,
-		.repair	= xfs_repair_notsupported,
+		.repair	= xfs_repair_superblock,
 	},
 	[XFS_SCRUB_TYPE_AGF] = {	/* agf */
 		.type	= ST_PERAG,

From ba23cba9b3bdc967aabdc6ff1e3e9b11ce05bb4f Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 30 May 2018 13:03:45 -0700
Subject: [PATCH 101/121] fs: allow per-device dax status checking for
 filesystems

Change bdev_dax_supported so it takes a bdev parameter.  This enables
multi-device filesystems like xfs to check that a dax device can work for
the particular filesystem.  Once that's in place, actually fix all the
parts of XFS where we need to be able to distinguish between datadev and
rtdev.

This patch fixes the problem where we screw up the dax support checking
in xfs if the datadev and rtdev have different dax capabilities.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
[rez: Re-added __bdev_dax_supported() for !CONFIG_FS_DAX cases]
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
---
 drivers/dax/super.c | 26 +++++++++++++-------------
 fs/ext2/super.c     |  2 +-
 fs/ext4/super.c     |  2 +-
 fs/xfs/xfs_ioctl.c  |  3 ++-
 fs/xfs/xfs_iops.c   | 30 +++++++++++++++++++++++++-----
 fs/xfs/xfs_super.c  | 10 ++++++++--
 include/linux/dax.h |  9 +++++----
 7 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 2b2332b605e4..3943feb9a090 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -74,7 +74,7 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
 
 /**
  * __bdev_dax_supported() - Check if the device supports dax for filesystem
- * @sb: The superblock of the device
+ * @bdev: block device to check
  * @blocksize: The block size of the device
  *
  * This is a library function for filesystems to check if the block device
@@ -82,33 +82,33 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
  *
  * Return: negative errno if unsupported, 0 if supported.
  */
-int __bdev_dax_supported(struct super_block *sb, int blocksize)
+int __bdev_dax_supported(struct block_device *bdev, int blocksize)
 {
-	struct block_device *bdev = sb->s_bdev;
 	struct dax_device *dax_dev;
 	pgoff_t pgoff;
 	int err, id;
 	void *kaddr;
 	pfn_t pfn;
 	long len;
+	char buf[BDEVNAME_SIZE];
 
 	if (blocksize != PAGE_SIZE) {
-		pr_debug("VFS (%s): error: unsupported blocksize for dax\n",
-				sb->s_id);
+		pr_debug("%s: error: unsupported blocksize for dax\n",
+				bdevname(bdev, buf));
 		return -EINVAL;
 	}
 
 	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
 	if (err) {
-		pr_debug("VFS (%s): error: unaligned partition for dax\n",
-				sb->s_id);
+		pr_debug("%s: error: unaligned partition for dax\n",
+				bdevname(bdev, buf));
 		return err;
 	}
 
 	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 	if (!dax_dev) {
-		pr_debug("VFS (%s): error: device does not support dax\n",
-				sb->s_id);
+		pr_debug("%s: error: device does not support dax\n",
+				bdevname(bdev, buf));
 		return -EOPNOTSUPP;
 	}
 
@@ -119,8 +119,8 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 	put_dax(dax_dev);
 
 	if (len < 1) {
-		pr_debug("VFS (%s): error: dax access failed (%ld)\n",
-				sb->s_id, len);
+		pr_debug("%s: error: dax access failed (%ld)\n",
+				bdevname(bdev, buf), len);
 		return len < 0 ? len : -EIO;
 	}
 
@@ -137,8 +137,8 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 	} else if (pfn_t_devmap(pfn)) {
 		/* pass */;
 	} else {
-		pr_debug("VFS (%s): error: dax support not enabled\n",
-				sb->s_id);
+		pr_debug("%s: error: dax support not enabled\n",
+				bdevname(bdev, buf));
 		return -EOPNOTSUPP;
 	}
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index de1694512f1f..9627c3054b5c 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -961,7 +961,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
 	if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
-		err = bdev_dax_supported(sb, blocksize);
+		err = bdev_dax_supported(sb->s_bdev, blocksize);
 		if (err) {
 			ext2_msg(sb, KERN_ERR,
 				"DAX unsupported by block device. Turning off DAX.");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eb104e8476f0..089170e99895 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3732,7 +3732,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 					" that may contain inline data");
 			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
 		}
-		err = bdev_dax_supported(sb, blocksize);
+		err = bdev_dax_supported(sb->s_bdev, blocksize);
 		if (err) {
 			ext4_msg(sb, KERN_ERR,
 				"DAX unsupported by block device. Turning off DAX.");
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f9c97ea52961..c9a5b75a73a8 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1103,7 +1103,8 @@ xfs_ioctl_setattr_dax_invalidate(
 	if (fa->fsx_xflags & FS_XFLAG_DAX) {
 		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
 			return -EINVAL;
-		if (bdev_dax_supported(sb, sb->s_blocksize) < 0)
+		if (bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
+				sb->s_blocksize) < 0)
 			return -EINVAL;
 	}
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 5afe3c2234b3..9925d75411bb 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1195,6 +1195,30 @@ static const struct inode_operations xfs_inline_symlink_inode_operations = {
 	.update_time		= xfs_vn_update_time,
 };
 
+/* Figure out if this file actually supports DAX. */
+static bool
+xfs_inode_supports_dax(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	/* Only supported on non-reflinked files. */
+	if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip))
+		return false;
+
+	/* DAX mount option or DAX iflag must be set. */
+	if (!(mp->m_flags & XFS_MOUNT_DAX) &&
+	    !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
+		return false;
+
+	/* Block size must match page size */
+	if (mp->m_sb.sb_blocksize != PAGE_SIZE)
+		return false;
+
+	/* Device has to support DAX too. */
+	return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL;
+}
+
 STATIC void
 xfs_diflags_to_iflags(
 	struct inode		*inode,
@@ -1213,11 +1237,7 @@ xfs_diflags_to_iflags(
 		inode->i_flags |= S_SYNC;
 	if (flags & XFS_DIFLAG_NOATIME)
 		inode->i_flags |= S_NOATIME;
-	if (S_ISREG(inode->i_mode) &&
-	    ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
-	    !xfs_is_reflink_inode(ip) &&
-	    (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
-	     ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
+	if (xfs_inode_supports_dax(ip))
 		inode->i_flags |= S_DAX;
 }
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 39e5ec3d407f..fed63e0b8f4d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1701,11 +1701,17 @@ xfs_fs_fill_super(
 		sb->s_flags |= SB_I_VERSION;
 
 	if (mp->m_flags & XFS_MOUNT_DAX) {
+		int	error2 = 0;
+
 		xfs_warn(mp,
 		"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
 
-		error = bdev_dax_supported(sb, sb->s_blocksize);
-		if (error) {
+		error = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
+				sb->s_blocksize);
+		if (mp->m_rtdev_targp)
+			error2 = bdev_dax_supported(mp->m_rtdev_targp->bt_bdev,
+					sb->s_blocksize);
+		if (error && error2) {
 			xfs_alert(mp,
 			"DAX unsupported by block device. Turning off DAX.");
 			mp->m_flags &= ~XFS_MOUNT_DAX;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index f9eb22ad341e..2a4f7295c12b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -64,10 +64,10 @@ static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
 struct writeback_control;
 int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #if IS_ENABLED(CONFIG_FS_DAX)
-int __bdev_dax_supported(struct super_block *sb, int blocksize);
-static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
+int __bdev_dax_supported(struct block_device *bdev, int blocksize);
+static inline int bdev_dax_supported(struct block_device *bdev, int blocksize)
 {
-	return __bdev_dax_supported(sb, blocksize);
+	return __bdev_dax_supported(bdev, blocksize);
 }
 
 static inline struct dax_device *fs_dax_get_by_host(const char *host)
@@ -84,7 +84,8 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
 int dax_writeback_mapping_range(struct address_space *mapping,
 		struct block_device *bdev, struct writeback_control *wbc);
 #else
-static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
+static inline int bdev_dax_supported(struct block_device *bdev,
+		int blocksize)
 {
 	return -EOPNOTSUPP;
 }

From 80660f20252d6f76c9f203874ad7c7a4a8508cf8 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Wed, 30 May 2018 13:03:46 -0700
Subject: [PATCH 102/121] dax: change bdev_dax_supported() to support boolean
 returns

The function return values are confusing with the way the function is
named. We expect a true or false return value but it actually returns
0/-errno.  This makes the code very confusing. Changing the return values
to return a bool where if DAX is supported then return true and no DAX
support returns false.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 drivers/dax/super.c | 16 ++++++++--------
 fs/ext2/super.c     |  3 +--
 fs/ext4/super.c     |  3 +--
 fs/xfs/xfs_ioctl.c  |  4 ++--
 fs/xfs/xfs_super.c  | 12 ++++++------
 include/linux/dax.h |  8 ++++----
 6 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 3943feb9a090..1d7bd96511f0 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -80,9 +80,9 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
  * This is a library function for filesystems to check if the block device
  * can be mounted with dax option.
  *
- * Return: negative errno if unsupported, 0 if supported.
+ * Return: true if supported, false if unsupported
  */
-int __bdev_dax_supported(struct block_device *bdev, int blocksize)
+bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
 {
 	struct dax_device *dax_dev;
 	pgoff_t pgoff;
@@ -95,21 +95,21 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize)
 	if (blocksize != PAGE_SIZE) {
 		pr_debug("%s: error: unsupported blocksize for dax\n",
 				bdevname(bdev, buf));
-		return -EINVAL;
+		return false;
 	}
 
 	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
 	if (err) {
 		pr_debug("%s: error: unaligned partition for dax\n",
 				bdevname(bdev, buf));
-		return err;
+		return false;
 	}
 
 	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 	if (!dax_dev) {
 		pr_debug("%s: error: device does not support dax\n",
 				bdevname(bdev, buf));
-		return -EOPNOTSUPP;
+		return false;
 	}
 
 	id = dax_read_lock();
@@ -121,7 +121,7 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize)
 	if (len < 1) {
 		pr_debug("%s: error: dax access failed (%ld)\n",
 				bdevname(bdev, buf), len);
-		return len < 0 ? len : -EIO;
+		return false;
 	}
 
 	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
@@ -139,10 +139,10 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize)
 	} else {
 		pr_debug("%s: error: dax support not enabled\n",
 				bdevname(bdev, buf));
-		return -EOPNOTSUPP;
+		return false;
 	}
 
-	return 0;
+	return true;
 }
 EXPORT_SYMBOL_GPL(__bdev_dax_supported);
 #endif
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9627c3054b5c..c09289a42dc5 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -961,8 +961,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
 	if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
-		err = bdev_dax_supported(sb->s_bdev, blocksize);
-		if (err) {
+		if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
 			ext2_msg(sb, KERN_ERR,
 				"DAX unsupported by block device. Turning off DAX.");
 			sbi->s_mount_opt &= ~EXT2_MOUNT_DAX;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 089170e99895..2e1622907f4a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3732,8 +3732,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 					" that may contain inline data");
 			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
 		}
-		err = bdev_dax_supported(sb->s_bdev, blocksize);
-		if (err) {
+		if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
 			ext4_msg(sb, KERN_ERR,
 				"DAX unsupported by block device. Turning off DAX.");
 			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c9a5b75a73a8..5dd9e22b4a4c 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1103,8 +1103,8 @@ xfs_ioctl_setattr_dax_invalidate(
 	if (fa->fsx_xflags & FS_XFLAG_DAX) {
 		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
 			return -EINVAL;
-		if (bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
-				sb->s_blocksize) < 0)
+		if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
+				sb->s_blocksize))
 			return -EINVAL;
 	}
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fed63e0b8f4d..4e66e61865fb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1701,17 +1701,17 @@ xfs_fs_fill_super(
 		sb->s_flags |= SB_I_VERSION;
 
 	if (mp->m_flags & XFS_MOUNT_DAX) {
-		int	error2 = 0;
+		bool rtdev_is_dax = false, datadev_is_dax;
 
 		xfs_warn(mp,
 		"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
 
-		error = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
-				sb->s_blocksize);
+		datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
+			sb->s_blocksize);
 		if (mp->m_rtdev_targp)
-			error2 = bdev_dax_supported(mp->m_rtdev_targp->bt_bdev,
-					sb->s_blocksize);
-		if (error && error2) {
+			rtdev_is_dax = bdev_dax_supported(
+				mp->m_rtdev_targp->bt_bdev, sb->s_blocksize);
+		if (!rtdev_is_dax && !datadev_is_dax) {
 			xfs_alert(mp,
 			"DAX unsupported by block device. Turning off DAX.");
 			mp->m_flags &= ~XFS_MOUNT_DAX;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 2a4f7295c12b..c99692ddd4b5 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -64,8 +64,8 @@ static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
 struct writeback_control;
 int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #if IS_ENABLED(CONFIG_FS_DAX)
-int __bdev_dax_supported(struct block_device *bdev, int blocksize);
-static inline int bdev_dax_supported(struct block_device *bdev, int blocksize)
+bool __bdev_dax_supported(struct block_device *bdev, int blocksize);
+static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize)
 {
 	return __bdev_dax_supported(bdev, blocksize);
 }
@@ -84,10 +84,10 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
 int dax_writeback_mapping_range(struct address_space *mapping,
 		struct block_device *bdev, struct writeback_control *wbc);
 #else
-static inline int bdev_dax_supported(struct block_device *bdev,
+static inline bool bdev_dax_supported(struct block_device *bdev,
 		int blocksize)
 {
-	return -EOPNOTSUPP;
+	return false;
 }
 
 static inline struct dax_device *fs_dax_get_by_host(const char *host)

From 2483113f3d7baa303deac115e6764d9489c8316b Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 31 May 2018 09:07:20 -0700
Subject: [PATCH 103/121] xfs: xfs_rtword_t should be unsigned, not signed

xfs_rtword_t is used for bit manipulations in the realtime bitmap file.
Since we're performing bit shifts with this type, we don't want sign
extension and we don't want to be left shifting negative quantities
because that's undefined behavior.

This also shuts up these UBSAN warnings:
UBSAN: Undefined behaviour in fs/xfs/libxfs/xfs_rtbitmap.c:833:48
signed integer overflow:
-2147483648 - 1 cannot be represented in type 'int'

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Bill O'Donnell <billodo@redhat.com>
---
 fs/xfs/libxfs/xfs_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 3c560695c546..ea18449bd732 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -30,7 +30,7 @@ typedef int64_t		xfs_fsize_t;	/* bytes in a file */
 typedef uint64_t	xfs_ufsize_t;	/* unsigned bytes in a file */
 
 typedef int32_t		xfs_suminfo_t;	/* type of bitmap summary info */
-typedef int32_t		xfs_rtword_t;	/* word type for bitmap manipulations */
+typedef uint32_t	xfs_rtword_t;	/* word type for bitmap manipulations */
 
 typedef int64_t		xfs_lsn_t;	/* log sequence number */
 typedef int32_t		xfs_tid_t;	/* transaction identifier */

From a03f1641c7a6d4e88c6aae0cd3d52305cdb967a0 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 31 May 2018 09:07:20 -0700
Subject: [PATCH 104/121] xfs: xfs_rtbuf_get should check the bmapi_read
 results

The xfs_rtbuf_get function should check the block mapping it gets back
from bmapi_read.  If there are no mappings or the mapping isn't a real
extent, we should return -EFSCORRUPTED rather than trying to read a
garbage value.  We also require realtime bitmap blocks to be real,
written allocations.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Bill O'Donnell <billodo@redhat.com>
---
 fs/xfs/libxfs/xfs_rtbitmap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 106be2d0bb88..7712f282d172 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -90,6 +90,9 @@ xfs_rtbuf_get(
 	if (error)
 		return error;
 
+	if (nmap == 0 || !xfs_bmap_is_real_extent(&map))
+		return -EFSCORRUPTED;
+
 	ASSERT(map.br_startblock != NULLFSBLOCK);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 				   XFS_FSB_TO_DADDR(mp, map.br_startblock),

From 8ad560d2565e64b8be0cf5901c1e8fe034ac5599 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 31 May 2018 09:07:21 -0700
Subject: [PATCH 105/121] xfs: strengthen rtalloc query range checks

Strengthen the rtalloc range query checks to make sure that the keys do
not run off the end of the realtime device inappropriately.  Note that
the query range functions require units of rt extents, not blocks,
despite the type name.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Bill O'Donnell <billodo@redhat.com>
---
 fs/xfs/libxfs/xfs_rtbitmap.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 7712f282d172..1855182c11ec 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1038,8 +1038,11 @@ xfs_rtalloc_query_range(
 
 	if (low_rec->ar_startblock > high_rec->ar_startblock)
 		return -EINVAL;
-	else if (low_rec->ar_startblock == high_rec->ar_startblock)
+	if (low_rec->ar_startblock >= mp->m_sb.sb_rextents ||
+	    low_rec->ar_startblock == high_rec->ar_startblock)
 		return 0;
+	if (high_rec->ar_startblock >= mp->m_sb.sb_rextents)
+		high_rec->ar_startblock = mp->m_sb.sb_rextents - 1;
 
 	/* Iterate the bitmap, looking for discrepancies. */
 	rtstart = low_rec->ar_startblock;
@@ -1083,7 +1086,7 @@ xfs_rtalloc_query_all(
 	struct xfs_rtalloc_rec		keys[2];
 
 	keys[0].ar_startblock = 0;
-	keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks;
+	keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rextents - 1;
 	keys[0].ar_blockcount = keys[1].ar_blockcount = 0;
 
 	return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);

From a0e5c435babd8bb61612d2e4e9b098cacdd825f7 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 31 May 2018 09:12:10 -0700
Subject: [PATCH 106/121] xfs: fix xfs_rtalloc_rec units

All the realtime allocation functions deal with space on the rtdev in
units of realtime extents.  However, struct xfs_rtalloc_rec confusingly
uses the word 'block' in the name, even though they're really extents.

Fix the naming problem and fix all the unit handling problems in the two
existing users.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Bill O'Donnell <billodo@redhat.com>
---
 fs/xfs/libxfs/xfs_rtbitmap.c | 26 +++++++++++++-------------
 fs/xfs/scrub/rtbitmap.c      | 23 ++++++++++++++++++-----
 fs/xfs/xfs_fsmap.c           | 14 ++++++++------
 fs/xfs/xfs_rtalloc.h         |  9 +++++++--
 4 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 1855182c11ec..369eeb7a52ec 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1036,17 +1036,17 @@ xfs_rtalloc_query_range(
 	int				is_free;
 	int				error = 0;
 
-	if (low_rec->ar_startblock > high_rec->ar_startblock)
+	if (low_rec->ar_startext > high_rec->ar_startext)
 		return -EINVAL;
-	if (low_rec->ar_startblock >= mp->m_sb.sb_rextents ||
-	    low_rec->ar_startblock == high_rec->ar_startblock)
+	if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
+	    low_rec->ar_startext == high_rec->ar_startext)
 		return 0;
-	if (high_rec->ar_startblock >= mp->m_sb.sb_rextents)
-		high_rec->ar_startblock = mp->m_sb.sb_rextents - 1;
+	if (high_rec->ar_startext >= mp->m_sb.sb_rextents)
+		high_rec->ar_startext = mp->m_sb.sb_rextents - 1;
 
 	/* Iterate the bitmap, looking for discrepancies. */
-	rtstart = low_rec->ar_startblock;
-	rem = high_rec->ar_startblock - rtstart;
+	rtstart = low_rec->ar_startext;
+	rem = high_rec->ar_startext - rtstart;
 	while (rem) {
 		/* Is the first block free? */
 		error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
@@ -1056,13 +1056,13 @@ xfs_rtalloc_query_range(
 
 		/* How long does the extent go for? */
 		error = xfs_rtfind_forw(mp, tp, rtstart,
-				high_rec->ar_startblock - 1, &rtend);
+				high_rec->ar_startext - 1, &rtend);
 		if (error)
 			break;
 
 		if (is_free) {
-			rec.ar_startblock = rtstart;
-			rec.ar_blockcount = rtend - rtstart + 1;
+			rec.ar_startext = rtstart;
+			rec.ar_extcount = rtend - rtstart + 1;
 
 			error = fn(tp, &rec, priv);
 			if (error)
@@ -1085,9 +1085,9 @@ xfs_rtalloc_query_all(
 {
 	struct xfs_rtalloc_rec		keys[2];
 
-	keys[0].ar_startblock = 0;
-	keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rextents - 1;
-	keys[0].ar_blockcount = keys[1].ar_blockcount = 0;
+	keys[0].ar_startext = 0;
+	keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1;
+	keys[0].ar_extcount = keys[1].ar_extcount = 0;
 
 	return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
 }
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 0fa3ef5c83b8..40f462a11ea5 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -66,11 +66,15 @@ xfs_scrub_rtbitmap_rec(
 	void				*priv)
 {
 	struct xfs_scrub_context	*sc = priv;
+	xfs_rtblock_t			startblock;
+	xfs_rtblock_t			blockcount;
 
-	if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock ||
-	    !xfs_verify_rtbno(sc->mp, rec->ar_startblock) ||
-	    !xfs_verify_rtbno(sc->mp, rec->ar_startblock +
-			rec->ar_blockcount - 1))
+	startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize;
+	blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize;
+
+	if (startblock + blockcount <= startblock ||
+	    !xfs_verify_rtbno(sc->mp, startblock) ||
+	    !xfs_verify_rtbno(sc->mp, startblock + blockcount - 1))
 		xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
 	return 0;
 }
@@ -139,14 +143,23 @@ xfs_scrub_xref_is_used_rt_space(
 	xfs_rtblock_t			fsbno,
 	xfs_extlen_t			len)
 {
+	xfs_rtblock_t			startext;
+	xfs_rtblock_t			endext;
+	xfs_rtblock_t			extcount;
 	bool				is_free;
 	int				error;
 
 	if (xfs_scrub_skip_xref(sc->sm))
 		return;
 
+	startext = fsbno;
+	endext = fsbno + len - 1;
+	do_div(startext, sc->mp->m_sb.sb_rextsize);
+	if (do_div(endext, sc->mp->m_sb.sb_rextsize))
+		endext++;
+	extcount = endext - startext;
 	xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
-	error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len,
+	error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
 			&is_free);
 	if (!xfs_scrub_should_check_xref(sc, &error, NULL))
 		goto out_unlock;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 43cfc07996a4..0299febece9c 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -465,10 +465,9 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
 	struct xfs_rmap_irec		irec;
 	xfs_daddr_t			rec_daddr;
 
-	rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
-
-	irec.rm_startblock = rec->ar_startblock;
-	irec.rm_blockcount = rec->ar_blockcount;
+	irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+	rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock);
+	irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
 	irec.rm_owner = XFS_RMAP_OWN_NULL;	/* "free" */
 	irec.rm_offset = 0;
 	irec.rm_flags = 0;
@@ -534,8 +533,11 @@ xfs_getfsmap_rtdev_rtbitmap_query(
 
 	xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
 
-	alow.ar_startblock = info->low.rm_startblock;
-	ahigh.ar_startblock = info->high.rm_startblock;
+	alow.ar_startext = info->low.rm_startblock;
+	ahigh.ar_startext = info->high.rm_startblock;
+	do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize);
+	if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize))
+		ahigh.ar_startext++;
 	error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
 			xfs_getfsmap_rtdev_rtbitmap_helper, info);
 	if (error)
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index dfee3c991155..52632ab727f7 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,9 +23,14 @@
 struct xfs_mount;
 struct xfs_trans;
 
+/*
+ * XXX: Most of the realtime allocation functions deal in units of realtime
+ * extents, not realtime blocks.  This looks funny when paired with the type
+ * name and screams for a larger cleanup.
+ */
 struct xfs_rtalloc_rec {
-	xfs_rtblock_t		ar_startblock;
-	xfs_rtblock_t		ar_blockcount;
+	xfs_rtblock_t		ar_startext;
+	xfs_rtblock_t		ar_extcount;
 };
 
 typedef int (*xfs_rtalloc_query_range_fn)(

From 16858f7c21156e676c98a5f6532f68cacbde3784 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 31 May 2018 16:49:00 -0700
Subject: [PATCH 107/121] xfs: fix error handling in xfs_refcount_insert()

generic/475 fired an assert failure just after the filesystem was
shut down:

XFS: Assertion failed: fs_is_ok, file: fs/xfs/libxfs/xfs_refcount.c, line: 182
.....
Call Trace:
 xfs_refcount_insert+0x151/0x190
 xfs_refcount_adjust_extents.constprop.11+0x9c/0x470
 xfs_refcount_adjust.constprop.10+0xb0/0x270
 xfs_refcount_finish_one+0x25a/0x420
 xfs_trans_log_finish_refcount_update+0x2a/0x40
 xfs_refcount_update_finish_item+0x35/0xa0
 xfs_defer_finish+0x15e/0x4d0
 xfs_reflink_remap_extent+0x1bc/0x610
 xfs_reflink_remap_blocks+0x6e/0x280
 xfs_reflink_remap_range+0x311/0x530
 vfs_clone_file_range+0x119/0x200
 ....

If xfs_btree_insert() returns an error, the corruption check fires
instead of passing the error back the caller. The corruption check
should be after we've checked for an error, not before, thereby
avoiding assert failures if the filesystem shuts down during a
refcount btree record insert.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_refcount.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 0f23371227cc..418d53295893 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -179,7 +179,10 @@ xfs_refcount_insert(
 	cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
 	cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
 	error = xfs_btree_insert(cur, i);
+	if (error)
+		goto out_error;
 	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+
 out_error:
 	if (error)
 		trace_xfs_refcount_insert_error(cur->bc_mp,

From 0aa69fd32a5f766e997ca8ab4723c5a1146efa8b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:05 -0700
Subject: [PATCH 108/121] block: add a lower-level bio_add_page interface

For the upcoming removal of buffer heads in XFS we need to keep track of
the number of outstanding writeback requests per page.  For this we need
to know if bio_add_page merged a region with the previous bvec or not.
Instead of adding additional arguments this refactors bio_add_page to
be implemented using three lower level helpers which users like XFS can
use directly if they care about the merge decisions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 block/bio.c         | 98 +++++++++++++++++++++++++++++----------------
 include/linux/bio.h |  9 +++++
 2 files changed, 73 insertions(+), 34 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 53e0f0a1ed94..fdf635d42bbd 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -773,7 +773,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
 			return 0;
 	}
 
-	if (bio->bi_vcnt >= bio->bi_max_vecs)
+	if (bio_full(bio))
 		return 0;
 
 	/*
@@ -820,6 +820,65 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
 }
 EXPORT_SYMBOL(bio_add_pc_page);
 
+/**
+ * __bio_try_merge_page - try appending data to an existing bvec.
+ * @bio: destination bio
+ * @page: page to add
+ * @len: length of the data to add
+ * @off: offset of the data in @page
+ *
+ * Try to add the data at @page + @off to the last bvec of @bio.  This is a
+ * a useful optimisation for file systems with a block size smaller than the
+ * page size.
+ *
+ * Return %true on success or %false on failure.
+ */
+bool __bio_try_merge_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int off)
+{
+	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+		return false;
+
+	if (bio->bi_vcnt > 0) {
+		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
+			bv->bv_len += len;
+			bio->bi_iter.bi_size += len;
+			return true;
+		}
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(__bio_try_merge_page);
+
+/**
+ * __bio_add_page - add page to a bio in a new segment
+ * @bio: destination bio
+ * @page: page to add
+ * @len: length of the data to add
+ * @off: offset of the data in @page
+ *
+ * Add the data at @page + @off to @bio as a new bvec.  The caller must ensure
+ * that @bio has space for another bvec.
+ */
+void __bio_add_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int off)
+{
+	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
+
+	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+	WARN_ON_ONCE(bio_full(bio));
+
+	bv->bv_page = page;
+	bv->bv_offset = off;
+	bv->bv_len = len;
+
+	bio->bi_iter.bi_size += len;
+	bio->bi_vcnt++;
+}
+EXPORT_SYMBOL_GPL(__bio_add_page);
+
 /**
  *	bio_add_page	-	attempt to add page to bio
  *	@bio: destination bio
@@ -833,40 +892,11 @@ EXPORT_SYMBOL(bio_add_pc_page);
 int bio_add_page(struct bio *bio, struct page *page,
 		 unsigned int len, unsigned int offset)
 {
-	struct bio_vec *bv;
-
-	/*
-	 * cloned bio must not modify vec list
-	 */
-	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
-		return 0;
-
-	/*
-	 * For filesystems with a blocksize smaller than the pagesize
-	 * we will often be called with the same page as last time and
-	 * a consecutive offset.  Optimize this special case.
-	 */
-	if (bio->bi_vcnt > 0) {
-		bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
-		if (page == bv->bv_page &&
-		    offset == bv->bv_offset + bv->bv_len) {
-			bv->bv_len += len;
-			goto done;
-		}
+	if (!__bio_try_merge_page(bio, page, len, offset)) {
+		if (bio_full(bio))
+			return 0;
+		__bio_add_page(bio, page, len, offset);
 	}
-
-	if (bio->bi_vcnt >= bio->bi_max_vecs)
-		return 0;
-
-	bv		= &bio->bi_io_vec[bio->bi_vcnt];
-	bv->bv_page	= page;
-	bv->bv_len	= len;
-	bv->bv_offset	= offset;
-
-	bio->bi_vcnt++;
-done:
-	bio->bi_iter.bi_size += len;
 	return len;
 }
 EXPORT_SYMBOL(bio_add_page);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ce547a25e8ae..3e73c8bc25ea 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -123,6 +123,11 @@ static inline void *bio_data(struct bio *bio)
 	return NULL;
 }
 
+static inline bool bio_full(struct bio *bio)
+{
+	return bio->bi_vcnt >= bio->bi_max_vecs;
+}
+
 /*
  * will die
  */
@@ -470,6 +475,10 @@ void bio_chain(struct bio *, struct bio *);
 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
+bool __bio_try_merge_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int off);
+void __bio_add_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,

From 836978b35fcd402a31323a16ac0b4c8242890a84 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:05 -0700
Subject: [PATCH 109/121] mm: give the 'ret' variable a better name
 __do_page_cache_readahead

It counts the number of pages acted on, so name it nr_pages to make that
obvious.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 mm/readahead.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index 539bbb6c1fad..16d0cb1e2616 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -156,7 +156,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	unsigned long end_index;	/* The last page we want to read */
 	LIST_HEAD(page_pool);
 	int page_idx;
-	int ret = 0;
+	int nr_pages = 0;
 	loff_t isize = i_size_read(inode);
 	gfp_t gfp_mask = readahead_gfp_mask(mapping);
 
@@ -187,7 +187,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 		list_add(&page->lru, &page_pool);
 		if (page_idx == nr_to_read - lookahead_size)
 			SetPageReadahead(page);
-		ret++;
+		nr_pages++;
 	}
 
 	/*
@@ -195,11 +195,11 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	 * uptodate then the caller will launch readpage again, and
 	 * will then handle the error.
 	 */
-	if (ret)
-		read_pages(mapping, filp, &page_pool, ret, gfp_mask);
+	if (nr_pages)
+		read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
 	BUG_ON(!list_empty(&page_pool));
 out:
-	return ret;
+	return nr_pages;
 }
 
 /*

From c534aa3fdd149fab18b094375f334b4bb3635cbf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:05 -0700
Subject: [PATCH 110/121] mm: return an unsigned int from
 __do_page_cache_readahead

We never return an error, so switch to returning an unsigned int.  Most
callers already did implicit casts to an unsigned type, and the one that
didn't can be simplified now.

Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 mm/internal.h  |  2 +-
 mm/readahead.c | 15 +++++----------
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 62d8c34e63d5..954003ac766a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -53,7 +53,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 			     unsigned long addr, unsigned long end,
 			     struct zap_details *details);
 
-extern int __do_page_cache_readahead(struct address_space *mapping,
+extern unsigned int __do_page_cache_readahead(struct address_space *mapping,
 		struct file *filp, pgoff_t offset, unsigned long nr_to_read,
 		unsigned long lookahead_size);
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 16d0cb1e2616..fa4d4b767130 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -147,16 +147,16 @@ out:
  *
  * Returns the number of pages requested, or the maximum amount of I/O allowed.
  */
-int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
-			pgoff_t offset, unsigned long nr_to_read,
-			unsigned long lookahead_size)
+unsigned int __do_page_cache_readahead(struct address_space *mapping,
+		struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+		unsigned long lookahead_size)
 {
 	struct inode *inode = mapping->host;
 	struct page *page;
 	unsigned long end_index;	/* The last page we want to read */
 	LIST_HEAD(page_pool);
 	int page_idx;
-	int nr_pages = 0;
+	unsigned int nr_pages = 0;
 	loff_t isize = i_size_read(inode);
 	gfp_t gfp_mask = readahead_gfp_mask(mapping);
 
@@ -223,16 +223,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
 	nr_to_read = min(nr_to_read, max_pages);
 	while (nr_to_read) {
-		int err;
-
 		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
 
 		if (this_chunk > nr_to_read)
 			this_chunk = nr_to_read;
-		err = __do_page_cache_readahead(mapping, filp,
-						offset, this_chunk, 0);
-		if (err < 0)
-			return err;
+		__do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);
 
 		offset += this_chunk;
 		nr_to_read -= this_chunk;

From b3751e6ab45a3b92da1e4acd42ada7b6a4122f2b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:06 -0700
Subject: [PATCH 111/121] mm: split ->readpages calls to avoid non-contiguous
 pages lists

That way file systems don't have to go spotting for non-contiguous pages
and work around them.  It also kicks off I/O earlier, allowing it to
finish earlier and reduce latency.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 mm/readahead.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index fa4d4b767130..e273f0de3376 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -140,8 +140,8 @@ out:
 }
 
 /*
- * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates all
- * the pages first, then submits them all for I/O. This avoids the very bad
+ * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
+ * the pages first, then submits them for I/O. This avoids the very bad
  * behaviour which would occur if page allocations are causing VM writeback.
  * We really don't want to intermingle reads and writes like that.
  *
@@ -177,8 +177,18 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
 		rcu_read_lock();
 		page = radix_tree_lookup(&mapping->i_pages, page_offset);
 		rcu_read_unlock();
-		if (page && !radix_tree_exceptional_entry(page))
+		if (page && !radix_tree_exceptional_entry(page)) {
+			/*
+			 * Page already present?  Kick off the current batch of
+			 * contiguous pages before continuing with the next
+			 * batch.
+			 */
+			if (nr_pages)
+				read_pages(mapping, filp, &page_pool, nr_pages,
+						gfp_mask);
+			nr_pages = 0;
 			continue;
+		}
 
 		page = __page_cache_alloc(gfp_mask);
 		if (!page)

From 19319b53210c6b89c375cf395c08f156cccd83ea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:06 -0700
Subject: [PATCH 112/121] iomap: inline data should be an iomap type, not a
 flag

Inline data is fundamentally different from our normal mapped case in that
it doesn't even have a block address.  So instead of having a flag for it
it should be an entirely separate iomap range type.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/ext4/inline.c      |  4 ++--
 fs/gfs2/bmap.c        |  3 +--
 fs/iomap.c            | 21 ++++++++++++---------
 include/linux/iomap.h |  2 +-
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 70cf4c7b268a..e1f00891ef95 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1835,8 +1835,8 @@ int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap)
 	iomap->offset = 0;
 	iomap->length = min_t(loff_t, ext4_get_inline_size(inode),
 			      i_size_read(inode));
-	iomap->type = 0;
-	iomap->flags = IOMAP_F_DATA_INLINE;
+	iomap->type = IOMAP_INLINE;
+	iomap->flags = 0;
 
 out:
 	up_read(&EXT4_I(inode)->xattr_sem);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 278ed0869c3c..cbeedd3cfb36 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -680,8 +680,7 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
 		      sizeof(struct gfs2_dinode);
 	iomap->offset = 0;
 	iomap->length = i_size_read(inode);
-	iomap->type = IOMAP_MAPPED;
-	iomap->flags = IOMAP_F_DATA_INLINE;
+	iomap->type = IOMAP_INLINE;
 }
 
 /**
diff --git a/fs/iomap.c b/fs/iomap.c
index f2456d0d8ddd..df2652b0d85d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -502,10 +502,13 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
 	case IOMAP_DELALLOC:
 		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
 		break;
+	case IOMAP_MAPPED:
+		break;
 	case IOMAP_UNWRITTEN:
 		flags |= FIEMAP_EXTENT_UNWRITTEN;
 		break;
-	case IOMAP_MAPPED:
+	case IOMAP_INLINE:
+		flags |= FIEMAP_EXTENT_DATA_INLINE;
 		break;
 	}
 
@@ -513,8 +516,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
 		flags |= FIEMAP_EXTENT_MERGED;
 	if (iomap->flags & IOMAP_F_SHARED)
 		flags |= FIEMAP_EXTENT_SHARED;
-	if (iomap->flags & IOMAP_F_DATA_INLINE)
-		flags |= FIEMAP_EXTENT_DATA_INLINE;
 
 	return fiemap_fill_next_extent(fi, iomap->offset,
 			iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
@@ -1214,14 +1215,16 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
 	struct iomap_swapfile_info *isi = data;
 	int error;
 
-	/* No inline data. */
-	if (iomap->flags & IOMAP_F_DATA_INLINE) {
+	switch (iomap->type) {
+	case IOMAP_MAPPED:
+	case IOMAP_UNWRITTEN:
+		/* Only real or unwritten extents. */
+		break;
+	case IOMAP_INLINE:
+		/* No inline data. */
 		pr_err("swapon: file is inline\n");
 		return -EINVAL;
-	}
-
-	/* Only real or unwritten extents. */
-	if (iomap->type != IOMAP_MAPPED && iomap->type != IOMAP_UNWRITTEN) {
+	default:
 		pr_err("swapon: file has unallocated extents\n");
 		return -EINVAL;
 	}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4bd87294219a..8f7095fc514e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -18,6 +18,7 @@ struct vm_fault;
 #define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
 #define IOMAP_MAPPED	0x03	/* blocks allocated at @addr */
 #define IOMAP_UNWRITTEN	0x04	/* blocks allocated at @addr in unwritten state */
+#define IOMAP_INLINE	0x05	/* data inline in the inode */
 
 /*
  * Flags for all iomap mappings:
@@ -34,7 +35,6 @@ struct vm_fault;
  */
 #define IOMAP_F_MERGED		0x10	/* contains multiple blocks/extents */
 #define IOMAP_F_SHARED		0x20	/* block shared with another file */
-#define IOMAP_F_DATA_INLINE	0x40	/* data inline in the inode */
 
 /*
  * Magic value for addr:

From 9ecac0ef2233edcb418cc17f9fcbe51ddb0c696c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:07 -0700
Subject: [PATCH 113/121] iomap: fix the comment describing IOMAP_NOWAIT

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 include/linux/iomap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8f7095fc514e..13d19b4c29a9 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -59,7 +59,7 @@ struct iomap {
 #define IOMAP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
 #define IOMAP_FAULT		(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT		(1 << 4) /* direct I/O */
-#define IOMAP_NOWAIT		(1 << 5) /* Don't wait for writeback */
+#define IOMAP_NOWAIT		(1 << 5) /* do not block */
 
 struct iomap_ops {
 	/*

From 7ee66c03e40a570cbf641ff83c063f5209eb22b2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:07 -0700
Subject: [PATCH 114/121] iomap: move IOMAP_F_BOUNDARY to gfs2

Just define a range of fs specific flags and use that in gfs2 instead of
exposing this internal flag globally.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/gfs2/bmap.c        | 8 +++++---
 include/linux/iomap.h | 9 +++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index cbeedd3cfb36..8efa6297e19c 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -683,6 +683,8 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
 	iomap->type = IOMAP_INLINE;
 }
 
+#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
+
 /**
  * gfs2_iomap_begin - Map blocks from an inode to disk blocks
  * @inode: The inode
@@ -774,7 +776,7 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 	bh = mp.mp_bh[ip->i_height - 1];
 	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
 	if (eob)
-		iomap->flags |= IOMAP_F_BOUNDARY;
+		iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
 	iomap->length = (u64)len << inode->i_blkbits;
 
 out_release:
@@ -846,12 +848,12 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 
 	if (iomap.length > bh_map->b_size) {
 		iomap.length = bh_map->b_size;
-		iomap.flags &= ~IOMAP_F_BOUNDARY;
+		iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
 	}
 	if (iomap.addr != IOMAP_NULL_ADDR)
 		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
 	bh_map->b_size = iomap.length;
-	if (iomap.flags & IOMAP_F_BOUNDARY)
+	if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
 		set_buffer_boundary(bh_map);
 	if (iomap.flags & IOMAP_F_NEW)
 		set_buffer_new(bh_map);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 13d19b4c29a9..819e0cd2a950 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -27,8 +27,7 @@ struct vm_fault;
  * written data and requires fdatasync to commit them to persistent storage.
  */
 #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
-#define IOMAP_F_BOUNDARY	0x02	/* mapping ends at metadata boundary */
-#define IOMAP_F_DIRTY		0x04	/* uncommitted metadata */
+#define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
@@ -36,6 +35,12 @@ struct vm_fault;
 #define IOMAP_F_MERGED		0x10	/* contains multiple blocks/extents */
 #define IOMAP_F_SHARED		0x20	/* block shared with another file */
 
+/*
+ * Flags from 0x1000 up are for file system specific usage:
+ */
+#define IOMAP_F_PRIVATE		0x1000
+
+
 /*
  * Magic value for addr:
  */

From 6533b4e40d8be5f07336fc62ddaf35b1fb4028be Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:07 -0700
Subject: [PATCH 115/121] iomap: use __bio_add_page in iomap_dio_zero

We don't need any merging logic, and this also replaces a BUG_ON with a
WARN_ON_ONCE inside __bio_add_page for the impossible overflow condition.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index df2652b0d85d..85901b449146 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -845,8 +845,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
 	get_page(page);
-	if (bio_add_page(bio, page, len, 0) != len)
-		BUG();
+	__bio_add_page(bio, page, len, 0);
 	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
 
 	atomic_inc(&dio->ref);

From 57fc505d119d6e6bdf9b9b713215616a8cd2b8de Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:08 -0700
Subject: [PATCH 116/121] iomap: add a iomap_sector helper

Factor the repeated calculation of the on-disk sector for a given logical
block into a littler helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 85901b449146..74cdf8b5bbb0 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -96,6 +96,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 	return written ? written : ret;
 }
 
+static sector_t
+iomap_sector(struct iomap *iomap, loff_t pos)
+{
+	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -353,11 +359,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
 		struct iomap *iomap)
 {
-	sector_t sector = (iomap->addr +
-			   (pos & PAGE_MASK) - iomap->offset) >> 9;
-
-	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
-			offset, bytes);
+	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
+			iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
 }
 
 static loff_t
@@ -839,8 +842,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 
 	bio = bio_alloc(GFP_KERNEL, 1);
 	bio_set_dev(bio, iomap->bdev);
-	bio->bi_iter.bi_sector =
-		(iomap->addr + pos - iomap->offset) >> 9;
+	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 	bio->bi_private = dio;
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
@@ -934,8 +936,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
 		bio_set_dev(bio, iomap->bdev);
-		bio->bi_iter.bi_sector =
-			(iomap->addr + pos - iomap->offset) >> 9;
+		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
 		bio->bi_write_hint = dio->iocb->ki_hint;
 		bio->bi_private = dio;
 		bio->bi_end_io = iomap_dio_bio_end_io;

From 89eb1906a9530ef694b2a5817c9498997722754f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:08 -0700
Subject: [PATCH 117/121] iomap: add an iomap-based bmap implementation

This adds a simple iomap-based implementation of the legacy ->bmap
interface.  Note that we can't easily add checks for rt or reflink
files, so these will have to remain in the callers.  This interface
just needs to die..

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c            | 34 ++++++++++++++++++++++++++++++++++
 include/linux/iomap.h |  3 +++
 2 files changed, 37 insertions(+)

diff --git a/fs/iomap.c b/fs/iomap.c
index 74cdf8b5bbb0..b0bc928672af 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1307,3 +1307,37 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
 }
 EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
 #endif /* CONFIG_SWAP */
+
+static loff_t
+iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	sector_t *bno = data, addr;
+
+	if (iomap->type == IOMAP_MAPPED) {
+		addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
+		if (addr > INT_MAX)
+			WARN(1, "would truncate bmap result\n");
+		else
+			*bno = addr;
+	}
+	return 0;
+}
+
+/* legacy ->bmap interface.  0 is the error return (!) */
+sector_t
+iomap_bmap(struct address_space *mapping, sector_t bno,
+		const struct iomap_ops *ops)
+{
+	struct inode *inode = mapping->host;
+	loff_t pos = bno >> inode->i_blkbits;
+	unsigned blocksize = i_blocksize(inode);
+
+	if (filemap_write_and_wait(mapping))
+		return 0;
+
+	bno = 0;
+	iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
+	return bno;
+}
+EXPORT_SYMBOL_GPL(iomap_bmap);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 819e0cd2a950..a044a824da85 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 
+struct address_space;
 struct fiemap_extent_info;
 struct inode;
 struct iov_iter;
@@ -100,6 +101,8 @@ loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
 		const struct iomap_ops *ops);
 loff_t iomap_seek_data(struct inode *inode, loff_t offset,
 		const struct iomap_ops *ops);
+sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
+		const struct iomap_ops *ops);
 
 /*
  * Flags for direct I/O ->end_io:

From b84e772299dc5b9da22a40ba87ed519a891397a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:03:09 -0700
Subject: [PATCH 118/121] xfs: use iomap_bmap

Switch to the iomap based bmap implementation to get rid of one of the
last users of xfs_get_blocks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_aops.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 80de476cecf8..56e405572909 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1378,10 +1378,9 @@ xfs_vm_bmap(
 	struct address_space	*mapping,
 	sector_t		block)
 {
-	struct inode		*inode = (struct inode *)mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_inode	*ip = XFS_I(mapping->host);
 
-	trace_xfs_vm_bmap(XFS_I(inode));
+	trace_xfs_vm_bmap(ip);
 
 	/*
 	 * The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1394,9 +1393,7 @@ xfs_vm_bmap(
 	 */
 	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 		return 0;
-
-	filemap_write_and_wait(mapping);
-	return generic_block_bmap(mapping, block, xfs_get_blocks);
+	return iomap_bmap(mapping, block, &xfs_iomap_ops);
 }
 
 STATIC int

From 8a78cb1f1b98e5ea970674e0f049832d19e76ace Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:04:40 -0700
Subject: [PATCH 119/121] fs: move page_cache_seek_hole_data to iomap.c

This function is only used by the iomap code, depends on being called
from it, and will soon stop poking into buffer head internals.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/buffer.c                 | 114 -----------------------------------
 fs/iomap.c                  | 116 ++++++++++++++++++++++++++++++++++++
 include/linux/buffer_head.h |   2 -
 3 files changed, 116 insertions(+), 116 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 249b83fafe48..cabc045f483d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3427,120 +3427,6 @@ int bh_submit_read(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(bh_submit_read);
 
-/*
- * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
- *
- * Returns the offset within the file on success, and -ENOENT otherwise.
- */
-static loff_t
-page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
-{
-	loff_t offset = page_offset(page);
-	struct buffer_head *bh, *head;
-	bool seek_data = whence == SEEK_DATA;
-
-	if (lastoff < offset)
-		lastoff = offset;
-
-	bh = head = page_buffers(page);
-	do {
-		offset += bh->b_size;
-		if (lastoff >= offset)
-			continue;
-
-		/*
-		 * Unwritten extents that have data in the page cache covering
-		 * them can be identified by the BH_Unwritten state flag.
-		 * Pages with multiple buffers might have a mix of holes, data
-		 * and unwritten extents - any buffer with valid data in it
-		 * should have BH_Uptodate flag set on it.
-		 */
-
-		if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
-			return lastoff;
-
-		lastoff = offset;
-	} while ((bh = bh->b_this_page) != head);
-	return -ENOENT;
-}
-
-/*
- * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
- *
- * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: unwritten and uptodate buffer heads count as data;
- * everything else counts as a hole.
- *
- * Returns the resulting offset on successs, and -ENOENT otherwise.
- */
-loff_t
-page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
-			  int whence)
-{
-	pgoff_t index = offset >> PAGE_SHIFT;
-	pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
-	loff_t lastoff = offset;
-	struct pagevec pvec;
-
-	if (length <= 0)
-		return -ENOENT;
-
-	pagevec_init(&pvec);
-
-	do {
-		unsigned nr_pages, i;
-
-		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
-						end - 1);
-		if (nr_pages == 0)
-			break;
-
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			/*
-			 * At this point, the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or
-			 * even swizzled back from swapper_space to tmpfs file
-			 * mapping.  However, page->index will not change
-			 * because we have a reference on the page.
-                         *
-			 * If current page offset is beyond where we've ended,
-			 * we've found a hole.
-                         */
-			if (whence == SEEK_HOLE &&
-			    lastoff < page_offset(page))
-				goto check_range;
-
-			lock_page(page);
-			if (likely(page->mapping == inode->i_mapping) &&
-			    page_has_buffers(page)) {
-				lastoff = page_seek_hole_data(page, lastoff, whence);
-				if (lastoff >= 0) {
-					unlock_page(page);
-					goto check_range;
-				}
-			}
-			unlock_page(page);
-			lastoff = page_offset(page) + PAGE_SIZE;
-		}
-		pagevec_release(&pvec);
-	} while (index < end);
-
-	/* When no page at lastoff and we are not done, we found a hole. */
-	if (whence != SEEK_HOLE)
-		goto not_found;
-
-check_range:
-	if (lastoff < offset + length)
-		goto out;
-not_found:
-	lastoff = -ENOENT;
-out:
-	pagevec_release(&pvec);
-	return lastoff;
-}
-
 void __init buffer_init(void)
 {
 	unsigned long nrpages;
diff --git a/fs/iomap.c b/fs/iomap.c
index b0bc928672af..fe3b9222dd46 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/file.h>
 #include <linux/uio.h>
 #include <linux/backing-dev.h>
@@ -592,6 +593,121 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
 }
 EXPORT_SYMBOL_GPL(iomap_fiemap);
 
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ *
+ * Returns the offset within the file on success, and -ENOENT otherwise.
+ */
+static loff_t
+page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
+{
+	loff_t offset = page_offset(page);
+	struct buffer_head *bh, *head;
+	bool seek_data = whence == SEEK_DATA;
+
+	if (lastoff < offset)
+		lastoff = offset;
+
+	bh = head = page_buffers(page);
+	do {
+		offset += bh->b_size;
+		if (lastoff >= offset)
+			continue;
+
+		/*
+		 * Unwritten extents that have data in the page cache covering
+		 * them can be identified by the BH_Unwritten state flag.
+		 * Pages with multiple buffers might have a mix of holes, data
+		 * and unwritten extents - any buffer with valid data in it
+		 * should have BH_Uptodate flag set on it.
+		 */
+
+		if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
+			return lastoff;
+
+		lastoff = offset;
+	} while ((bh = bh->b_this_page) != head);
+	return -ENOENT;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: unwritten and uptodate buffer heads count as data;
+ * everything else counts as a hole.
+ *
+ * Returns the resulting offset on successs, and -ENOENT otherwise.
+ */
+static loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+		int whence)
+{
+	pgoff_t index = offset >> PAGE_SHIFT;
+	pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+	loff_t lastoff = offset;
+	struct pagevec pvec;
+
+	if (length <= 0)
+		return -ENOENT;
+
+	pagevec_init(&pvec);
+
+	do {
+		unsigned nr_pages, i;
+
+		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+						end - 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or
+			 * even swizzled back from swapper_space to tmpfs file
+			 * mapping.  However, page->index will not change
+			 * because we have a reference on the page.
+                         *
+			 * If current page offset is beyond where we've ended,
+			 * we've found a hole.
+                         */
+			if (whence == SEEK_HOLE &&
+			    lastoff < page_offset(page))
+				goto check_range;
+
+			lock_page(page);
+			if (likely(page->mapping == inode->i_mapping) &&
+			    page_has_buffers(page)) {
+				lastoff = page_seek_hole_data(page, lastoff, whence);
+				if (lastoff >= 0) {
+					unlock_page(page);
+					goto check_range;
+				}
+			}
+			unlock_page(page);
+			lastoff = page_offset(page) + PAGE_SIZE;
+		}
+		pagevec_release(&pvec);
+	} while (index < end);
+
+	/* When no page at lastoff and we are not done, we found a hole. */
+	if (whence != SEEK_HOLE)
+		goto not_found;
+
+check_range:
+	if (lastoff < offset + length)
+		goto out;
+not_found:
+	lastoff = -ENOENT;
+out:
+	pagevec_release(&pvec);
+	return lastoff;
+}
+
+
 static loff_t
 iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
 		      void *data, struct iomap *iomap)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 894e5d125de6..96225a77c112 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -205,8 +205,6 @@ void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
 int bh_uptodate_or_lock(struct buffer_head *bh);
 int bh_submit_read(struct buffer_head *bh);
-loff_t page_cache_seek_hole_data(struct inode *inode, loff_t offset,
-				 loff_t length, int whence);
 
 extern int buffer_heads_over_limit;
 

From bd56b3e14410b8cf6a4c13c94f7065341b179517 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:05:14 -0700
Subject: [PATCH 120/121] fs: remove the buffer_unwritten check in
 page_seek_hole_data

We only call into this function through the iomap iterators, so we already
know the buffer is unwritten.  In addition to that we always require the
uptodate flag that is ORed with the result anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index fe3b9222dd46..2a20d50f0c1d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -615,14 +615,9 @@ page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
 			continue;
 
 		/*
-		 * Unwritten extents that have data in the page cache covering
-		 * them can be identified by the BH_Unwritten state flag.
-		 * Pages with multiple buffers might have a mix of holes, data
-		 * and unwritten extents - any buffer with valid data in it
-		 * should have BH_Uptodate flag set on it.
+		 * Any buffer with valid data in it should have BH_Uptodate set.
 		 */
-
-		if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
+		if (buffer_uptodate(bh) == seek_data)
 			return lastoff;
 
 		lastoff = offset;
@@ -634,8 +629,8 @@ page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
  * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
  *
  * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: unwritten and uptodate buffer heads count as data;
- * everything else counts as a hole.
+ * and which are data: uptodate buffer heads count as data; everything else
+ * counts as a hole.
  *
  * Returns the resulting offset on successs, and -ENOENT otherwise.
  */

From afd9d6a1df75807684fa40dab77c52e104e5c74b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 Jun 2018 09:05:15 -0700
Subject: [PATCH 121/121] fs: use ->is_partially_uptodate in
 page_cache_seek_hole_data

This way the implementation doesn't depend on buffer_head internals.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c | 83 ++++++++++++++++++++++++++----------------------------
 1 file changed, 40 insertions(+), 43 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c
index 2a20d50f0c1d..206539d369a8 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -595,34 +595,53 @@ EXPORT_SYMBOL_GPL(iomap_fiemap);
 
 /*
  * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
- *
- * Returns the offset within the file on success, and -ENOENT otherwise.
+ * Returns true if found and updates @lastoff to the offset in file.
  */
-static loff_t
-page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
+static bool
+page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
+		int whence)
 {
-	loff_t offset = page_offset(page);
-	struct buffer_head *bh, *head;
+	const struct address_space_operations *ops = inode->i_mapping->a_ops;
+	unsigned int bsize = i_blocksize(inode), off;
 	bool seek_data = whence == SEEK_DATA;
+	loff_t poff = page_offset(page);
 
-	if (lastoff < offset)
-		lastoff = offset;
-
-	bh = head = page_buffers(page);
-	do {
-		offset += bh->b_size;
-		if (lastoff >= offset)
-			continue;
+	if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
+		return false;
 
+	if (*lastoff < poff) {
 		/*
-		 * Any buffer with valid data in it should have BH_Uptodate set.
+		 * Last offset smaller than the start of the page means we found
+		 * a hole:
 		 */
-		if (buffer_uptodate(bh) == seek_data)
-			return lastoff;
+		if (whence == SEEK_HOLE)
+			return true;
+		*lastoff = poff;
+	}
 
-		lastoff = offset;
-	} while ((bh = bh->b_this_page) != head);
-	return -ENOENT;
+	/*
+	 * Just check the page unless we can and should check block ranges:
+	 */
+	if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
+		return PageUptodate(page) == seek_data;
+
+	lock_page(page);
+	if (unlikely(page->mapping != inode->i_mapping))
+		goto out_unlock_not_found;
+
+	for (off = 0; off < PAGE_SIZE; off += bsize) {
+		if ((*lastoff & ~PAGE_MASK) >= off + bsize)
+			continue;
+		if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
+			unlock_page(page);
+			return true;
+		}
+		*lastoff = poff + off + bsize;
+	}
+
+out_unlock_not_found:
+	unlock_page(page);
+	return false;
 }
 
 /*
@@ -659,30 +678,8 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
-			/*
-			 * At this point, the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or
-			 * even swizzled back from swapper_space to tmpfs file
-			 * mapping.  However, page->index will not change
-			 * because we have a reference on the page.
-                         *
-			 * If current page offset is beyond where we've ended,
-			 * we've found a hole.
-                         */
-			if (whence == SEEK_HOLE &&
-			    lastoff < page_offset(page))
+			if (page_seek_hole_data(inode, page, &lastoff, whence))
 				goto check_range;
-
-			lock_page(page);
-			if (likely(page->mapping == inode->i_mapping) &&
-			    page_has_buffers(page)) {
-				lastoff = page_seek_hole_data(page, lastoff, whence);
-				if (lastoff >= 0) {
-					unlock_page(page);
-					goto check_range;
-				}
-			}
-			unlock_page(page);
 			lastoff = page_offset(page) + PAGE_SIZE;
 		}
 		pagevec_release(&pvec);