From 1bee12b8c44d825fb45cd6a13e76c185ed6888b8 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 16 Jan 2013 17:33:53 -0600 Subject: [PATCH 1/7] xfs: Do not return EFSCORRUPTED when filesystem probe finds no XFS magic 9802182 changed the return value from EWRONGFS (aka EINVAL) to EFSCORRUPTED which doesn't seem to be handled properly by the root filesystem probe. Signed-off-by: Eric Sandeen Tested-by: Sergei Trofimovich Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da508463ff10..7d6df7c00c36 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify( return; } /* quietly fail */ - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, EWRONGFS); } static void From d26978dd866dbb3b3a9690f3655a5e735055de89 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 17 Jan 2013 13:11:29 -0500 Subject: [PATCH 2/7] xfs: pull up stack_switch check into xfs_bmapi_write The stack_switch check currently occurs in __xfs_bmapi_allocate, which means the stack switch only occurs when xfs_bmapi_allocate() is called in a loop. Pull the check up before the loop in xfs_bmapi_write() such that the first iteration of the loop has consistent behavior. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0e92d12765d2..cdb2d3348583 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4680,9 +4680,6 @@ __xfs_bmapi_allocate( return error; } - if (bma->flags & XFS_BMAPI_STACK_SWITCH) - bma->stack_switch = 1; - error = xfs_bmap_alloc(bma); if (error) return error; @@ -4956,6 +4953,9 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; + if (flags & XFS_BMAPI_STACK_SWITCH) + bma.stack_switch = 1; + while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); From eb178619f930fa2ba2348de332a1ff1c66a31424 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 21 Jan 2013 23:53:52 +1100 Subject: [PATCH 3/7] xfs: fix _xfs_buf_find oops on blocks beyond the filesystem end When _xfs_buf_find is passed an out of range address, it will fail to find a relevant struct xfs_perag and oops with a null dereference. This can happen when trying to walk a filesystem with a metadata inode that has a partially corrupted extent map (i.e. the block number returned is corrupt, but is otherwise intact) and we try to read from the corrupted block address. In this case, just fail the lookup. If it is readahead being issued, it will simply not be done, but if it is real read that fails we will get an error being reported. Ideally this case should result in an EFSCORRUPTED error being reported, but we cannot return an error through xfs_buf_read() or xfs_buf_get() so this lookup failure may result in ENOMEM or EIO errors being reported instead. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 56d1614760cf..689d72655ea6 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -487,6 +487,7 @@ _xfs_buf_find( struct rb_node *parent; xfs_buf_t *bp; xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; int numblks = 0; int i; @@ -498,6 +499,23 @@ _xfs_buf_find( ASSERT(!(numbytes < (1 << btp->bt_sshift))); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno >= eofs) { + /* + * XXX (dgc): we should really be returning EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + return NULL; + } + /* get tree root */ pag = xfs_perag_get(btp->bt_mount, xfs_daddr_to_agno(btp->bt_mount, blkno)); From f2a459565b02b60408f3f2e5ca992a031319712b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 21 Jan 2013 23:53:54 +1100 Subject: [PATCH 4/7] xfs: limit speculative prealloc near ENOSPC thresholds There is a window on small filesytsems where specualtive preallocation can be larger than that ENOSPC throttling thresholds, resulting in specualtive preallocation trying to reserve more space than there is space available. This causes immediate ENOSPC to be triggered, prealloc to be turned off and flushing to occur. One the next write (i.e. next 4k page), we do exactly the same thing, and so effective drive into synchronous 4k writes by triggering ENOSPC flushing on every page while in the window between the prealloc size and the ENOSPC prealloc throttle threshold. Fix this by checking to see if the prealloc size would consume all free space, and throttle it appropriately to avoid premature ENOSPC... Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers --- fs/xfs/xfs_iomap.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index add06b4e9a63..364818eef40e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -351,6 +351,15 @@ xfs_iomap_prealloc_size( } if (shift) alloc_blocks >>= shift; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks >= freesp) + alloc_blocks >>= 4; } if (alloc_blocks < mp->m_writeio_blocks) From 9f87832a82923943aaab38b8d53658af134bbfa4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 21 Jan 2013 23:53:55 +1100 Subject: [PATCH 5/7] xfs: fix shutdown hang on invalid inode during create When the new inode verify in xfs_iread() fails, the create transaction is aborted and a shutdown occurs. The subsequent unmount then hangs in xfs_wait_buftarg() on a buffer that has an elevated hold count. Debug showed that it was an AGI buffer getting stuck: [ 22.576147] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck [ 22.976213] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck [ 23.376206] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck [ 23.776325] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck The trace of this buffer leading up to the shutdown (trimmed for brevity) looks like: xfs_buf_init: bno 0x2 nblks 0x1 hold 1 caller xfs_buf_get_map xfs_buf_get: bno 0x2 len 0x200 hold 1 caller xfs_buf_read_map xfs_buf_read: bno 0x2 len 0x200 hold 1 caller xfs_trans_read_buf_map xfs_buf_iorequest: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_read xfs_buf_hold: bno 0x2 nblks 0x1 hold 1 caller xfs_buf_iorequest xfs_buf_rele: bno 0x2 nblks 0x1 hold 2 caller xfs_buf_iorequest xfs_buf_iowait: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_read xfs_buf_ioerror: bno 0x2 len 0x200 hold 1 caller xfs_buf_bio_end_io xfs_buf_iodone: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_ioend xfs_buf_iowait_done: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_read xfs_buf_hold: bno 0x2 nblks 0x1 hold 1 caller xfs_buf_item_init xfs_trans_read_buf: bno 0x2 len 0x200 hold 2 recur 0 refcount 1 xfs_trans_brelse: bno 0x2 len 0x200 hold 2 recur 0 refcount 1 xfs_buf_item_relse: bno 0x2 nblks 0x1 hold 2 caller xfs_trans_brelse xfs_buf_rele: bno 0x2 nblks 0x1 hold 2 caller xfs_buf_item_relse xfs_buf_unlock: bno 0x2 nblks 0x1 hold 1 caller xfs_trans_brelse xfs_buf_rele: bno 0x2 nblks 0x1 hold 1 caller xfs_trans_brelse xfs_buf_trylock: bno 0x2 nblks 0x1 hold 2 caller _xfs_buf_find xfs_buf_find: bno 0x2 len 0x200 hold 2 caller xfs_buf_get_map xfs_buf_get: bno 0x2 len 0x200 hold 2 caller xfs_buf_read_map xfs_buf_read: bno 0x2 len 0x200 hold 2 caller xfs_trans_read_buf_map xfs_buf_hold: bno 0x2 nblks 0x1 hold 2 caller xfs_buf_item_init xfs_trans_read_buf: bno 0x2 len 0x200 hold 3 recur 0 refcount 1 xfs_trans_log_buf: bno 0x2 len 0x200 hold 3 recur 0 refcount 1 xfs_buf_item_unlock: bno 0x2 len 0x200 hold 3 flags DIRTY liflags ABORTED xfs_buf_unlock: bno 0x2 nblks 0x1 hold 3 caller xfs_buf_item_unlock xfs_buf_rele: bno 0x2 nblks 0x1 hold 3 caller xfs_buf_item_unlock And that is the AGI buffer from cold cache read into memory to transaction abort. You can see at transaction abort the bli is dirty and only has a single reference. The item is not pinned, and it's not in the AIL. Hence the only reference to it is this transaction. The problem is that the xfs_buf_item_unlock() call is dropping the last reference to the xfs_buf_log_item attached to the buffer (which holds a reference to the buffer), but it is not freeing the xfs_buf_log_item. Hence nothing will ever release the buffer, and the unmount hangs waiting for this reference to go away. The fix is simple - xfs_buf_item_unlock needs to detect the last reference going away in this case and free the xfs_buf_log_item to release the reference it holds on the buffer. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 2 ++ fs/xfs/xfs_buf_item.c | 12 ++++++++++-- fs/xfs/xfs_trace.h | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 689d72655ea6..fbbb9eb92e32 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1505,6 +1505,8 @@ restart: while (!list_empty(&btp->bt_lru)) { bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); if (atomic_read(&bp->b_hold) > 1) { + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + list_move_tail(&bp->b_lru, &btp->bt_lru); spin_unlock(&btp->bt_lru_lock); delay(100); goto restart; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 77b09750e92c..3f9949fee391 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -652,7 +652,10 @@ xfs_buf_item_unlock( /* * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. */ clean = 1; for (i = 0; i < bip->bli_format_count; i++) { @@ -664,7 +667,12 @@ xfs_buf_item_unlock( } if (clean) xfs_buf_item_relse(bp); - else + else if (aborted) { + if (atomic_dec_and_test(&bip->bli_refcount)) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_buf_item_relse(bp); + } + } else atomic_dec(&bip->bli_refcount); if (!hold) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2e137d4a85ae..16a812977eab 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); From 4b05d09c18d9aa62d2e7fb4b057f54e5a38963f5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Jan 2013 13:56:18 +0100 Subject: [PATCH 6/7] xfs: Fix possible use-after-free with AIO Running AIO is pinning inode in memory using file reference. Once AIO is completed using aio_complete(), file reference is put and inode can be freed from memory. So we have to be sure that calling aio_complete() is the last thing we do with the inode. CC: xfs@oss.sgi.com CC: Ben Myers CC: stable@vger.kernel.org Signed-off-by: Jan Kara Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4111a40ebe1a..5f707e537171 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -86,11 +86,11 @@ xfs_destroy_ioend( } if (ioend->io_iocb) { + inode_dio_done(ioend->io_inode); if (ioend->io_isasync) { aio_complete(ioend->io_iocb, ioend->io_error ? ioend->io_error : ioend->io_result, 0); } - inode_dio_done(ioend->io_inode); } mempool_free(ioend, xfs_ioend_pool); From 65e3aa77f1b0269720660a6879f6f28d158f54c8 Mon Sep 17 00:00:00 2001 From: Torsten Kaiser Date: Sun, 20 Jan 2013 10:24:49 +0100 Subject: [PATCH 7/7] xfs: Fix xfs_swap_extents() after removal of xfs_flushinval_pages() Commit fb59581404ab7ec5075299065c22cb211a9262a9 removed xfs_flushinval_pages() and changed its callers to use filemap_write_and_wait() and truncate_pagecache_range() directly. But in xfs_swap_extents() this change accidental switched the argument for 'tip' to 'ip'. This patch switches it back to 'tip' Signed-off-by: Torsten Kaiser Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_dfrag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index d0e9c74d3d96..a8bd26b82ecb 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -246,10 +246,10 @@ xfs_swap_extents( goto out_unlock; } - error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); if (error) goto out_unlock; - truncate_pagecache_range(VFS_I(ip), 0, -1); + truncate_pagecache_range(VFS_I(tip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) {