Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
  xfs: only run xfs_error_test if error injection is active
  xfs: avoid moving stale inodes in the AIL
  xfs: delayed alloc blocks beyond EOF are valid after writeback
  xfs: push stale, pinned buffers on trylock failures
  xfs: fix failed write truncation handling.
This commit is contained in:
Linus Torvalds 2010-12-02 09:13:36 -08:00
commit 8cb280c90f
8 changed files with 188 additions and 83 deletions

View file

@ -934,7 +934,6 @@ xfs_aops_discard_page(
struct xfs_inode *ip = XFS_I(inode);
struct buffer_head *bh, *head;
loff_t offset = page_offset(page);
ssize_t len = 1 << inode->i_blkbits;
if (!xfs_is_delayed_page(page, IO_DELAY))
goto out_invalidate;
@ -949,58 +948,14 @@ xfs_aops_discard_page(
xfs_ilock(ip, XFS_ILOCK_EXCL);
bh = head = page_buffers(page);
do {
int done;
xfs_fileoff_t offset_fsb;
xfs_bmbt_irec_t imap;
int nimaps = 1;
int error;
xfs_fsblock_t firstblock;
xfs_bmap_free_t flist;
xfs_fileoff_t start_fsb;
if (!buffer_delay(bh))
goto next_buffer;
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
/*
* Map the range first and check that it is a delalloc extent
* before trying to unmap the range. Otherwise we will be
* trying to remove a real extent (which requires a
* transaction) or a hole, which is probably a bad idea...
*/
error = xfs_bmapi(NULL, ip, offset_fsb, 1,
XFS_BMAPI_ENTIRE, NULL, 0, &imap,
&nimaps, NULL);
if (error) {
/* something screwed, just bail */
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
"page discard failed delalloc mapping lookup.");
}
break;
}
if (!nimaps) {
/* nothing there */
goto next_buffer;
}
if (imap.br_startblock != DELAYSTARTBLOCK) {
/* been converted, ignore */
goto next_buffer;
}
WARN_ON(imap.br_blockcount == 0);
/*
* Note: while we initialise the firstblock/flist pair, they
* should never be used because blocks should never be
* allocated or freed for a delalloc extent and hence we need
* don't cancel or finish them after the xfs_bunmapi() call.
*/
xfs_bmap_init(&flist, &firstblock);
error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
&flist, &done);
ASSERT(!flist.xbf_count && !flist.xbf_first);
start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
if (error) {
/* something screwed, just bail */
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@ -1010,7 +965,7 @@ xfs_aops_discard_page(
break;
}
next_buffer:
offset += len;
offset += 1 << inode->i_blkbits;
} while ((bh = bh->b_this_page) != head);
@ -1505,11 +1460,42 @@ xfs_vm_write_failed(
struct inode *inode = mapping->host;
if (to > inode->i_size) {
struct iattr ia = {
.ia_valid = ATTR_SIZE | ATTR_FORCE,
.ia_size = inode->i_size,
};
xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK);
/*
* punch out the delalloc blocks we have already allocated. We
* don't call xfs_setattr() to do this as we may be in the
* middle of a multi-iovec write and so the vfs inode->i_size
* will not match the xfs ip->i_size and so it will zero too
* much. Hence we jus truncate the page cache to zero what is
* necessary and punch the delalloc blocks directly.
*/
struct xfs_inode *ip = XFS_I(inode);
xfs_fileoff_t start_fsb;
xfs_fileoff_t end_fsb;
int error;
truncate_pagecache(inode, to, inode->i_size);
/*
* Check if there are any blocks that are outside of i_size
* that need to be trimmed back.
*/
start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
if (end_fsb <= start_fsb)
return;
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
end_fsb - start_fsb);
if (error) {
/* something screwed, just bail */
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
"xfs_vm_write_failed: unable to clean up ino %lld",
ip->i_ino);
}
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
}

View file

@ -488,29 +488,16 @@ found:
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
/* Attempt to get the semaphore without sleeping,
* if this does not work then we need to drop the
* spinlock and do a hard attempt on the semaphore.
*/
if (down_trylock(&bp->b_sema)) {
if (xfs_buf_cond_lock(bp)) {
/* failed, so wait for the lock if requested. */
if (!(flags & XBF_TRYLOCK)) {
/* wait for buffer ownership */
xfs_buf_lock(bp);
XFS_STATS_INC(xb_get_locked_waited);
} else {
/* We asked for a trylock and failed, no need
* to look at file offset and length here, we
* know that this buffer at least overlaps our
* buffer and is locked, therefore our buffer
* either does not exist, or is this buffer.
*/
xfs_buf_rele(bp);
XFS_STATS_INC(xb_busy_locked);
return NULL;
}
} else {
/* trylock worked */
XB_SET_OWNER(bp);
}
if (bp->b_flags & XBF_STALE) {
@ -876,10 +863,18 @@ xfs_buf_rele(
*/
/*
* Locks a buffer object, if it is not already locked.
* Note that this in no way locks the underlying pages, so it is only
* useful for synchronizing concurrent use of buffer objects, not for
* synchronizing independent access to the underlying pages.
* Locks a buffer object, if it is not already locked. Note that this in
* no way locks the underlying pages, so it is only useful for
* synchronizing concurrent use of buffer objects, not for synchronizing
* independent access to the underlying pages.
*
* If we come across a stale, pinned, locked buffer, we know that we are
* being asked to lock a buffer that has been reallocated. Because it is
* pinned, we know that the log has not been pushed to disk and hence it
* will still be locked. Rather than continuing to have trylock attempts
* fail until someone else pushes the log, push it ourselves before
* returning. This means that the xfsaild will not get stuck trying
* to push on stale inode buffers.
*/
int
xfs_buf_cond_lock(
@ -890,6 +885,8 @@ xfs_buf_cond_lock(
locked = down_trylock(&bp->b_sema) == 0;
if (locked)
XB_SET_OWNER(bp);
else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
xfs_log_force(bp->b_target->bt_mount, 0);
trace_xfs_buf_cond_lock(bp, _RET_IP_);
return locked ? 0 : -EBUSY;

View file

@ -5471,8 +5471,13 @@ xfs_getbmap(
if (error)
goto out_unlock_iolock;
}
ASSERT(ip->i_delayed_blks == 0);
/*
* even after flushing the inode, there can still be delalloc
* blocks on the inode beyond EOF due to speculative
* preallocation. These are not removed until the release
* function is called or the inode is inactivated. Hence we
* cannot assert here that ip->i_delayed_blks == 0.
*/
}
lock = xfs_ilock_map_shared(ip);
@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves(
*count += xfs_bmbt_disk_get_blockcount(frp);
}
}
/*
* dead simple method of punching delalyed allocation blocks from a range in
* the inode. Walks a block at a time so will be slow, but is only executed in
* rare error cases so the overhead is not critical. This will alays punch out
* both the start and end blocks, even if the ranges only partially overlap
* them, so it is up to the caller to ensure that partial blocks are not
* passed in.
*/
int
xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
xfs_fileoff_t start_fsb,
xfs_fileoff_t length)
{
xfs_fileoff_t remaining = length;
int error = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
do {
int done;
xfs_bmbt_irec_t imap;
int nimaps = 1;
xfs_fsblock_t firstblock;
xfs_bmap_free_t flist;
/*
* Map the range first and check that it is a delalloc extent
* before trying to unmap the range. Otherwise we will be
* trying to remove a real extent (which requires a
* transaction) or a hole, which is probably a bad idea...
*/
error = xfs_bmapi(NULL, ip, start_fsb, 1,
XFS_BMAPI_ENTIRE, NULL, 0, &imap,
&nimaps, NULL);
if (error) {
/* something screwed, just bail */
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
"Failed delalloc mapping lookup ino %lld fsb %lld.",
ip->i_ino, start_fsb);
}
break;
}
if (!nimaps) {
/* nothing there */
goto next_block;
}
if (imap.br_startblock != DELAYSTARTBLOCK) {
/* been converted, ignore */
goto next_block;
}
WARN_ON(imap.br_blockcount == 0);
/*
* Note: while we initialise the firstblock/flist pair, they
* should never be used because blocks should never be
* allocated or freed for a delalloc extent and hence we need
* don't cancel or finish them after the xfs_bunmapi() call.
*/
xfs_bmap_init(&flist, &firstblock);
error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
&flist, &done);
if (error)
break;
ASSERT(!flist.xbf_count && !flist.xbf_first);
next_block:
start_fsb++;
remaining--;
} while(remaining > 0);
return error;
}

View file

@ -394,6 +394,11 @@ xfs_bmap_count_blocks(
int whichfork,
int *count);
int
xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
xfs_fileoff_t start_fsb,
xfs_fileoff_t length);
#endif /* __KERNEL__ */
#endif /* __XFS_BMAP_H__ */

View file

@ -377,6 +377,19 @@ xfs_swap_extents(
ip->i_d.di_format = tip->i_d.di_format;
tip->i_d.di_format = tmp;
/*
* The extents in the source inode could still contain speculative
* preallocation beyond EOF (e.g. the file is open but not modified
* while defrag is in progress). In that case, we need to copy over the
* number of delalloc blocks the data fork in the source inode is
* tracking beyond EOF so that when the fork is truncated away when the
* temporary inode is unlinked we don't underrun the i_delayed_blks
* counter on that inode.
*/
ASSERT(tip->i_delayed_blks == 0);
tip->i_delayed_blks = ip->i_delayed_blks;
ip->i_delayed_blks = 0;
ilf_fields = XFS_ILOG_CORE;
switch(ip->i_d.di_format) {

View file

@ -58,6 +58,7 @@ xfs_error_trap(int e)
int xfs_etest[XFS_NUM_INJECT_ERROR];
int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
int xfs_error_test_active;
int
xfs_error_test(int error_tag, int *fsidp, char *expression,
@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
len = strlen(mp->m_fsname);
xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
strcpy(xfs_etest_fsname[i], mp->m_fsname);
xfs_error_test_active++;
return 0;
}
}
@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
xfs_etest_fsid[i] = 0LL;
kmem_free(xfs_etest_fsname[i]);
xfs_etest_fsname[i] = NULL;
xfs_error_test_active--;
}
}

View file

@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level,
#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
#ifdef DEBUG
extern int xfs_error_test_active;
extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
#define XFS_NUM_INJECT_ERROR 10
#define XFS_TEST_ERROR(expr, mp, tag, rf) \
((expr) || \
((expr) || (xfs_error_test_active && \
xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
(rf)))
(rf))))
extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);

View file

@ -657,18 +657,37 @@ xfs_inode_item_unlock(
}
/*
* This is called to find out where the oldest active copy of the
* inode log item in the on disk log resides now that the last log
* write of it completed at the given lsn. Since we always re-log
* all dirty data in an inode, the latest copy in the on disk log
* is the only one that matters. Therefore, simply return the
* given lsn.
* This is called to find out where the oldest active copy of the inode log
* item in the on disk log resides now that the last log write of it completed
* at the given lsn. Since we always re-log all dirty data in an inode, the
* latest copy in the on disk log is the only one that matters. Therefore,
* simply return the given lsn.
*
* If the inode has been marked stale because the cluster is being freed, we
* don't want to (re-)insert this inode into the AIL. There is a race condition
* where the cluster buffer may be unpinned before the inode is inserted into
* the AIL during transaction committed processing. If the buffer is unpinned
* before the inode item has been committed and inserted, then it is possible
* for the buffer to be written and IO completions before the inode is inserted
* into the AIL. In that case, we'd be inserting a clean, stale inode into the
* AIL which will never get removed. It will, however, get reclaimed which
* triggers an assert in xfs_inode_free() complaining about freein an inode
* still in the AIL.
*
* To avoid this, return a lower LSN than the one passed in so that the
* transaction committed code will not move the inode forward in the AIL but
* will still unpin it properly.
*/
STATIC xfs_lsn_t
xfs_inode_item_committed(
struct xfs_log_item *lip,
xfs_lsn_t lsn)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
if (xfs_iflags_test(ip, XFS_ISTALE))
return lsn - 1;
return lsn;
}