From d17c701ce6a548a92f7f8a3cec20299465f36ee3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:42:52 +1000 Subject: [PATCH 01/10] xfs: unlock items before allowing the CIL to commit When we commit a transaction using delayed logging, we need to unlock the items in the transaciton before we unlock the CIL context and allow it to be checkpointed. If we unlock them after we release the CIl context lock, the CIL can checkpoint and complete before we free the log items. This breaks stale buffer item unlock and unpin processing as there is an implicit assumption that the unlock will occur before the unpin. Also, some log items need to store the LSN of the transaction commit in the item (inodes and EFIs) and so can race with other transaction completions if we don't prevent the CIL from checkpointing before the unlock occurs. Cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log_cil.c | 14 ++++++++++++++ fs/xfs/xfs_trans.c | 5 +---- fs/xfs/xfs_trans_priv.h | 3 ++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 31e4ea2d19ac..ef8e7d9f445d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -377,9 +377,23 @@ xfs_log_commit_cil( xfs_log_done(mp, tp->t_ticket, NULL, log_flags); xfs_trans_unreserve_and_mod_sb(tp); + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, *commit_lsn, 0); + /* check for background commit before unlock */ if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) push = 1; + up_read(&log->l_cilp->xc_ctx_lock); /* diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index fdca7416c754..1c47edaea0d2 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1167,7 +1167,7 @@ xfs_trans_del_item( * Unlock all of the items of a transaction and free all the descriptors * of that transaction. */ -STATIC void +void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, @@ -1653,9 +1653,6 @@ xfs_trans_commit_cil( return error; current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - - /* xfs_trans_free_items() unlocks them first */ - xfs_trans_free_items(tp, *commit_lsn, 0); xfs_trans_free(tp); return 0; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index e2d93d8ead7b..62da86c90de5 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -25,7 +25,8 @@ struct xfs_trans; void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); - +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); void xfs_trans_item_committed(struct xfs_log_item *lip, xfs_lsn_t commit_lsn, int aborted); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); From 5b3eed756cd37255cad1181bd86bfd0977e97953 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:42:41 +1000 Subject: [PATCH 02/10] xfs: ensure we mark all inodes in a freed cluster XFS_ISTALE Under heavy load parallel metadata loads (e.g. dbench), we can fail to mark all the inodes in a cluster being freed as XFS_ISTALE as we skip inodes we cannot get the XFS_ILOCK_EXCL or the flush lock on. When this happens and the inode cluster buffer has already been marked stale and freed, inode reclaim can try to write the inode out as it is dirty and not marked stale. This can result in writing th metadata to an freed extent, or in the case it has already been overwritten trigger a magic number check failure and return an EUCLEAN error such as: Filesystem "ram0": inode 0x442ba1 background reclaim flush failed with 117 Fix this by ensuring that we hoover up all in memory inodes in the cluster and mark them XFS_ISTALE when freeing the cluster. Cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 49 ++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 68415cb4f23c..34798f391c49 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1914,6 +1914,11 @@ xfs_iunlink_remove( return 0; } +/* + * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. + */ STATIC void xfs_ifree_cluster( xfs_inode_t *free_ip, @@ -1945,8 +1950,6 @@ xfs_ifree_cluster( } for (j = 0; j < nbufs; j++, inum += ninodes) { - int found = 0; - blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -1965,7 +1968,9 @@ xfs_ifree_cluster( /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. */ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); while (lip) { @@ -1977,11 +1982,11 @@ xfs_ifree_cluster( &iip->ili_flush_lsn, &iip->ili_item.li_lsn); xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - found++; } lip = lip->li_bio_list; } + /* * For each inode in memory attempt to add it to the inode * buffer and set it up for being staled on buffer IO @@ -1993,6 +1998,7 @@ xfs_ifree_cluster( * even trying to lock them. */ for (i = 0; i < ninodes; i++) { +retry: read_lock(&pag->pag_ici_lock); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); @@ -2003,38 +2009,36 @@ xfs_ifree_cluster( continue; } - /* don't try to lock/unlock the current inode */ + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. + */ if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { read_unlock(&pag->pag_ici_lock); - continue; + delay(1); + goto retry; } read_unlock(&pag->pag_ici_lock); - if (!xfs_iflock_nowait(ip)) { - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - + xfs_iflock(ip); xfs_iflags_set(ip, XFS_ISTALE); - if (xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ iip = ip->i_itemp; - if (!iip) { - /* inode with unlogged changes only */ + if (!iip || xfs_inode_clean(ip)) { ASSERT(ip != free_ip); ip->i_update_core = 0; xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } - found++; iip->ili_last_fields = iip->ili_format.ilf_fields; iip->ili_format.ilf_fields = 0; @@ -2049,8 +2053,7 @@ xfs_ifree_cluster( xfs_iunlock(ip, XFS_ILOCK_EXCL); } - if (found) - xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } From 4536f2ad8b330453d7ebec0746c4374eadd649b1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:42:30 +1000 Subject: [PATCH 03/10] xfs: fix untrusted inode number lookup Commit 7124fe0a5b619d65b739477b3b55a20bf805b06d ("xfs: validate untrusted inode numbers during lookup") changes the inode lookup code to do btree lookups for untrusted inode numbers. This change made an invalid assumption about the alignment of inodes and hence incorrectly calculated the first inode in the cluster. As a result, some inode numbers were being incorrectly considered invalid when they were actually valid. The issue was not picked up by the xfstests suite because it always runs fsr and dump (the two utilities that utilise the bulkstat interface) on cache hot inodes and hence the lookup code in the cold cache path was not sufficiently exercised to uncover this intermittent problem. Fix the issue by relaxing the btree lookup criteria and then checking if the record returned contains the inode number we are lookup for. If it we get an incorrect record, then the inode number is invalid. Cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_ialloc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index abf80ae1e95b..5371d2dc360e 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1213,7 +1213,6 @@ xfs_imap_lookup( struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - xfs_agino_t startino; int error; int i; @@ -1227,13 +1226,13 @@ xfs_imap_lookup( } /* - * derive and lookup the exact inode record for the given agino. If the - * record cannot be found, then it's an invalid inode number and we - * should abort. + * Lookup the inode record for the given agino. If the record cannot be + * found, then it's an invalid inode number and we should abort. Once + * we have a record, we need to ensure it contains the inode number + * we are looking up. */ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); - startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); - error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) error = xfs_inobt_get_rec(cur, &rec, &i); @@ -1246,6 +1245,11 @@ xfs_imap_lookup( if (error) return error; + /* check that the returned record contains the required inode */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + return EINVAL; + /* for untrusted inodes check it is allocated first */ if ((flags & XFS_IGET_UNTRUSTED) && (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) From 546a1924224078c6f582e68f890b05b387b42653 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:44:34 +1000 Subject: [PATCH 04/10] writeback: write_cache_pages doesn't terminate at nr_to_write <= 0 I noticed XFS writeback in 2.6.36-rc1 was much slower than it should have been. Enabling writeback tracing showed: flush-253:16-8516 [007] 1342952.351608: wbc_writepage: bdi 253:16: towrt=1024 skip=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0 flush-253:16-8516 [007] 1342952.351654: wbc_writepage: bdi 253:16: towrt=1023 skip=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0 flush-253:16-8516 [000] 1342952.369520: wbc_writepage: bdi 253:16: towrt=0 skip=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0 flush-253:16-8516 [000] 1342952.369542: wbc_writepage: bdi 253:16: towrt=-1 skip=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0 flush-253:16-8516 [000] 1342952.369549: wbc_writepage: bdi 253:16: towrt=-2 skip=0 mode=0 kupd=0 bgrd=1 reclm=0 cyclic=1 more=0 older=0x0 start=0x0 end=0x0 Writeback is not terminating in background writeback if ->writepage is returning with wbc->nr_to_write == 0, resulting in sub-optimal single page writeback on XFS. Fix the write_cache_pages loop to terminate correctly when this situation occurs and so prevent this sub-optimal background writeback pattern. This improves sustained sequential buffered write performance from around 250MB/s to 750MB/s for a 100GB file on an XFS filesystem on my 8p test VM. Cc: Signed-off-by: Dave Chinner Reviewed-by: Wu Fengguang Reviewed-by: Christoph Hellwig --- mm/page-writeback.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c09ef5219cbe..a803f5e33471 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -985,22 +985,16 @@ continue_unlock: } } - if (wbc->nr_to_write > 0) { - if (--wbc->nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) { - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. - */ - done = 1; - break; - } + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; } } pagevec_release(&pvec); From efceab1d563153a2b1a6e7d35376241a48126989 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:44:56 +1000 Subject: [PATCH 05/10] xfs: handle negative wbc->nr_to_write during sync writeback During data integrity (WB_SYNC_ALL) writeback, wbc->nr_to_write will go negative on inodes with more than 1024 dirty pages due to implementation details of write_cache_pages(). Currently XFS will abort page clustering in writeback once nr_to_write drops below zero, and so for data integrity writeback we will do very inefficient page at a time allocation and IO submission for inodes with large numbers of dirty pages. Fix this by only aborting the page clustering code when wbc->nr_to_write is negative and the sync mode is WB_SYNC_NONE. Cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_aops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 15412fe15c3a..528be1ba1402 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -852,8 +852,8 @@ xfs_convert_page( SetPageUptodate(page); if (count) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) done = 1; } xfs_start_page_writeback(page, !page_dirty, count); From 2fe33661fcd79d4c53022509f7223d526b5fa233 Mon Sep 17 00:00:00 2001 From: Stuart Brodsky Date: Tue, 24 Aug 2010 11:46:05 +1000 Subject: [PATCH 06/10] xfs: ensure f_ffree returned by statfs() is non-negative Because of delayed updates to sb_icount field in the super block, it is possible to allocate over maxicount number of inodes. This causes the arithmetic to calculate a negative number of free inodes in user commands like df or stat -f. Since maxicount is a somewhat arbitrary number, a slight over allocation is not critical but user commands should be displayed as 0 or greater and never go negative. To do this the value in the stats buffer f_ffree is capped to never go negative. [ Modified to use max_t as per Christoph's comment. ] Signed-off-by: Stu Brodsky Signed-off-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_super.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 15c35b62ff14..c6b24e7c308a 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1226,6 +1226,7 @@ xfs_fs_statfs( struct xfs_inode *ip = XFS_I(dentry->d_inode); __uint64_t fakeinos, id; xfs_extlen_t lsize; + __int64_t ffree; statp->f_type = XFS_SB_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -1249,7 +1250,11 @@ xfs_fs_statfs( statp->f_files = min_t(typeof(statp->f_files), statp->f_files, mp->m_maxicount); - statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + + /* make sure statp->f_ffree does not underflow */ + ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + statp->f_ffree = max_t(__int64_t, ffree, 0); + spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || From 1a387d3be2b30c90f20d49a3497a8fc0693a9d18 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:46:31 +1000 Subject: [PATCH 07/10] xfs: dummy transactions should not dirty VFS state When we need to cover the log, we issue dummy transactions to ensure the current log tail is on disk. Unfortunately we currently use the root inode in the dummy transaction, and the act of committing the transaction dirties the inode at the VFS level. As a result, the VFS writeback of the dirty inode will prevent the filesystem from idling long enough for the log covering state machine to complete. The state machine gets stuck in a loop issuing new dummy transactions to cover the log and never makes progress. To avoid this problem, the dummy transactions should not cause externally visible state changes. To ensure this occurs, make sure that dummy transactions log an unchanging field in the superblock as it's state is never propagated outside the filesystem. This allows the log covering state machine to complete successfully and the filesystem now correctly enters a fully idle state about 90s after the last modification was made. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_super.c | 2 +- fs/xfs/linux-2.6/xfs_sync.c | 42 ++++++------------------------------ fs/xfs/xfs_fsops.c | 31 +++++++++++++++----------- fs/xfs/xfs_fsops.h | 2 +- 4 files changed, 26 insertions(+), 51 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index c6b24e7c308a..a4e07974955b 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1407,7 +1407,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp); + return -xfs_fs_log_dummy(mp, SYNC_WAIT); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index dfcbd98d1599..d59c4a65d492 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -34,6 +34,7 @@ #include "xfs_inode_item.h" #include "xfs_quota.h" #include "xfs_trace.h" +#include "xfs_fsops.h" #include #include @@ -340,38 +341,6 @@ xfs_sync_attr( XFS_ICI_NO_TAG, 0, NULL); } -STATIC int -xfs_commit_dummy_trans( - struct xfs_mount *mp, - uint flags) -{ - struct xfs_inode *ip = mp->m_rootip; - struct xfs_trans *tp; - int error; - - /* - * Put a dummy transaction in the log to tell recovery - * that all others are OK. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - /* the log force ensures this transaction is pushed to disk */ - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); - return error; -} - STATIC int xfs_sync_fsdata( struct xfs_mount *mp) @@ -432,7 +401,7 @@ xfs_quiesce_data( /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) - error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); + error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -563,7 +532,7 @@ xfs_flush_inodes( /* * Every sync period we need to unpin all items, reclaim inodes and sync * disk quotas. We might need to cover the log to indicate that the - * filesystem is idle. + * filesystem is idle and not frozen. */ STATIC void xfs_sync_worker( @@ -577,8 +546,9 @@ xfs_sync_worker( xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, 0); + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp, 0); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index dbca5f5c37ba..43b1d5699335 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -604,31 +604,36 @@ out: return 0; } +/* + * Dump a transaction into the log that contains no real change. This is needed + * to be able to make the log dirty or stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty state back up + * into the VFS and then periodic inode flushing will prevent log covering from + * making progress. Hence we log a field in the superblock instead. + */ int xfs_fs_log_dummy( - xfs_mount_t *mp) + xfs_mount_t *mp, + int flags) { xfs_trans_t *tp; - xfs_inode_t *ip; int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; } - ip = mp->m_rootip; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; + /* log the UUID because it is an unchanging field */ + xfs_mod_sb(tp, XFS_SB_UUID); + if (flags & SYNC_WAIT) + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); } int diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 88435e0a77c9..a786c5212c1e 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp); +extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); #endif /* __XFS_FSOPS_H__ */ From a44f13edf0ebb4e41942d0f16ca80489dcf6659d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:40:03 +1000 Subject: [PATCH 08/10] xfs: Reduce log force overhead for delayed logging Delayed logging adds some serialisation to the log force process to ensure that it does not deference a bad commit context structure when determining if a CIL push is necessary or not. It does this by grabing the CIL context lock exclusively, then dropping it before pushing the CIL if necessary. This causes serialisation of all log forces and pushes regardless of whether a force is necessary or not. As a result fsync heavy workloads (like dbench) can be significantly slower with delayed logging than without. To avoid this penalty, copy the current sequence from the context to the CIL structure when they are swapped. This allows us to do unlocked checks on the current sequence without having to worry about dereferencing context structures that may have already been freed. Hence we can remove the CIL context locking in the forcing code and only call into the push code if the current context matches the sequence we need to force. By passing the sequence into the push code, we can check the sequence again once we have the CIL lock held exclusive and abort if the sequence has already been pushed. This avoids a lock round-trip and unnecessary CIL pushes when we have racing push calls. The result is that the regression in dbench performance goes away - this change improves dbench performance on a ramdisk from ~2100MB/s to ~2500MB/s. This compares favourably to not using delayed logging which retuns ~2500MB/s for the same workload. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log.c | 7 +- fs/xfs/xfs_log_cil.c | 243 +++++++++++++++++++++++------------------- fs/xfs/xfs_log_priv.h | 13 ++- 3 files changed, 146 insertions(+), 117 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 925d572bf0f4..33f718f92a48 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3015,7 +3015,8 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); - xlog_cil_push(log, 1); + if (log->l_cilp) + xlog_cil_force(log); spin_lock(&log->l_icloglock); @@ -3167,7 +3168,7 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); if (log->l_cilp) { - lsn = xlog_cil_push_lsn(log, lsn); + lsn = xlog_cil_force_lsn(log, lsn); if (lsn == NULLCOMMITLSN) return 0; } @@ -3724,7 +3725,7 @@ xfs_log_force_umount( * call below. */ if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) - xlog_cil_push(log, 1); + xlog_cil_force(log); /* * We must hold both the GRANT lock and the LOG lock, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ef8e7d9f445d..9768f2437bb3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -68,6 +68,7 @@ xlog_cil_init( ctx->sequence = 1; ctx->cil = cil; cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; cil->xc_log = log; log->l_cilp = cil; @@ -320,94 +321,6 @@ xlog_cil_free_logvec( } } -/* - * Commit a transaction with the given vector to the Committed Item List. - * - * To do this, we need to format the item, pin it in memory if required and - * account for the space used by the transaction. Once we have done that we - * need to release the unused reservation for the transaction, attach the - * transaction to the checkpoint context so we carry the busy extents through - * to checkpoint completion, and then unlock all the items in the transaction. - * - * For more specific information about the order of operations in - * xfs_log_commit_cil() please refer to the comments in - * xfs_trans_commit_iclog(). - * - * Called with the context lock already held in read mode to lock out - * background commit, returns without it held once background commits are - * allowed again. - */ -int -xfs_log_commit_cil( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_log_vec *log_vector, - xfs_lsn_t *commit_lsn, - int flags) -{ - struct log *log = mp->m_log; - int log_flags = 0; - int push = 0; - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; - - if (XLOG_FORCED_SHUTDOWN(log)) { - xlog_cil_free_logvec(log_vector); - return XFS_ERROR(EIO); - } - - /* lock out background commit */ - down_read(&log->l_cilp->xc_ctx_lock); - xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); - - /* check we didn't blow the reservation */ - if (tp->t_ticket->t_curr_res < 0) - xlog_print_tic_res(log->l_mp, tp->t_ticket); - - /* attach the transaction to the CIL if it has any busy extents */ - if (!list_empty(&tp->t_busy)) { - spin_lock(&log->l_cilp->xc_cil_lock); - list_splice_init(&tp->t_busy, - &log->l_cilp->xc_ctx->busy_extents); - spin_unlock(&log->l_cilp->xc_cil_lock); - } - - tp->t_commit_lsn = *commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); - xfs_trans_unreserve_and_mod_sb(tp); - - /* - * Once all the items of the transaction have been copied to the CIL, - * the items can be unlocked and freed. - * - * This needs to be done before we drop the CIL context lock because we - * have to update state in the log items and unlock them before they go - * to disk. If we don't, then the CIL checkpoint can race with us and - * we can run checkpoint completion before we've updated and unlocked - * the log items. This affects (at least) processing of stale buffers, - * inodes and EFIs. - */ - xfs_trans_free_items(tp, *commit_lsn, 0); - - /* check for background commit before unlock */ - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) - push = 1; - - up_read(&log->l_cilp->xc_ctx_lock); - - /* - * We need to push CIL every so often so we don't cache more than we - * can fit in the log. The limit really is that a checkpoint can't be - * more than half the log (the current checkpoint is not allowed to - * overwrite the previous checkpoint), but commit latency and memory - * usage limit this to a smaller size in most cases. - */ - if (push) - xlog_cil_push(log, 0); - return 0; -} - /* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as @@ -441,13 +354,23 @@ xlog_cil_committed( } /* - * Push the Committed Item List to the log. If the push_now flag is not set, - * then it is a background flush and so we can chose to ignore it. + * Push the Committed Item List to the log. If @push_seq flag is zero, then it + * is a background flush and so we can chose to ignore it. Otherwise, if the + * current sequence is the same as @push_seq we need to do a flush. If + * @push_seq is less than the current sequence, then it has already been + * flushed and we don't need to do anything - the caller will wait for it to + * complete if necessary. + * + * @push_seq is a value rather than a flag because that allows us to do an + * unlocked check of the sequence number for a match. Hence we can allows log + * forces to run racily and not issue pushes for the same sequence twice. If we + * get a race between multiple pushes for the same sequence they will block on + * the first one and then abort, hence avoiding needless pushes. */ -int +STATIC int xlog_cil_push( struct log *log, - int push_now) + xfs_lsn_t push_seq) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_vec *lv; @@ -467,12 +390,14 @@ xlog_cil_push( if (!cil) return 0; + ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); /* lock out transaction commit, but don't block on background push */ if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_now) + if (!push_seq) goto out_free_ticket; down_write(&cil->xc_ctx_lock); } @@ -483,7 +408,11 @@ xlog_cil_push( goto out_skip; /* check for spurious background flush */ - if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* check for a previously pushed seqeunce */ + if (push_seq < cil->xc_ctx->sequence) goto out_skip; /* @@ -528,6 +457,13 @@ xlog_cil_push( new_ctx->cil = cil; cil->xc_ctx = new_ctx; + /* + * mirror the new sequence into the cil structure so that we can do + * unlocked checks against the current sequence in log forces without + * risking deferencing a freed context pointer. + */ + cil->xc_current_sequence = new_ctx->sequence; + /* * The switch is now done, so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, @@ -639,6 +575,94 @@ out_abort: return XFS_ERROR(EIO); } +/* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +int +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + if (XLOG_FORCED_SHUTDOWN(log)) { + xlog_cil_free_logvec(log_vector); + return XFS_ERROR(EIO); + } + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, *commit_lsn, 0); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); + return 0; +} + /* * Conditionally push the CIL based on the sequence passed in. * @@ -653,39 +677,34 @@ out_abort: * commit lsn is there. It'll be empty, so this is broken for now. */ xfs_lsn_t -xlog_cil_push_lsn( +xlog_cil_force_lsn( struct log *log, - xfs_lsn_t push_seq) + xfs_lsn_t sequence) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx; xfs_lsn_t commit_lsn = NULLCOMMITLSN; -restart: - down_write(&cil->xc_ctx_lock); - ASSERT(push_seq <= cil->xc_ctx->sequence); + ASSERT(sequence <= cil->xc_current_sequence); - /* check to see if we need to force out the current context */ - if (push_seq == cil->xc_ctx->sequence) { - up_write(&cil->xc_ctx_lock); - xlog_cil_push(log, 1); - goto restart; - } + /* + * check to see if we need to force out the current context. + * xlog_cil_push() handles racing pushes for the same sequence, + * so no need to deal with it here. + */ + if (sequence == cil->xc_current_sequence) + xlog_cil_push(log, sequence); /* * See if we can find a previous sequence still committing. - * We can drop the flush lock as soon as we have the cil lock - * because we are now only comparing contexts protected by - * the cil lock. - * * We need to wait for all previous sequence commits to complete * before allowing the force of push_seq to go ahead. Hence block * on commits for those as well. */ +restart: spin_lock(&cil->xc_cil_lock); - up_write(&cil->xc_ctx_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { - if (ctx->sequence > push_seq) + if (ctx->sequence > sequence) continue; if (!ctx->commit_lsn) { /* @@ -695,7 +714,7 @@ restart: sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); goto restart; } - if (ctx->sequence != push_seq) + if (ctx->sequence != sequence) continue; /* found it! */ commit_lsn = ctx->commit_lsn; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8c072618965c..ced52b98b322 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -422,6 +422,7 @@ struct xfs_cil { struct rw_semaphore xc_ctx_lock; struct list_head xc_committing; sv_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; }; /* @@ -562,8 +563,16 @@ int xlog_cil_init(struct log *log); void xlog_cil_init_post_recovery(struct log *log); void xlog_cil_destroy(struct log *log); -int xlog_cil_push(struct log *log, int push_now); -xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); +/* + * CIL force routines + */ +xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct log *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} /* * Unmount record type is used as a pseudo transaction type for the ticket. From 3b93c7aaefc05ee2a75e2726929b01a321402984 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:45:53 +1000 Subject: [PATCH 09/10] xfs: don't do memory allocation under the CIL context lock Formatting items requires memory allocation when using delayed logging. Currently that memory allocation is done while holding the CIL context lock in read mode. This means that if memory allocation takes some time (e.g. enters reclaim), we cannot push on the CIL until the allocation(s) required by formatting complete. This can stall CIL pushes for some time, and once a push is stalled so are all new transaction commits. Fix this splitting the item formatting into two steps. The first step which does the allocation and memcpy() into the allocated buffer is now done outside the CIL context lock, and only the CIL insert is done inside the CIL context lock. This avoids the stall issue. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log_cil.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9768f2437bb3..ed575fb4b495 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -270,15 +270,10 @@ xlog_cil_insert( static void xlog_cil_format_items( struct log *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn) + struct xfs_log_vec *log_vector) { struct xfs_log_vec *lv; - if (start_lsn) - *start_lsn = log->l_cilp->xc_ctx->sequence; - ASSERT(log_vector); for (lv = log_vector; lv; lv = lv->lv_next) { void *ptr; @@ -302,11 +297,26 @@ xlog_cil_format_items( ptr += vec->i_len; } ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); - - xlog_cil_insert(log, ticket, lv->lv_item, lv); } } +static void +xlog_cil_insert_items( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn) +{ + struct xfs_log_vec *lv; + + if (start_lsn) + *start_lsn = log->l_cilp->xc_ctx->sequence; + + ASSERT(log_vector); + for (lv = log_vector; lv; lv = lv->lv_next) + xlog_cil_insert(log, ticket, lv->lv_item, lv); +} + static void xlog_cil_free_logvec( struct xfs_log_vec *log_vector) @@ -612,9 +622,17 @@ xfs_log_commit_cil( return XFS_ERROR(EIO); } + /* + * do all the hard work of formatting items (including memory + * allocation) outside the CIL context lock. This prevents stalling CIL + * pushes when we are low on memory and a transaction commit spends a + * lot of time in memory reclaim. + */ + xlog_cil_format_items(log, log_vector); + /* lock out background commit */ down_read(&log->l_cilp->xc_ctx_lock); - xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); + xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); /* check we didn't blow the reservation */ if (tp->t_ticket->t_curr_res < 0) From b5420f235953448eeae615b3361584dc5e414f34 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2010 11:47:51 +1000 Subject: [PATCH 10/10] xfs: do not discard page cache data on EAGAIN If xfs_map_blocks returns EAGAIN because of lock contention we must redirty the page and not disard the pagecache content and return an error from writepage. We used to do this correctly, but the logic got lost during the recent reshuffle of the writepage code. Signed-off-by: Christoph Hellwig Reported-by: Mike Gao Tested-by: Mike Gao Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_aops.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 528be1ba1402..b552f816de15 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1068,7 +1068,7 @@ xfs_vm_writepage( * by themselves. */ if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) - goto out_fail; + goto redirty; /* * We need a transaction if there are delalloc or unwritten buffers @@ -1080,7 +1080,7 @@ xfs_vm_writepage( */ xfs_count_page_state(page, &delalloc, &unwritten); if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) - goto out_fail; + goto redirty; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -1245,12 +1245,15 @@ error: if (iohead) xfs_cancel_ioend(iohead); + if (err == -EAGAIN) + goto redirty; + xfs_aops_discard_page(page); ClearPageUptodate(page); unlock_page(page); return err; -out_fail: +redirty: redirty_page_for_writepage(wbc, page); unlock_page(page); return 0;