diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4665ff6e5153..32fc5401a756 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -79,6 +79,47 @@ xfs_buf_vmap_len( return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; } +/* + * Bump the I/O in flight count on the buftarg if we haven't yet done so for + * this buffer. The count is incremented once per buffer (per hold cycle) + * because the corresponding decrement is deferred to buffer release. Buffers + * can undergo I/O multiple times in a hold-release cycle and per buffer I/O + * tracking adds unnecessary overhead. This is used for sychronization purposes + * with unmount (see xfs_wait_buftarg()), so all we really need is a count of + * in-flight buffers. + * + * Buffers that are never released (e.g., superblock, iclog buffers) must set + * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count + * never reaches zero and unmount hangs indefinitely. + */ +static inline void +xfs_buf_ioacct_inc( + struct xfs_buf *bp) +{ + if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags |= _XBF_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); +} + +/* + * Clear the in-flight state on a buffer about to be released to the LRU or + * freed and unaccount from the buftarg. + */ +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + if (!(bp->b_flags & _XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags &= ~_XBF_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); +} + /* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer @@ -102,6 +143,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; + /* + * Once the buffer is marked stale and unlocked, a subsequent lookup + * could reset b_flags. There is no guarantee that the buffer is + * unaccounted (released to LRU) before that occurs. Drop in-flight + * status now to preserve accounting consistency. + */ + xfs_buf_ioacct_dec(bp); + spin_lock(&bp->b_lock); atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && @@ -815,7 +864,8 @@ xfs_buf_get_uncached( struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - bp = _xfs_buf_alloc(target, &map, 1, 0); + /* flags might contain irrelevant bits, pass only what we care about */ + bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); if (unlikely(bp == NULL)) goto fail; @@ -866,63 +916,85 @@ xfs_buf_hold( } /* - * Releases a hold on the specified buffer. If the - * the hold count is 1, calls xfs_buf_free. + * Release a hold on the specified buffer. If the hold count is 1, the buffer is + * placed on LRU or freed (depending on b_lru_ref). */ void xfs_buf_rele( xfs_buf_t *bp) { struct xfs_perag *pag = bp->b_pag; + bool release; + bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); - if (atomic_dec_and_test(&bp->b_hold)) + if (atomic_dec_and_test(&bp->b_hold)) { + xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); + } return; } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(atomic_read(&bp->b_hold) > 0); - if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - spin_lock(&bp->b_lock); - if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { - /* - * If the buffer is added to the LRU take a new - * reference to the buffer for the LRU and clear the - * (now stale) dispose list state flag - */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { - bp->b_state &= ~XFS_BSTATE_DISPOSE; - atomic_inc(&bp->b_hold); - } - spin_unlock(&bp->b_lock); - spin_unlock(&pag->pag_buf_lock); - } else { - /* - * most of the time buffers will already be removed from - * the LRU, so optimise that case by checking for the - * XFS_BSTATE_DISPOSE flag indicating the last list the - * buffer was on was the disposal list - */ - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); - } else { - ASSERT(list_empty(&bp->b_lru)); - } - spin_unlock(&bp->b_lock); - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - xfs_buf_free(bp); - } + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + spin_lock(&bp->b_lock); + if (!release) { + /* + * Drop the in-flight state if the buffer is already on the LRU + * and it holds the only reference. This is racy because we + * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT + * ensures the decrement occurs only once per-buf. + */ + if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) + xfs_buf_ioacct_dec(bp); + goto out_unlock; } + + /* the last reference has been dropped ... */ + xfs_buf_ioacct_dec(bp); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new reference to the + * buffer for the LRU and clear the (now stale) dispose list + * state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&pag->pag_buf_lock); + } else { + /* + * most of the time buffers will already be removed from the + * LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the buffer + * was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + freebuf = true; + } + +out_unlock: + spin_unlock(&bp->b_lock); + + if (freebuf) + xfs_buf_free(bp); } @@ -1341,6 +1413,7 @@ xfs_buf_submit( * xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); + xfs_buf_ioacct_inc(bp); _xfs_buf_ioapply(bp); /* @@ -1526,13 +1599,19 @@ xfs_wait_buftarg( int loop = 0; /* - * We need to flush the buffer workqueue to ensure that all IO - * completion processing is 100% done. Just waiting on buffer locks is - * not sufficient for async IO as the reference count held over IO is - * not released until after the buffer lock is dropped. Hence we need to - * ensure here that all reference counts have been dropped before we - * start walking the LRU list. + * First wait on the buftarg I/O count for all in-flight buffers to be + * released. This is critical as new buffers do not make the LRU until + * they are released. + * + * Next, flush the buffer workqueue to ensure all completion processing + * has finished. Just waiting on buffer locks is not sufficient for + * async IO as the reference count held over IO is not released until + * after the buffer lock is dropped. Hence we need to ensure here that + * all reference counts have been dropped before we start walking the + * LRU list. */ + while (percpu_counter_sum(&btp->bt_io_count)) + delay(100); drain_workqueue(btp->bt_mount->m_buf_workqueue); /* loop until there is nothing left on the lru list. */ @@ -1629,6 +1708,8 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) @@ -1693,6 +1774,9 @@ xfs_alloc_buftarg( if (list_lru_init(&btp->bt_lru)) goto error; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto error; + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1834,7 +1918,7 @@ xfs_buf_delwri_submit_buffers( * side. We need to move the buffer onto the io_list * at this point so the caller can still access it. */ - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); bp->b_flags |= XBF_WRITE | XBF_ASYNC; if (wait_list) { xfs_buf_hold(bp); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 8bfb974f0772..1c2e52b2d926 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -43,6 +43,7 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ @@ -62,6 +63,7 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_IN_FLIGHT, "IN_FLIGHT" } /* @@ -115,6 +118,8 @@ typedef struct xfs_buftarg { /* LRU control structures */ struct shrinker bt_shrinker; struct list_lru bt_lru; + + struct percpu_counter bt_io_count; } xfs_buftarg_t; struct xfs_buf; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2e95ad036316..0337690c88bc 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1081,6 +1081,8 @@ xfs_buf_iodone_callback_error( trace_xfs_buf_item_iodone_async(bp, _RET_IP_); ASSERT(bp->b_iodone != NULL); + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + /* * If the write was asynchronous then no one will be looking for the * error. If this is the first failure of this type, clear the error @@ -1088,13 +1090,12 @@ xfs_buf_iodone_callback_error( * async write failure at least once, but we also need to set the buffer * up to behave correctly now for repeated failures. */ - if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) || + if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) || bp->b_last_error != bp->b_error) { - bp->b_flags |= (XBF_WRITE | XBF_ASYNC | - XBF_DONE | XBF_WRITE_FAIL); + bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); bp->b_last_error = bp->b_error; - bp->b_retries = 0; - bp->b_first_retry_time = jiffies; + if (cfg->retry_timeout && !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; xfs_buf_ioerror(bp, 0); xfs_buf_submit(bp); @@ -1105,7 +1106,6 @@ xfs_buf_iodone_callback_error( * Repeated failure on an async write. Take action according to the * error configuration we have been set up to use. */ - cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 63dad9ea9016..3b74fa011bb1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1415,7 +1415,7 @@ xlog_alloc_log( */ error = -ENOMEM; bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), XBF_NO_IOACCT); if (!bp) goto out_free_log; @@ -1454,7 +1454,8 @@ xlog_alloc_log( prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), + XBF_NO_IOACCT); if (!bp) goto out_free_iclog; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e39b02351b4a..970c19ba2f56 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -272,13 +272,15 @@ xfs_readsb( buf_ops = NULL; /* - * Allocate a (locked) buffer to hold the superblock. - * This will be kept around at all times to optimize - * access to the superblock. + * Allocate a (locked) buffer to hold the superblock. This will be kept + * around at all times to optimize access to the superblock. Therefore, + * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count + * elevated. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, &bp, buf_ops); + BTOBB(sector_size), XBF_NO_IOACCT, &bp, + buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 4c2c55086208..79cfd3fc5324 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -634,6 +634,9 @@ xfs_error_get_cfg( { struct xfs_error_cfg *cfg; + if (error < 0) + error = -error; + switch (error) { case EIO: cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];