From 4be536debe3f7b0c62283e77fd6bd8bdb9f83c6f Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 7 Sep 2006 14:26:50 +1000 Subject: [PATCH 1/4] [XFS] Prevent free space oversubscription and xfssyncd looping. The fix for recent ENOSPC deadlocks introduced certain limitations on allocations. The fix could cause xfssyncd to loop endlessly if we did not leave some space free for the allocator to work correctly. Basically, we needed to ensure that we had at least 4 blocks free for an AG free list and a block for the inode bmap btree at all times. However, this did not take into account the fact that each AG has a free list that needs 4 blocks. Hence any filesystem with more than one AG could cause oversubscription of free space and make xfssyncd spin forever trying to allocate space needed for AG freelists that was not available in the AG. The following patch reserves space for the free lists in all AGs plus the inode bmap btree which prevents oversubscription. It also prevents those blocks from being reported as free space (as they can never be used) and makes the SMP in-core superblock accounting code and the reserved block ioctl respect this requirement. SGI-PV: 955674 SGI-Modid: xfs-linux-melb:xfs-kern:26894a Signed-off-by: David Chinner Signed-off-by: David Chatterton --- fs/xfs/xfs_alloc.h | 20 ++++++++++++++++++++ fs/xfs/xfs_fsops.c | 16 ++++++++++------ fs/xfs/xfs_mount.c | 32 ++++++++------------------------ fs/xfs/xfs_vfsops.c | 3 ++- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 650591f999ae..5a4256120ccc 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -43,6 +43,26 @@ typedef enum xfs_alloctype #define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ +/* + * In order to avoid ENOSPC-related deadlock caused by + * out-of-order locking of AGF buffer (PV 947395), we place + * constraints on the relationship among actual allocations for + * data blocks, freelist blocks, and potential file data bmap + * btree blocks. However, these restrictions may result in no + * actual space allocated for a delayed extent, for example, a data + * block in a certain AG is allocated but there is no additional + * block for the additional bmap btree block due to a split of the + * bmap btree of the file. The result of this may lead to an + * infinite loop in xfssyncd when the file gets flushed to disk and + * all delayed extents need to be actually allocated. To get around + * this, we explicitly set aside a few blocks which will not be + * reserved in delayed allocation. Considering the minimum number of + * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap + * btree requires 1 fsb, so we set the number of set-aside blocks + * to 4 + 4*agcount. + */ +#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) + /* * Argument structure for xfs_alloc routines. * This is turned into a structure to avoid having 20 arguments passed diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 077629bab532..c064e72ada9e 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -462,7 +462,7 @@ xfs_fs_counts( xfs_icsb_sync_counters_lazy(mp); s = XFS_SB_LOCK(mp); - cnt->freedata = mp->m_sb.sb_fdblocks; + cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; cnt->freeino = mp->m_sb.sb_ifree; cnt->allocino = mp->m_sb.sb_icount; @@ -519,15 +519,19 @@ xfs_reserve_blocks( } mp->m_resblks = request; } else { + __int64_t free; + + free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); delta = request - mp->m_resblks; - lcounter = mp->m_sb.sb_fdblocks - delta; + lcounter = free - delta; if (lcounter < 0) { /* We can't satisfy the request, just get what we can */ - mp->m_resblks += mp->m_sb.sb_fdblocks; - mp->m_resblks_avail += mp->m_sb.sb_fdblocks; - mp->m_sb.sb_fdblocks = 0; + mp->m_resblks += free; + mp->m_resblks_avail += free; + mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp); } else { - mp->m_sb.sb_fdblocks = lcounter; + mp->m_sb.sb_fdblocks = + lcounter + XFS_ALLOC_SET_ASIDE(mp); mp->m_resblks = request; mp->m_resblks_avail += delta; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4be5c0b2d296..9dfae18d995f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1243,24 +1243,6 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) xfs_trans_log_buf(tp, bp, first, last); } -/* - * In order to avoid ENOSPC-related deadlock caused by - * out-of-order locking of AGF buffer (PV 947395), we place - * constraints on the relationship among actual allocations for - * data blocks, freelist blocks, and potential file data bmap - * btree blocks. However, these restrictions may result in no - * actual space allocated for a delayed extent, for example, a data - * block in a certain AG is allocated but there is no additional - * block for the additional bmap btree block due to a split of the - * bmap btree of the file. The result of this may lead to an - * infinite loop in xfssyncd when the file gets flushed to disk and - * all delayed extents need to be actually allocated. To get around - * this, we explicitly set aside a few blocks which will not be - * reserved in delayed allocation. Considering the minimum number of - * needed freelist blocks is 4 fsbs, a potential split of file's bmap - * btree requires 1 fsb, so we set the number of set-aside blocks to 8. -*/ -#define SET_ASIDE_BLOCKS 8 /* * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply @@ -1306,7 +1288,8 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, return 0; case XFS_SBS_FDBLOCKS: - lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS; + lcounter = (long long) + mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); if (delta > 0) { /* Putting blocks back */ @@ -1340,7 +1323,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, } } - mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS; + mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); return 0; case XFS_SBS_FREXTENTS: lcounter = (long long)mp->m_sb.sb_frextents; @@ -2021,7 +2004,8 @@ xfs_icsb_sync_counters_lazy( * when we get near ENOSPC. */ #define XFS_ICSB_INO_CNTR_REENABLE 64 -#define XFS_ICSB_FDBLK_CNTR_REENABLE 512 +#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ + (512 + XFS_ALLOC_SET_ASIDE(mp)) STATIC void xfs_icsb_balance_counter( xfs_mount_t *mp, @@ -2055,7 +2039,7 @@ xfs_icsb_balance_counter( case XFS_SBS_FDBLOCKS: count = mp->m_sb.sb_fdblocks; resid = do_div(count, weight); - if (count < XFS_ICSB_FDBLK_CNTR_REENABLE) + if (count < XFS_ICSB_FDBLK_CNTR_REENABLE(mp)) goto out; break; default: @@ -2110,11 +2094,11 @@ again: case XFS_SBS_FDBLOCKS: BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0); - lcounter = icsbp->icsb_fdblocks; + lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); lcounter += delta; if (unlikely(lcounter < 0)) goto slow_path; - icsbp->icsb_fdblocks = lcounter; + icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); break; default: BUG(); diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index b427d220a169..a34796e57afb 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -811,7 +811,8 @@ xfs_statvfs( statp->f_bsize = sbp->sb_blocksize; lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; statp->f_blocks = sbp->sb_dblocks - lsize; - statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks; + statp->f_bfree = statp->f_bavail = + sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); fakeinos = statp->f_bfree << sbp->sb_inopblog; #if XFS_BIG_INUMS fakeinos += mp->m_inoadd; From 721259bce2851893155c6cb88a3f8ecb106b348c Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Thu, 7 Sep 2006 14:27:05 +1000 Subject: [PATCH 2/4] [XFS] Fix ABBA deadlock between i_mutex and iolock. Avoid calling __blockdev_direct_IO for the DIO_OWN_LOCKING case for direct I/O reads since it drops and reacquires the i_mutex while holding the iolock and this violates the locking order. SGI-PV: 955696 SGI-Modid: xfs-linux-melb:xfs-kern:26898a Signed-off-by: Lachlan McIlroy Signed-off-by: David Chatterton --- fs/xfs/linux-2.6/xfs_aops.c | 18 +++++++++++++----- fs/xfs/linux-2.6/xfs_lrw.c | 11 ++++++----- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c40f81ba9b13..34dcb43a7837 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1390,11 +1390,19 @@ xfs_vm_direct_IO( iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); - ret = blockdev_direct_IO_own_locking(rw, iocb, inode, - iomap.iomap_target->bt_bdev, - iov, offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); + if (rw == WRITE) { + ret = blockdev_direct_IO_own_locking(rw, iocb, inode, + iomap.iomap_target->bt_bdev, + iov, offset, nr_segs, + xfs_get_blocks_direct, + xfs_end_io_direct); + } else { + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, + iomap.iomap_target->bt_bdev, + iov, offset, nr_segs, + xfs_get_blocks_direct, + xfs_end_io_direct); + } if (unlikely(ret <= 0 && iocb->private)) xfs_destroy_ioend(iocb->private); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 5d9cfd91ad08..110c038910ff 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -264,7 +264,9 @@ xfs_read( dmflags, &locktype); if (ret) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); - goto unlock_mutex; + if (unlikely(ioflags & IO_ISDIRECT)) + mutex_unlock(&inode->i_mutex); + return ret; } } @@ -272,6 +274,9 @@ xfs_read( bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)), -1, FI_REMAPF_LOCKED); + if (unlikely(ioflags & IO_ISDIRECT)) + mutex_unlock(&inode->i_mutex); + xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, (void *)iovp, segs, *offset, ioflags); ret = __generic_file_aio_read(iocb, iovp, segs, offset); @@ -281,10 +286,6 @@ xfs_read( XFS_STATS_ADD(xs_read_bytes, ret); xfs_iunlock(ip, XFS_IOLOCK_SHARED); - -unlock_mutex: - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_unlock(&inode->i_mutex); return ret; } From 0a8d17d090a4939643a52194b7d4a4001b9b2d93 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 7 Sep 2006 14:27:15 +1000 Subject: [PATCH 3/4] [XFS] Fix xfs_splice_write() so appended data gets to disk. xfs_splice_write() failed to update the on disk inode size when extending the so when the file was closed the range extended by splice was truncated off. Hence any region of a file written to by splice would end up as a hole full of zeros. SGI-PV: 955939 SGI-Modid: xfs-linux-melb:xfs-kern:26920a Signed-off-by: David Chinner Signed-off-by: David Chatterton --- fs/xfs/linux-2.6/xfs_lrw.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 110c038910ff..ee788b1cb364 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -391,6 +391,8 @@ xfs_splice_write( xfs_inode_t *ip = XFS_BHVTOI(bdp); xfs_mount_t *mp = ip->i_mount; ssize_t ret; + struct inode *inode = outfilp->f_mapping->host; + xfs_fsize_t isize; XFS_STATS_INC(xs_write_calls); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -417,6 +419,20 @@ xfs_splice_write( if (ret > 0) XFS_STATS_ADD(xs_write_bytes, ret); + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_d.di_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_d.di_size) { + ip->i_d.di_size = *ppos; + i_size_write(inode, *ppos); + ip->i_update_core = 1; + ip->i_update_size = 1; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } From 0edc7d0f3709e8c3bb7e69c4df614218a753361e Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Thu, 7 Sep 2006 14:27:23 +1000 Subject: [PATCH 4/4] [XFS] Fix a bad pointer dereference in the quota statvfs handling. SGI-PV: 955993 SGI-Modid: xfs-linux-melb:xfs-kern:26934a Signed-off-by: Nathan Scott Signed-off-by: David Chatterton --- fs/xfs/quota/xfs_qm_bhv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index f137856c3261..db8872be8c87 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -203,7 +203,7 @@ xfs_qm_statvfs( if (error || !vnode) return error; - mp = XFS_BHVTOM(bhv); + mp = xfs_vfstom(bhvtovfs(bhv)); ip = xfs_vtoi(vnode); if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))