From f876e44603ad091c840a5fae5b0753bbb421c037 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Feb 2014 16:40:42 +1100 Subject: [PATCH 1/3] xfs: always do log forces via the workqueue Log forces can occur deep in the call chain when we have relatively little stack free. Log forces can also happen at close to the call chain leaves (e.g. xfs_buf_lock()) and hence we can trigger IO from places where we really don't want to add more stack overhead. This stack overhead occurs because log forces do foreground CIL pushes (xlog_cil_push_foreground()) rather than waking the background push wq and waiting for the for the push to complete. This foreground push was done to avoid confusing the CFQ Io scheduler when fsync()s were issued, as it has trouble dealing with dependent IOs being issued from different process contexts. Avoiding blowing the stack is much more critical than performance optimisations for CFQ, especially as we've been recommending against the use of CFQ for XFS since 3.2 kernels were release because of it's problems with multi-threaded IO workloads. Hence convert xlog_cil_push_foreground() to move the push work to the CIL workqueue. We already do the waiting for the push to complete in xlog_cil_force_lsn(), so there's nothing else we need to modify to make this work. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_cil.c | 52 +++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index cdebd832c3db..1270ded7610d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -487,13 +487,6 @@ xlog_cil_push( new_ctx->cil = cil; cil->xc_ctx = new_ctx; - /* - * mirror the new sequence into the cil structure so that we can do - * unlocked checks against the current sequence in log forces without - * risking deferencing a freed context pointer. - */ - cil->xc_current_sequence = new_ctx->sequence; - /* * The switch is now done, so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, @@ -512,8 +505,15 @@ xlog_cil_push( * Hence we need to add this context to the committing context list so * that higher sequences will wait for us to write out a commit record * before they do. + * + * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * structure atomically with the addition of this sequence to the + * committing list. This also ensures that we can do unlocked checks + * against the current sequence in log forces without risking + * deferencing a freed context pointer. */ spin_lock(&cil->xc_push_lock); + cil->xc_current_sequence = new_ctx->sequence; list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); @@ -651,8 +651,14 @@ xlog_cil_push_background( } +/* + * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence + * number that is passed. When it returns, the work will be queued for + * @push_seq, but it won't be completed. The caller is expected to do any + * waiting for push_seq to complete if it is required. + */ static void -xlog_cil_push_foreground( +xlog_cil_push_now( struct xlog *log, xfs_lsn_t push_seq) { @@ -677,10 +683,8 @@ xlog_cil_push_foreground( } cil->xc_push_seq = push_seq; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); spin_unlock(&cil->xc_push_lock); - - /* do the push now */ - xlog_cil_push(log); } bool @@ -785,7 +789,8 @@ xlog_cil_force_lsn( * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ - xlog_cil_push_foreground(log, sequence); +restart: + xlog_cil_push_now(log, sequence); /* * See if we can find a previous sequence still committing. @@ -793,7 +798,6 @@ xlog_cil_force_lsn( * before allowing the force of push_seq to go ahead. Hence block * on commits for those as well. */ -restart: spin_lock(&cil->xc_push_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { if (ctx->sequence > sequence) @@ -811,6 +815,28 @@ restart: /* found it! */ commit_lsn = ctx->commit_lsn; } + + /* + * The call to xlog_cil_push_now() executes the push in the background. + * Hence by the time we have got here it our sequence may not have been + * pushed yet. This is true if the current sequence still matches the + * push sequence after the above wait loop and the CIL still contains + * dirty objects. + * + * When the push occurs, it will empty the CIL and + * atomically increment the currect sequence past the push sequence and + * move it into the committing list. Of course, if the CIL is clean at + * the time of the push, it won't have pushed the CIL at all, so in that + * case we should try the push for this sequence again from the start + * just in case. + */ + + if (sequence == cil->xc_current_sequence && + !list_empty(&cil->xc_cil)) { + spin_unlock(&cil->xc_push_lock); + goto restart; + } + spin_unlock(&cil->xc_push_lock); return commit_lsn; } From a1358aa3d39251c16c6d8b20945e40bdfc2aad68 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Feb 2014 16:51:26 +1100 Subject: [PATCH 2/3] xfs: allocate xfs_da_args to reduce stack footprint The struct xfs_da_args used to pass directory/attribute operation information to the lower layers is 128 bytes in size and is allocated on the stack. Dynamically allocate them to reduce the stack footprint of directory operations. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_dir2.c | 336 ++++++++++++++++++++++++++++------------------ 1 file changed, 209 insertions(+), 127 deletions(-) diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index ce16ef02997a..fda46253966a 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -180,16 +180,23 @@ xfs_dir_init( xfs_inode_t *dp, xfs_inode_t *pdp) { - xfs_da_args_t args; + struct xfs_da_args *args; int error; - memset((char *)&args, 0, sizeof(args)); - args.dp = dp; - args.trans = tp; ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) + error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); + if (error) return error; - return xfs_dir2_sf_create(&args, pdp->i_ino); + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->dp = dp; + args->trans = tp; + error = xfs_dir2_sf_create(args, pdp->i_ino); + kmem_free(args); + return error; } /* @@ -205,41 +212,56 @@ xfs_dir_createname( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; XFS_STATS_INC(xs_dir_create); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); + +out_free: + kmem_free(args); return rval; } @@ -282,46 +304,66 @@ xfs_dir_lookup( xfs_ino_t *inum, /* out: inode number */ struct xfs_name *ci_name) /* out: actual name if CI match */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.dp = dp; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_OKNOENT; + /* + * We need to use KM_NOFS here so that lockdep will not throw false + * positive deadlock warnings on a non-transactional lookup path. It is + * safe to recurse into inode recalim in that case, but lockdep can't + * easily be taught about it. Hence KM_NOFS avoids having to add more + * lockdep Doing this avoids having to add a bunch of lockdep class + * annotations into the reclaim path for the ilock. + */ + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_OKNOENT; if (ci_name) - args.op_flags |= XFS_DA_OP_CILOOKUP; + args->op_flags |= XFS_DA_OP_CILOOKUP; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_lookup(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_lookup(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_lookup(&args); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_lookup(args); else - rval = xfs_dir2_node_lookup(&args); + rval = xfs_dir2_node_lookup(args); + +out_check_rval: if (rval == EEXIST) rval = 0; if (!rval) { - *inum = args.inumber; + *inum = args->inumber; if (ci_name) { - ci_name->name = args.value; - ci_name->len = args.valuelen; + ci_name->name = args->value; + ci_name->len = args->valuelen; } } +out_free: + kmem_free(args); return rval; } @@ -338,38 +380,51 @@ xfs_dir_removename( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_remove); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = ino; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_removename(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_removename(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_removename(&args); + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = ino; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_removename(args); + goto out_free; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_removename(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_removename(args); else - rval = xfs_dir2_node_removename(&args); + rval = xfs_dir2_node_removename(args); +out_free: + kmem_free(args); return rval; } @@ -386,40 +441,54 @@ xfs_dir_replace( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_replace(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_replace(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_replace(&args); + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_replace(args); + goto out_free; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_replace(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_replace(args); else - rval = xfs_dir2_node_replace(&args); + rval = xfs_dir2_node_replace(args); +out_free: + kmem_free(args); return rval; } @@ -434,7 +503,7 @@ xfs_dir_canenter( struct xfs_name *name, /* name of entry to add */ uint resblks) { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ @@ -443,29 +512,42 @@ xfs_dir_canenter( ASSERT(S_ISDIR(dp->i_d.di_mode)); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.dp = dp; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); +out_free: + kmem_free(args); return rval; } From 93a8614e3a4dccd526aca34e892ac0b27f64b506 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 27 Feb 2014 16:51:39 +1100 Subject: [PATCH 3/3] xfs: fix directory inode iolock lockdep false positive The change to add the IO lock to protect the directory extent map during readdir operations has cause lockdep to have a heart attack as it now sees a different locking order on inodes w.r.t. the mmap_sem because readdir has a different ordering to write(). Add a new lockdep class for directory inodes to avoid this false positive. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f35d5c953ff9..234e84387ec5 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -48,6 +48,18 @@ #include #include +/* + * Directories have different lock order w.r.t. mmap_sem compared to regular + * files. This is due to readdir potentially triggering page faults on a user + * buffer inside filldir(), and this happens with the ilock on the directory + * held. For regular files, the lock order is the other way around - the + * mmap_sem is taken during the page fault, and then we lock the ilock to do + * block mapping. Hence we need a different class for the directory ilock so + * that lockdep can tell them apart. + */ +static struct lock_class_key xfs_nondir_ilock_class; +static struct lock_class_key xfs_dir_ilock_class; + static int xfs_initxattrs( struct inode *inode, @@ -1191,6 +1203,7 @@ xfs_setup_inode( xfs_diflags_to_iflags(inode, ip); ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &xfs_inode_operations; @@ -1198,6 +1211,7 @@ xfs_setup_inode( inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) inode->i_op = &xfs_dir_ci_inode_operations; else