From 61eaadcd52924b8015ee57b9abd3844c5f9e03a8 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 3 Jul 2017 11:37:02 -0500 Subject: [PATCH 01/29] GFS2: Prevent double brelse in gfs2_meta_indirect_buffer Before this patch, problems reading in indirect buffers would send an IO error back to the caller, and release the buffer_head with brelse() in function gfs2_meta_indirect_buffer, however, it would still return the address of the buffer_head it released. After the error was discovered, function gfs2_block_map would call function release_metapath to free all buffers. That checked: if (mp->mp_bh[i] == NULL) but since the value was set after the error, it was non-zero, so brelse was called a second time. This resulted in the following error: kernel: WARNING: at fs/buffer.c:1224 __brelse+0x3a/0x40() (Tainted: G W -- ------------ ) kernel: Hardware name: RHEV Hypervisor kernel: VFS: brelse: Trying to free free buffer This patch changes gfs2_meta_indirect_buffer so it only sets the buffer_head pointer in cases where it isn't released. Signed-off-by: Bob Peterson Acked-by: Steven Whitehouse --- fs/gfs2/meta_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fabe1614f879..4da7745c890a 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -419,8 +419,9 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; + } else { + *bhp = bh; } - *bhp = bh; return ret; } From 283c9a97be1d7ab2cce2630b5f6cc793f3b387a1 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 17 Jul 2017 13:39:15 -0500 Subject: [PATCH 02/29] gfs2: Lock holder cleanup (fixup) Function gfs2_holder_initialized should be used in do_flock as well. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c2062a108d19..bb48074be019 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1030,8 +1030,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); - gl = fl_gh->gh_gl; - if (gl) { + if (gfs2_holder_initialized(fl_gh)) { if (fl_gh->gh_state == state) goto out; locks_lock_file_wait(file, From 914cea93dd89f00b41c1d8ff93f17be47356a36a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 19 Jul 2017 10:56:42 -0500 Subject: [PATCH 03/29] gfs2: Don't clear SGID when inheriting ACLs When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by moving posix_acl_update_mode() out of __gfs2_set_acl() into gfs2_set_acl(). That way the function will not be called when inheriting ACLs which is what we want as it prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef Signed-off-by: Jan Kara Signed-off-by: Bob Peterson --- fs/gfs2/acl.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 2524807ee070..a6d962323790 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -86,19 +86,6 @@ int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) char *data; const char *name = gfs2_acl_name(type); - if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) - return -E2BIG; - - if (type == ACL_TYPE_ACCESS) { - umode_t mode = inode->i_mode; - - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - if (mode != inode->i_mode) - mark_inode_dirty(inode); - } - if (acl) { len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); if (len == 0) @@ -130,6 +117,9 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) bool need_unlock = false; int ret; + if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; + ret = gfs2_rsqa_alloc(ip); if (ret) return ret; @@ -140,7 +130,18 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) return ret; need_unlock = true; } + if (type == ACL_TYPE_ACCESS && acl) { + umode_t mode = inode->i_mode; + + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) + goto unlock; + if (mode != inode->i_mode) + mark_inode_dirty(inode); + } + ret = __gfs2_set_acl(inode, acl, type); +unlock: if (need_unlock) gfs2_glock_dq_uninit(&gh); return ret; From 98e5a91a6136af01198cfe9a3499f090718fd6ff Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 19 Jul 2017 11:10:19 -0500 Subject: [PATCH 04/29] gfs2: Fixup to "Get rid of flush_delayed_work in gfs2_evict_inode" When commit 4fd1a57952 moved the call to flush_delayed_work from gfs2_evict_inode to gfs2_inode_lookup to avoid calling into DLM during evict, a similar call should have been added to gfs2_create_inode: that's another code path in which glocks of previous inodes may be reused. The flush of the iopen glock work queue added by 4fd1a57952, on the other hand, is unnecessary and can be removed. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index acca501f8110..f9302f16a28e 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -174,7 +174,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_put; - flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -706,8 +705,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) goto fail_free_inode; - + flush_delayed_work(&ip->i_gl->gl_work); glock_set_object(ip->i_gl, ip); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode; From e7cb550d79fba5876616ed33ccafafc4e2bd7e2e Mon Sep 17 00:00:00 2001 From: Wang Xibo Date: Fri, 21 Jul 2017 07:40:59 -0500 Subject: [PATCH 05/29] GFS2: fix code parameter error in inode_go_lock In inode_go_lock() function, the parameter order of list_add() is error. According to the define of list_add(), the first parameter is new entry and the second is the list head, so ip->i_trunc_list should be the first parameter and the sdp->sd_trunc_list should be second. Signed-off-by: Wang Xibo Signed-off-by: Xiao Likun Signed-off-by: Bob Peterson --- fs/gfs2/glops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5e69636d4dd3..28c203a02960 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -470,7 +470,7 @@ static int inode_go_lock(struct gfs2_holder *gh) (gh->gh_state == LM_ST_EXCLUSIVE)) { spin_lock(&sdp->sd_trunc_lock); if (list_empty(&ip->i_trunc_list)) - list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + list_add(&ip->i_trunc_list, &sdp->sd_trunc_list); spin_unlock(&sdp->sd_trunc_lock); wake_up(&sdp->sd_quota_wait); return 1; From e477b24b507998bc6568316a2e034025960d2404 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 21 Jul 2017 07:48:22 -0500 Subject: [PATCH 06/29] gfs2: add flag REQ_PRIO for metadata I/O When gfs2 does metadata I/O, only REQ_META is used as a metadata hint of the bio. But flag REQ_META is just a hint for block trace, not for block layer code to handle a bio as metadata request. For some of metadata I/Os of gfs2, A REQ_PRIO flag on the metadata bio would be very informative to block layer code. For example, if bcache is used as a I/O cache for gfs2, it will be possible for bcache code to get the hint and cache the pre-fetched metadata blocks on cache device. This behavior may be helpful to improve metadata I/O performance if the following requests hit the cache. Here are the locations in gfs2 code where a REQ_PRIO flag should be added, - All places where REQ_READAHEAD is used, gfs2 code uses this flag for metadata read ahead. - In gfs2_meta_rq() where the first metadata block is read in. - In gfs2_write_buf_to_page(), read in quota metadata blocks to have them up to date. These metadata blocks are probably to be accessed again in future, adding a REQ_PRIO flag may have bcache to keep such metadata in fast cache device. For system without a cache layer, REQ_PRIO can still provide hint to block layer to handle metadata requests more properly. Signed-off-by: Coly Li Signed-off-by: Bob Peterson --- fs/gfs2/bmap.c | 5 +++-- fs/gfs2/dir.c | 4 +++- fs/gfs2/meta_io.c | 6 ++++-- fs/gfs2/quota.c | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 9fa3aef9a5b3..fa3ea29f39cf 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -291,8 +291,9 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, - rabh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + rabh); continue; } unlock_buffer(rabh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index db427658ccd9..0741e4018f8c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1514,7 +1514,9 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + bh); continue; } brelse(bh); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 4da7745c890a..61ef6c9be816 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -453,7 +453,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh); dblock++; extlen--; @@ -462,7 +462,9 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c2ca9566b764..739adf105d7f 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -730,7 +730,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; From df3d87bde121213560fde0edb71bc46f0f75692c Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 18 Jul 2017 11:35:04 -0500 Subject: [PATCH 07/29] GFS2: Introduce helper for clearing gl_object This patch introduces a new helper function in glock.h that clears gl_object, with an added integrity check. An additional integrity check has been added to glock_set_object, plus comments. This is step 1 in a series to ensure gl_object integrity. Signed-off-by: Bob Peterson Reviewed-by: Andreas Gruenbacher --- fs/gfs2/glock.h | 34 ++++++++++++++++++++++++++++++++++ fs/gfs2/inode.c | 4 ++-- fs/gfs2/super.c | 4 ++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 9ad4a6ac6c84..526d2123f758 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -13,6 +13,7 @@ #include #include #include "incore.h" +#include "util.h" /* Options for hostdata parser */ @@ -257,11 +258,44 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) return gh->gh_gl; } +/** + * glock_set_object - set the gl_object field of a glock + * @gl: the glock + * @object: the object + */ static inline void glock_set_object(struct gfs2_glock *gl, void *object) { spin_lock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL)) + gfs2_dump_glock(NULL, gl); gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); } +/** + * glock_clear_object - clear the gl_object field of a glock + * @gl: the glock + * @object: the object + * + * I'd love to similarly add this: + * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object)) + * gfs2_dump_glock(NULL, gl); + * Unfortunately, that's not possible because as soon as gfs2_delete_inode + * frees the block in the rgrp, another process can reassign it for an I_NEW + * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget. + * That means gfs2_delete_inode may subsequently try to call this function + * for a glock that's already pointing to a brand new inode. If we clear the + * new inode's gl_object, we'll introduce metadata corruption. Function + * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also + * tries to clear gl_object, so it's more than just gfs2_delete_inode. + * + */ +static inline void glock_clear_object(struct gfs2_glock *gl, void *object) +{ + spin_lock(&gl->gl_lockref.lock); + if (gl->gl_object == object) + gl->gl_object = NULL; + spin_unlock(&gl->gl_lockref.lock); +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index f9302f16a28e..2578bd824e34 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -201,14 +201,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); - glock_set_object(ip->i_gl, NULL); + glock_clear_object(ip->i_gl, ip); fail: iget_failed(inode); return ERR_PTR(error); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index fdedec379b78..5fdc54158ff6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1640,13 +1640,13 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - glock_set_object(ip->i_gl, NULL); + glock_clear_object(ip->i_gl, ip); wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); } From 4d7c18c7df89ef549f2de79b0faf873b49dea57a Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 18 Jul 2017 12:15:01 -0500 Subject: [PATCH 08/29] GFS2: Set gl_object in inode lookup only after block type check Before this patch, the inode glock's gl_object was set after a reference was acquired, but before the block type was verified. In cases where the block was unlinked, then freed and reused on another node, a residule delete callback (delete_work) would try to look up the inode, eventually failing the block check, but only after it overwrites gl_object with a pointer to the wrong inode. This patch moves the assignment of gl_object after the block check so it won't be improperly overwritten. Likewise, at the end of the function, gfs2_inode_lookup was clearing gl_object after it unlocked the glock, which meant another process might free the glock in the meantime. This patch guards against that case. Signed-off-by: Bob Peterson Reviewed-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2578bd824e34..fd6e1da3c5ab 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -145,7 +145,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail; flush_delayed_work(&ip->i_gl->gl_work); - glock_set_object(ip->i_gl, ip); error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) @@ -170,6 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } } + glock_set_object(ip->i_gl, ip); set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) @@ -206,9 +206,9 @@ fail_refresh: fail_put: if (io_gl) gfs2_glock_put(io_gl); + glock_clear_object(ip->i_gl, ip); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); - glock_clear_object(ip->i_gl, ip); fail: iget_failed(inode); return ERR_PTR(error); From 9c1b28081f43c0f14ccbcad02a6e0f227c072da2 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 18 Jul 2017 12:26:07 -0500 Subject: [PATCH 09/29] GFS2: Clear gl_object if gfs2_create_inode fails If function gfs2_create_inode fails after the inode has been created (for example, if the inode_refresh fails for some reason) the function was setting gl_object but never clearing it again. The glocks are left pointing to a freed inode. This patch adds the calls to clear gl_object in the appropriate error paths. Signed-off-by: Bob Peterson Reviewed-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index fd6e1da3c5ab..1427328c6c86 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -775,14 +775,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, return error; fail_gunlock3: + glock_clear_object(io_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); gfs2_glock_put(io_gl); fail_gunlock2: if (io_gl) clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); fail_free_inode: - if (ip->i_gl) + if (ip->i_gl) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_put(ip->i_gl); + } gfs2_rsqa_delete(ip, NULL); fail_free_acls: if (default_acl) From 240c6235dfe4fe46f2fd0f2b3c15b3a22100a75e Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 18 Jul 2017 12:36:01 -0500 Subject: [PATCH 10/29] GFS2: Clear gl_object when deleting an inode in gfs2_delete_inode This patch adds some calls to clear gl_object in function gfs2_delete_inode. Since we are deleting the inode, and the glock typically outlives the inode in core, we must clear gl_object so subsequent use of the glock (e.g. for a new inode in its place) will not have the old pointer sitting there. In error cases we need to tidy up after ourselves. In non-error cases, we need to clear gl_object before we set the block free in the bitmap so residules aren't left for potential inode creators. Signed-off-by: Bob Peterson Reviewed-by: Andreas Gruenbacher --- fs/gfs2/super.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5fdc54158ff6..87271a859a8d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1547,6 +1547,7 @@ static void gfs2_evict_inode(struct inode *inode) /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; @@ -1595,6 +1596,11 @@ alloc_failed: goto out_unlock; } + /* We're about to clear the bitmap for the dinode, but as soon as we + do, gfs2_create_inode can create another inode at the same block + location and try to set gl_object again. We clear gl_object here so + that subsequent inode creates don't see an old gl_object. */ + glock_clear_object(ip->i_gl, ip); error = gfs2_dinode_dealloc(ip); goto out_unlock; @@ -1623,14 +1629,17 @@ out_unlock: gfs2_rs_deltree(&ip->i_res); if (gfs2_holder_initialized(&ip->i_iopen_gh)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); } - if (gfs2_holder_initialized(&gh)) + if (gfs2_holder_initialized(&gh)) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_dq_uninit(&gh); + } if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: From 2d821a8b7192ed9d2ea0c30f3cb62b58aa46cf41 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 26 Jul 2017 10:18:19 -0500 Subject: [PATCH 11/29] GFS2: Don't bother trying to add rgrps to the lru list This patch removes a call to gfs2_glock_add_to_lru from function gfs2_clear_rgrpd. The call is just a waste of time because as soon as it adds it to the lru_list, the call to gfs2_glock_put takes it back off again. Signed-off-by: Bob Peterson --- fs/gfs2/rgrp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 836e38ba5d0a..29fbeee36fa6 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -706,7 +706,6 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) if (gl) { glock_set_object(gl, NULL); - gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } From 645ebd49f0583f91234aa043ef71ddebe7e8351e Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 26 Jul 2017 10:57:35 -0500 Subject: [PATCH 12/29] GFS2: Don't waste time locking lru_lock for non-lru glocks Before this patch, glock_dq would call gfs2_glock_remove_from_lru. For glocks that are never put on the LRU, such as the transaction glock, this just takes the spin_lock, determines there's nothing to be done because the list is empty, then unlocks again. This was causing unnecessary lock contention on the lru_lock spin_lock. This patch adds a check for GLOF_LRU in the glops before taking the spin_lock. Signed-off-by: Bob Peterson --- fs/gfs2/glock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c38ab6c81898..1029340fc8ba 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -150,6 +150,9 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { + if (!(gl->gl_ops->go_flags & GLOF_LRU)) + return; + spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); From b2fb7dab7f193909e9a0ad489a91666b8c55eff1 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 28 Jul 2017 07:22:55 -0500 Subject: [PATCH 13/29] GFS2: Delete debugfs files only after we evict the glocks This patch moves the call to gfs2_delete_debugfs_file so that it comes after the glock hash table has been cleared. This way we can query the debugfs files if umount hangs. Signed-off-by: Bob Peterson --- fs/gfs2/ops_fstype.c | 1 - fs/gfs2/super.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e76058d34b74..51752de54f11 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1388,7 +1388,6 @@ static void gfs2_kill_sb(struct super_block *sb) sdp->sd_root_dir = NULL; sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); - gfs2_delete_debugfs_file(sdp); free_percpu(sdp->sd_lkstats); kill_block_super(sb); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 87271a859a8d..1918bb5fc943 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -924,6 +924,7 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); + gfs2_delete_debugfs_file(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); From 61b91cfdc6c0c49a8cc8258cbee846551029d694 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 1 Aug 2017 09:54:33 -0500 Subject: [PATCH 14/29] gfs2: Fix trivial typos Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/inode.c | 2 +- fs/gfs2/super.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1427328c6c86..863749e29bf9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -109,7 +109,7 @@ static void gfs2_set_iop(struct inode *inode) * @no_addr: The inode number * @no_formal_ino: The inode generation number * @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; - * GFS2_BLKST_FREE do indicate not to verify) + * GFS2_BLKST_FREE to indicate not to verify) * * If @type is DT_UNKNOWN, the inode type is fetched from disk. * diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 1918bb5fc943..6c39bb1ec100 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1296,7 +1296,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) * gfs2_drop_inode - Drop an inode (test for remote unlink) * @inode: The inode to drop * - * If we've received a callback on an iopen lock then its because a + * If we've received a callback on an iopen lock then it's because a * remote node tried to deallocate the inode but failed due to this node * still having the inode open. Here we mark the link count zero * since we know that it must have reached zero if the GLF_DEMOTE flag From 0515480ad424f2d6853ffe448f444ba3c756c057 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 1 Aug 2017 11:18:26 -0500 Subject: [PATCH 15/29] gfs2: gfs2_glock_get: Wait on freeing glocks Keep glocks in their hash table until they are freed instead of removing them when their last reference is dropped. This allows to wait for any previous instances of a glock to go away in gfs2_glock_get before creating a new glocks. Special thanks to Andy Price for finding and fixing a problem which also required us to delete the rcu_read_unlock from the error case in function gfs2_glock_get. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/glock.c | 126 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 104 insertions(+), 22 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1029340fc8ba..11d48b964047 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,66 @@ static struct rhashtable_params ht_parms = { static struct rhashtable gl_hash_table; +#define GLOCK_WAIT_TABLE_BITS 12 +#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS) +static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned; + +struct wait_glock_queue { + struct lm_lockname *name; + wait_queue_entry_t wait; +}; + +static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct wait_glock_queue *wait_glock = + container_of(wait, struct wait_glock_queue, wait); + struct lm_lockname *wait_name = wait_glock->name; + struct lm_lockname *wake_name = key; + + if (wake_name->ln_sbd != wait_name->ln_sbd || + wake_name->ln_number != wait_name->ln_number || + wake_name->ln_type != wait_name->ln_type) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) +{ + u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0); + + return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); +} + +static void prepare_to_wait_on_glock(wait_queue_head_t **wq, + struct wait_glock_queue *wait, + struct lm_lockname *name) +{ + wait->name = name; + init_wait(&wait->wait); + wait->wait.func = glock_wake_function; + *wq = glock_waitqueue(name); + prepare_to_wait(*wq, &wait->wait, TASK_UNINTERRUPTIBLE); +} + +static void finish_wait_on_glock(wait_queue_head_t *wq, + struct wait_glock_queue *wait) +{ + finish_wait(wq, &wait->wait); +} + +/** + * wake_up_glock - Wake up waiters on a glock + * @gl: the glock + */ +static void wake_up_glock(struct gfs2_glock *gl) +{ + wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name); + + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name); +} + static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); @@ -96,6 +157,9 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); + smp_mb(); + wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); @@ -194,7 +258,6 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); - rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); @@ -679,6 +742,36 @@ static void glock_work_func(struct work_struct *work) spin_unlock(&gl->gl_lockref.lock); } +static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, + struct gfs2_glock *new) +{ + struct wait_glock_queue wait; + wait_queue_head_t *wq; + struct gfs2_glock *gl; + +again: + prepare_to_wait_on_glock(&wq, &wait, name); + rcu_read_lock(); + if (new) { + gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, + &new->gl_node, ht_parms); + if (IS_ERR(gl)) + goto out; + } else { + gl = rhashtable_lookup_fast(&gl_hash_table, + name, ht_parms); + } + if (gl && !lockref_get_not_dead(&gl->gl_lockref)) { + rcu_read_unlock(); + schedule(); + goto again; + } +out: + rcu_read_unlock(); + finish_wait_on_glock(wq, &wait); + return gl; +} + /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock @@ -705,15 +798,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct kmem_cache *cachep; int ret = 0; - rcu_read_lock(); - gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); - if (gl && !lockref_get_not_dead(&gl->gl_lockref)) - gl = NULL; - rcu_read_unlock(); - - *glp = gl; - if (gl) + gl = find_insert_glock(&name, NULL); + if (gl) { + *glp = gl; return 0; + } if (!create) return -ENOENT; @@ -767,10 +856,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->writeback_index = 0; } -again: - rcu_read_lock(); - tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node, - ht_parms); + tmp = find_insert_glock(&name, gl); if (!tmp) { *glp = gl; goto out; @@ -779,13 +865,7 @@ again: ret = PTR_ERR(tmp); goto out_free; } - if (lockref_get_not_dead(&tmp->gl_lockref)) { - *glp = tmp; - goto out_free; - } - rcu_read_unlock(); - cond_resched(); - goto again; + *glp = tmp; out_free: kfree(gl->gl_lksb.sb_lvbptr); @@ -793,7 +873,6 @@ out_free: atomic_dec(&sdp->sd_glock_disposal); out: - rcu_read_unlock(); return ret; } @@ -1806,7 +1885,7 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) int __init gfs2_glock_init(void) { - int ret; + int i, ret; ret = rhashtable_init(&gl_hash_table, &ht_parms); if (ret < 0) @@ -1835,6 +1914,9 @@ int __init gfs2_glock_init(void) return ret; } + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) + init_waitqueue_head(glock_wait_table + i); + return 0; } From eebd2e813f7ef688e22cd0b68aea78fb3d1ef19c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 1 Aug 2017 11:33:17 -0500 Subject: [PATCH 16/29] gfs2: Get rid of gfs2_set_nlink Remove gfs2_set_nlink which prevents the link count of an inode from becoming non-zero once it has reached zero. The next commit reduces the amount of waiting on glocks when an inode is evicted from memory. With that, an inode can become reallocated before all the remote-unlink callbacks from a previous delete are processed, which causes the link count to change from zero to non-zero. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/glops.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 28c203a02960..dac6559e2195 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -329,32 +329,6 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl) return 1; } -/** - * gfs2_set_nlink - Set the inode's link count based on on-disk info - * @inode: The inode in question - * @nlink: The link count - * - * If the link count has hit zero, it must never be raised, whatever the - * on-disk inode might say. When new struct inodes are created the link - * count is set to 1, so that we can safely use this test even when reading - * in on disk information for the first time. - */ - -static void gfs2_set_nlink(struct inode *inode, u32 nlink) -{ - /* - * We will need to review setting the nlink count here in the - * light of the forthcoming ro bind mount work. This is a reminder - * to do that. - */ - if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { - if (nlink == 0) - clear_nlink(inode); - else - set_nlink(inode, nlink); - } -} - static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { const struct gfs2_dinode *str = buf; @@ -376,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); - gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); From 71c1b2136835c88c231f7a5e3dc618f7568f84f7 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 1 Aug 2017 11:45:23 -0500 Subject: [PATCH 17/29] gfs2: gfs2_evict_inode: Put glocks asynchronously gfs2_evict_inode is called to free inodes under memory pressure. The function calls into DLM when an inode's last cluster-wide reference goes away (remote unlink) and to release the glock and associated DLM lock before finally destroying the inode. However, if DLM is blocked on memory to become available, calling into DLM again will deadlock. Avoid that by decoupling releasing glocks from destroying inodes in that case: with gfs2_glock_queue_put, glocks will be dequeued asynchronously in work queue context, when the associated inodes have likely already been destroyed. With this change, inodes can end up being unlinked, remote-unlink can be triggered, and then the inode can be reallocated before all remote-unlink callbacks are processed. To detect that, revalidate the link count in gfs2_evict_inode to make sure we're not deleting an allocated, referenced inode. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/glock.c | 10 +++++++++- fs/gfs2/glock.h | 2 ++ fs/gfs2/super.c | 30 ++++++++++++++++++++++++++++-- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 11d48b964047..5ad757f0ce60 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -171,7 +171,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -static void gfs2_glock_hold(struct gfs2_glock *gl) +void gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); @@ -264,6 +264,14 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } +/* + * Cause the glock to be put in work queue context. + */ +void gfs2_glock_queue_put(struct gfs2_glock *gl) +{ + gfs2_glock_queue_work(gl, 0); +} + /** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 526d2123f758..5e12220cc0c2 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -182,7 +182,9 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); +extern void gfs2_glock_hold(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_glock_queue_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh); extern void gfs2_holder_reinit(unsigned int state, u16 flags, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 6c39bb1ec100..4089dbe617a6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1501,6 +1501,22 @@ out_qs: return error; } +/** + * gfs2_glock_put_eventually + * @gl: The glock to put + * + * When under memory pressure, trigger a deferred glock put to make sure we + * won't call into DLM and deadlock. Otherwise, put the glock directly. + */ + +static void gfs2_glock_put_eventually(struct gfs2_glock *gl) +{ + if (current->flags & PF_MEMALLOC) + gfs2_glock_queue_put(gl); + else + gfs2_glock_put(gl); +} + /** * gfs2_evict_inode - Remove an inode from cache * @inode: The inode to evict @@ -1564,6 +1580,12 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } + /* + * The inode may have been recreated in the meantime. + */ + if (inode->i_nlink) + goto out_truncate; + alloc_failed: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { @@ -1653,12 +1675,16 @@ out: glock_clear_object(ip->i_gl, ip); wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); - gfs2_glock_put(ip->i_gl); + gfs2_glock_put_eventually(ip->i_gl); ip->i_gl = NULL; if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - glock_clear_object(ip->i_iopen_gh.gh_gl, ip); + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + glock_clear_object(gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_hold(gl); gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_put_eventually(gl); } } From 6a1c8f6dcf815d96197a2723781cf700925d17ed Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 1 Aug 2017 11:49:42 -0500 Subject: [PATCH 18/29] gfs2: Defer deleting inodes under memory pressure When under memory pressure and an inode's link count has dropped to zero, defer deleting the inode to the delete workqueue. This avoids calling into DLM under memory pressure, which can deadlock. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/super.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 4089dbe617a6..a83fe8260d2e 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1318,6 +1318,23 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); } + + /* + * When under memory pressure when an inode's link count has dropped to + * zero, defer deleting the inode to the delete workqueue. This avoids + * calling into DLM under memory pressure, which can deadlock. + */ + if (!inode->i_nlink && + unlikely(current->flags & PF_MEMALLOC) && + gfs2_holder_initialized(&ip->i_iopen_gh)) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_queue_put(gl); + return false; + } + return generic_drop_inode(inode); } @@ -1561,6 +1578,10 @@ static void gfs2_evict_inode(struct inode *inode) goto alloc_failed; } + /* Deletes should never happen under memory pressure anymore. */ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + goto out; + /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { From a91323e255fa8bc84b0acf63376b395c534a38fa Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 4 Aug 2017 07:40:45 -0500 Subject: [PATCH 19/29] gfs2: Clean up waiting on glocks The prepare_to_wait_on_glock and finish_wait_on_glock functions introduced in commit 56a365be "gfs2: gfs2_glock_get: Wait on freeing glocks" are better removed, resulting in cleaner code. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/glock.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5ad757f0ce60..ffca19598525 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -112,23 +112,6 @@ static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); } -static void prepare_to_wait_on_glock(wait_queue_head_t **wq, - struct wait_glock_queue *wait, - struct lm_lockname *name) -{ - wait->name = name; - init_wait(&wait->wait); - wait->wait.func = glock_wake_function; - *wq = glock_waitqueue(name); - prepare_to_wait(*wq, &wait->wait, TASK_UNINTERRUPTIBLE); -} - -static void finish_wait_on_glock(wait_queue_head_t *wq, - struct wait_glock_queue *wait) -{ - finish_wait(wq, &wait->wait); -} - /** * wake_up_glock - Wake up waiters on a glock * @gl: the glock @@ -754,11 +737,15 @@ static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, struct gfs2_glock *new) { struct wait_glock_queue wait; - wait_queue_head_t *wq; + wait_queue_head_t *wq = glock_waitqueue(name); struct gfs2_glock *gl; + wait.name = name; + init_wait(&wait.wait); + wait.wait.func = glock_wake_function; + again: - prepare_to_wait_on_glock(&wq, &wait, name); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); rcu_read_lock(); if (new) { gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, @@ -776,7 +763,7 @@ again: } out: rcu_read_unlock(); - finish_wait_on_glock(wq, &wait); + finish_wait(wq, &wait.wait); return gl; } From b066a4eebd4f5ea77f7e5c7d13104d38e1a1d4bf Mon Sep 17 00:00:00 2001 From: Abhi Das Date: Fri, 4 Aug 2017 12:15:32 -0500 Subject: [PATCH 20/29] gfs2: forcibly flush ail to relieve memory pressure On systems with low memory, it is possible for gfs2 to infinitely loop in balance_dirty_pages() under heavy IO (creating sparse files). balance_dirty_pages() attempts to write out the dirty pages via gfs2_writepages() but none are found because these dirty pages are being used by the journaling code in the ail. Normally, the journal has an upper threshold which when hit triggers an automatic flush of the ail. But this threshold can be higher than the number of allowable dirty pages and result in the ail never being flushed. This patch forces an ail flush when gfs2_writepages() fails to write anything. This is a good indication that the ail might be holding some dirty pages. Signed-off-by: Abhi Das Signed-off-by: Bob Peterson --- fs/gfs2/aops.c | 14 +++++++++++++- fs/gfs2/incore.h | 1 + fs/gfs2/log.c | 4 ++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ed7a2e252ad8..68ed06962537 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -234,7 +234,19 @@ out: static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + + /* + * Even if we didn't write any pages here, we might still be holding + * dirty pages in the ail. We forcibly flush the ail because we don't + * want balance_dirty_pages() to loop indefinitely trying to write out + * pages held in the ail that it can't find. + */ + if (ret == 0) + set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); + + return ret; } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 73fce76e67ee..a7b0331c549d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -606,6 +606,7 @@ enum { SDF_NOJOURNALID = 6, SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, + SDF_FORCE_AIL_FLUSH = 9, }; enum gfs2_freeze_state { diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9a624f694400..31585c2d22fe 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -898,6 +898,10 @@ static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) { unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + + if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags)) + return 1; + return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= atomic_read(&sdp->sd_log_thresh2); } From cc1dfa8b7571ea16dec9a29e0f4c4cad90b2a761 Mon Sep 17 00:00:00 2001 From: Thomas Tai Date: Tue, 15 Aug 2017 11:54:09 -0500 Subject: [PATCH 21/29] gfs2: fix slab corruption during mounting and umounting gfs file system When using cman-3.0.12.1 and gfs2-utils-3.0.12.1, mounting and unmounting GFS2 file system would cause kernel to hang. The slab allocator suggests that it is likely a double free memory corruption. The issue is traced back to v3.9-rc6 where a patch is submitted to use kzalloc() for storing a bitmap instead of using a local variable. The intention is to allocate memory during mount and to free memory during unmount. The original patch misses a code path which has already freed the memory and caused memory corruption. This patch sets the memory pointer to NULL after the memory is freed, so that double free memory corruption will not happen. gdlm_mount() '-- set_recover_size() which use kzalloc() '-- if dlm does not support ops callbacks then '--- free_recover_size() which use kfree() gldm_unmount() '-- free_recover_size() which use kfree() Previous patch which introduced the double free issue is commit 57c7310b8eb9 ("GFS2: use kmalloc for lvb bitmap") Signed-off-by: Thomas Tai Signed-off-by: Bob Peterson Reviewed-by: Liam R. Howlett --- fs/gfs2/lock_dlm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0515f0a68637..1d98b8a36eb3 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1059,6 +1059,7 @@ static void free_recover_size(struct lm_lockstruct *ls) ls->ls_recover_submit = NULL; ls->ls_recover_result = NULL; ls->ls_recover_size = 0; + ls->ls_lvb_bits = NULL; } /* dlm calls before it does lock recovery */ From 942b0cddfbf66295effc9fd879ca85ae10638565 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 16 Aug 2017 11:30:06 -0500 Subject: [PATCH 22/29] GFS2: Withdraw for IO errors writing to the journal or statfs Before this patch, if GFS2 encountered IO errors while writing to the journal, it would not report the problem, so they would go unnoticed, sometimes for many hours. Sometimes this would only be noticed later, when recovery tried to do journal replay and failed due to invalid metadata at the blocks that resulted in IO errors. This patch makes GFS2's log daemon check for IO errors. If it encounters one, it withdraws from the file system and reports why in dmesg. A similar action is taken when IO errors occur when writing to the system statfs file. These errors are also reported back to any callers of fsync, since that requires the journal to be flushed. Therefore, any IO errors that would previously go unnoticed are now noticed and the file system is withdrawn as early as possible, thus preventing further file system damage. Also note that this reintroduces superblock variable sd_log_error, which Christoph removed with commit f729b66fca. Signed-off-by: Bob Peterson --- fs/gfs2/incore.h | 1 + fs/gfs2/log.c | 9 +++++++++ fs/gfs2/lops.c | 7 +++++-- fs/gfs2/quota.c | 5 ++++- fs/gfs2/super.c | 4 ++-- 5 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index a7b0331c549d..0ce0b334f412 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -817,6 +817,7 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; + int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 31585c2d22fe..f72c44231406 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -923,6 +923,15 @@ int gfs2_logd(void *data) while (!kthread_should_stop()) { + /* Check for errors writing to the journal */ + if (sdp->sd_log_error) { + gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: error %d: " + "withdrawing the file system to " + "prevent further damage.\n", + sdp->sd_fsname, sdp->sd_log_error); + } + did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3010f9edd177..7dabbe721dba 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -207,8 +207,11 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_status) - fs_err(sdp, "Error %d writing to log\n", bio->bi_status); + if (bio->bi_status) { + fs_err(sdp, "Error %d writing to journal, jid=%u\n", + bio->bi_status, sdp->sd_jdesc->jd_jid); + wake_up(&sdp->sd_logd_waitq); + } bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 739adf105d7f..e647938432bd 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1474,8 +1474,11 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); + sdp->sd_log_error = error; + wake_up(&sdp->sd_logd_waitq); + } } static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index a83fe8260d2e..769841185ce5 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -944,9 +944,9 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_quota_sync(sb, -1); - if (wait && sdp) + if (wait) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); - return 0; + return sdp->sd_log_error; } void gfs2_freeze_func(struct work_struct *work) From 561b796987af8b169c47375194527219ef5b0974 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 22 Aug 2017 12:17:35 -0500 Subject: [PATCH 23/29] gfs2: Silence gcc format-truncation warning Enlarge sd_fsname to be big enough for the longest long lock table name and an arbitrary journal number. This silences two -Wformat-truncation warnings with gcc 7.1.1. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/incore.h | 2 +- fs/gfs2/ops_fstype.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0ce0b334f412..6e18e9793ec4 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -833,7 +833,7 @@ struct gfs2_sbd { atomic_t sd_freeze_state; struct mutex sd_freeze_mutex; - char sd_fsname[GFS2_FSNAME_LEN]; + char sd_fsname[GFS2_FSNAME_LEN + 3 * sizeof(int) + 2]; char sd_table_name[GFS2_FSNAME_LEN]; char sd_proto_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 51752de54f11..c0a4b3778f3f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1113,7 +1113,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent return error; } - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); error = gfs2_sys_fs_add(sdp); /* @@ -1159,10 +1159,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent } if (sdp->sd_args.ar_spectator) - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s", sdp->sd_table_name); else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u", sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); error = init_inodes(sdp, DO); From 27c3b415f6ffe6d29a95c8187e35436fcc9638b6 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 18 Aug 2017 09:15:13 -0500 Subject: [PATCH 24/29] GFS2: Fix up some sparse warnings This patch cleans up various pieces of GFS2 to avoid sparse errors. This doesn't fix them all, but it fixes several. The first error, in function glock_hash_walk was a genuine bug where the rhashtable could be started and not stopped. Signed-off-by: Bob Peterson --- fs/gfs2/glock.c | 9 ++++++--- fs/gfs2/lock_dlm.c | 4 +--- fs/gfs2/util.h | 1 + fs/gfs2/xattr.c | 1 + 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ffca19598525..4178417249c3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1550,14 +1550,15 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) do { gl = ERR_PTR(rhashtable_walk_start(&iter)); - if (gl) - continue; + if (IS_ERR(gl)) + goto walk_stop; while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) - if ((gl->gl_name.ln_sbd == sdp) && + if (gl->gl_name.ln_sbd == sdp && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); +walk_stop: rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); @@ -1940,6 +1941,7 @@ static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; @@ -1972,6 +1974,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, } static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) + __releases(RCU) { struct gfs2_glock_iter *gi = seq->private; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 1d98b8a36eb3..65f33a0ac190 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -23,8 +23,6 @@ #include "sys.h" #include "trace_gfs2.h" -extern struct workqueue_struct *gfs2_control_wq; - /** * gfs2_update_stats - Update time based stats * @mv: Pointer to mean/variance structure to update @@ -1176,7 +1174,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, spin_unlock(&ls->ls_recover_spin); } -const struct dlm_lockspace_ops gdlm_lockspace_ops = { +static const struct dlm_lockspace_ops gdlm_lockspace_ops = { .recover_prep = gdlm_recover_prep, .recover_slot = gdlm_recover_slot, .recover_done = gdlm_recover_done, diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index c81295f407f6..3926f95a6eb7 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -151,6 +151,7 @@ extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; extern struct kmem_cache *gfs2_qadata_cachep; extern mempool_t *gfs2_page_pool; +extern struct workqueue_struct *gfs2_control_wq; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 54179554c7d2..cf694de4991a 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -25,6 +25,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "super.h" #include "trans.h" #include "util.h" From 7023a0b16f66a2f1358c95989d23142d8191fd6e Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 30 Aug 2017 07:46:24 -0500 Subject: [PATCH 25/29] GFS2: Fix gl_object warnings The following cleanup is needed to avoid spilling the syslog with false warnings. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 29fbeee36fa6..95b2a57ded33 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -705,7 +705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - glock_set_object(gl, NULL); + glock_clear_object(gl, rgd); gfs2_glock_put(gl); } From d296b15ed58231bd991c0fb0f3592d595539bcd1 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 30 Aug 2017 07:50:03 -0500 Subject: [PATCH 26/29] gfs2: constify rhashtable_params rhashtable_params are not supposed to change at runtime. All Functions rhashtable_* working with const rhashtable_params provided by . So mark the non-const structs as const. Signed-off-by: Arvind Yadav Signed-off-by: Bob Peterson --- fs/gfs2/glock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 4178417249c3..98e845b7841b 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) -static struct rhashtable_params ht_parms = { +static const struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), From c4a9d1892f1ce6fe040b717b68bd21e689cc2410 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 30 Aug 2017 09:26:09 -0500 Subject: [PATCH 27/29] GFS2: Fix non-recursive truncate bug Before this patch if you truncated a file to a smaller size it wasn't freeing all the blocks properly. There are two reasons. First, the metapath comparison was not comparing previous heights. I added a function, mp_eq_to_hgt, which checks the metapath at all heights prior to the target height. Second, in function find_nonnull_ptr, it needed to zero out all pointers for heights following the target height. Translated into decimal integer terms, this way a number like 299, when incremented, becomes 300, not 399. The 2 gets incremented to 3, and the following digits need to be reset. These two things allow the truncate state machine to properly find the blocks it needs to delete. Signed-off-by: Bob Peterson --- fs/gfs2/bmap.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index fa3ea29f39cf..3dd0cceefa43 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1104,8 +1104,15 @@ static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, while (true) { ptr = metapointer(h, mp); - if (*ptr) /* if we have a non-null pointer */ + if (*ptr) { /* if we have a non-null pointer */ + /* Now zero the metapath after the current height. */ + h++; + if (h < GFS2_MAX_META_HEIGHT) + memset(&mp->mp_list[h], 0, + (GFS2_MAX_META_HEIGHT - h) * + sizeof(mp->mp_list[0])); return true; + } if (mp->mp_list[h] < ptrs) mp->mp_list[h]++; @@ -1121,6 +1128,13 @@ enum dealloc_states { DEALLOC_DONE = 3, /* process complete */ }; +static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h) +{ + if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0]))) + return false; + return true; +} + /** * trunc_dealloc - truncate a file down to a desired size * @ip: inode to truncate @@ -1198,8 +1212,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) /* If we're truncating to a non-zero size and the mp is at the beginning of file for the strip height, we need to preserve the first metadata pointer. */ - preserve1 = (newsize && - (mp.mp_list[mp_h] == nbof[mp_h])); + preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h)); bh = mp.mp_bh[mp_h]; gfs2_assert_withdraw(sdp, bh); if (gfs2_assert_withdraw(sdp, From 54aae14beee6a6e9f72358f1873b3e497029c41d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Wed, 30 Aug 2017 07:26:30 -0500 Subject: [PATCH 28/29] gfs2: don't return ENODATA in __gfs2_xattr_set unless replacing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function __gfs2_xattr_set() will return -ENODATA when called to remove a xattr that does not exist. The result is that setfacl will show an exit status of 1 when called to set only a file's mode bits (on a file with no ACLs), despite succeeding. A "No data available" error will be printed as well. To fix this return 0 instead, except when the XATTR_REPLACE flag is set, in which case -ENODATA is appropriate. This is consistent with how most other xattr setting functions work, in other filesystems. Signed-off-by: Ernesto A. Fernández Signed-off-by: Bob Peterson --- fs/gfs2/xattr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index cf694de4991a..ea09e41dbb49 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1210,8 +1210,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, if (namel > GFS2_EA_MAX_NAME_LEN) return -ERANGE; - if (value == NULL) - return gfs2_xattr_remove(ip, type, name); + if (value == NULL) { + error = gfs2_xattr_remove(ip, type, name); + if (error == -ENODATA && !(flags & XATTR_REPLACE)) + error = 0; + return error; + } if (ea_check_size(sdp, namel, size)) return -ERANGE; From 309e8cda596f6552a32dd14b969ce9b17f837f2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Thu, 31 Aug 2017 07:53:15 -0500 Subject: [PATCH 29/29] gfs2: preserve i_mode if __gfs2_set_acl() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When changing a file's acl mask, __gfs2_set_acl() will first set the group bits of i_mode to the value of the mask, and only then set the actual extended attribute representing the new acl. If the second part fails (due to lack of space, for example) and the file had no acl attribute to begin with, the system will from now on assume that the mask permission bits are actual group permission bits, potentially granting access to the wrong users. Prevent this by only changing the inode mode after the acl has been set. Signed-off-by: Ernesto A. Fernández Signed-off-by: Bob Peterson --- fs/gfs2/acl.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index a6d962323790..9d5eecb123de 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -116,6 +116,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct gfs2_holder gh; bool need_unlock = false; int ret; + umode_t mode; if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) return -E2BIG; @@ -130,17 +131,19 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) return ret; need_unlock = true; } - if (type == ACL_TYPE_ACCESS && acl) { - umode_t mode = inode->i_mode; - ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + mode = inode->i_mode; + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &mode, &acl); if (ret) goto unlock; - if (mode != inode->i_mode) - mark_inode_dirty(inode); } ret = __gfs2_set_acl(inode, acl, type); + if (!ret && mode != inode->i_mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } unlock: if (need_unlock) gfs2_glock_dq_uninit(&gh);