From 98bd5c547ef2300f915fc1adce5b6f25c195d4d4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 19 Jan 2015 14:21:02 +0100 Subject: [PATCH 1/6] btrfs: sync ioctl, handle errors after transaction start The version merged to 3.19 did not handle errors from start_trancaction and could pass an invalid pointer to commit_transaction. Fixes: 6b5fe46dfa52441f ("btrfs: do commit in sync_fs if there are pending changes") Reported-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 60f7cbe815e9..c86fb5438454 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1001,9 +1001,9 @@ int btrfs_sync_fs(struct super_block *sb, int wait) if (fs_info->pending_changes == 0) return 0; trans = btrfs_start_transaction(root, 0); - } else { - return PTR_ERR(trans); } + if (IS_ERR(trans)) + return PTR_ERR(trans); } return btrfs_commit_transaction(trans, root); } From 379d6854a2092e38b6e56a8067d922e31461b7e2 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Fri, 9 Jan 2015 17:37:52 +0900 Subject: [PATCH 2/6] Btrfs: fix incorrect freeing in scrub_stripe The address that should be freed is not 'ppath' but 'path'. Signed-off-by: Tsutomu Itoh Reviewed-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9e1569ffbf6e..2f0fbc374e87 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3053,7 +3053,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ppath = btrfs_alloc_path(); if (!ppath) { - btrfs_free_path(ppath); + btrfs_free_path(path); return -ENOMEM; } From 75c68e9fbbdfc04467c9edcac76be998beaa630b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 16 Jan 2015 13:24:40 +0000 Subject: [PATCH 3/6] Btrfs: fix race deleting block group from space_info->ro_bgs list When removing a block group we were deleting it from its space_info's ro_bgs list without the correct protection - the space info's spinlock. Fix this by doing the list delete while holding the spinlock of the corresponding space info, which is the correct lock for any operation on that list. This issue was introduced in the 3.19 kernel by the following change: Btrfs: move read only block groups onto their own list V2 commit 633c0aad4c0243a506a3e8590551085ad78af82d I ran into a kernel crash while a task was running statfs, which iterates the space_info->ro_bgs list while holding the space info's spinlock, and another task was deleting it from the same list, without holding that spinlock, as part of the block group remove operation (while running the function btrfs_remove_block_group). This happened often when running the stress test xfstests/generic/038 I recently made. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7e607416755a..0b180708bf79 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1171,6 +1171,7 @@ struct btrfs_space_info { struct percpu_counter total_bytes_pinned; struct list_head list; + /* Protected by the spinlock 'lock'. */ struct list_head ro_bgs; struct rw_semaphore groups_sem; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 15116585e714..a684086c3c81 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9422,7 +9422,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - list_del_init(&block_group->ro_list); if (list_empty(&block_group->space_info->block_groups[index])) { kobj = block_group->space_info->block_group_kobjs[index]; block_group->space_info->block_group_kobjs[index] = NULL; @@ -9464,6 +9463,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); + list_del_init(&block_group->ro_list); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; block_group->space_info->disk_total -= block_group->key.offset * factor; From 6e1103a6e9b19dbdc348077d04a546b626911fc5 Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Thu, 25 Dec 2014 18:21:41 +0900 Subject: [PATCH 4/6] btrfs: fix state->private cast on 32 bit machines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Suppress the following warning displayed on building 32bit (i686) kernel. =============================================================================== ... CC [M] fs/btrfs/extent_io.o fs/btrfs/extent_io.c: In function ‘btrfs_free_io_failure_record’: fs/btrfs/extent_io.c:2193:13: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] failrec = (struct io_failure_record *)state->private; ... =============================================================================== Signed-off-by: Satoru Takeuchi Reported-by: Chris Murphy Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4ebabd237153..790dbae3343c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2190,7 +2190,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) next = next_state(state); - failrec = (struct io_failure_record *)state->private; + failrec = (struct io_failure_record *)(unsigned long)state->private; free_extent_state(state); kfree(failrec); From 6c9fe14f9d64cc12401a825a60ec5c5723496ca4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 20 Jan 2015 17:05:33 +0800 Subject: [PATCH 5/6] btrfs: Fix the bug that fs_info->pending_changes is never cleared. Fs_info->pending_changes is never cleared since the original code uses cmpxchg(&fs_info->pending_changes, 0, 0), which will only clear it if pending_changes is already 0. This will cause a lot of problem when mount it with inode_cache mount option. If the btrfs is mounted as inode_cache, pending_changes will always be 1, even when the fs is frozen. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a605d4e2f2bc..e88b59d13439 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2118,7 +2118,7 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) unsigned long prev; unsigned long bit; - prev = cmpxchg(&fs_info->pending_changes, 0, 0); + prev = xchg(&fs_info->pending_changes, 0); if (!prev) return; From a53f4f8e9c8ebe6c9ee3b34c368913aae9876e22 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 19 Jan 2015 15:42:41 +0800 Subject: [PATCH 6/6] btrfs: Don't call btrfs_start_transaction() on frozen fs to avoid deadlock. Commit 6b5fe46dfa52 (btrfs: do commit in sync_fs if there are pending changes) will call btrfs_start_transaction() in sync_fs(), to handle some operations needed to be done in next transaction. However this can cause deadlock if the filesystem is frozen, with the following sys_r+w output: [ 143.255932] Call Trace: [ 143.255936] [] schedule+0x29/0x70 [ 143.255939] [] __sb_start_write+0xb3/0x100 [ 143.255971] [] start_transaction+0x2e6/0x5a0 [btrfs] [ 143.255992] [] btrfs_start_transaction+0x1b/0x20 [btrfs] [ 143.256003] [] btrfs_sync_fs+0xca/0xd0 [btrfs] [ 143.256007] [] sync_fs_one_sb+0x20/0x30 [ 143.256011] [] iterate_supers+0xe1/0xf0 [ 143.256014] [] sys_sync+0x55/0x90 [ 143.256017] [] system_call_fastpath+0x12/0x17 [ 143.256111] Call Trace: [ 143.256114] [] schedule+0x29/0x70 [ 143.256119] [] rwsem_down_write_failed+0x1c5/0x2d0 [ 143.256123] [] call_rwsem_down_write_failed+0x13/0x20 [ 143.256131] [] thaw_super+0x28/0xc0 [ 143.256135] [] do_vfs_ioctl+0x3f5/0x540 [ 143.256187] [] SyS_ioctl+0x91/0xb0 [ 143.256213] [] system_call_fastpath+0x12/0x17 The reason is like the following: (Holding s_umount) VFS sync_fs staff: |- btrfs_sync_fs() |- btrfs_start_transaction() |- sb_start_intwrite() (Waiting thaw_fs to unfreeze) VFS thaw_fs staff: thaw_fs() (Waiting sync_fs to release s_umount) So deadlock happens. This can be easily triggered by fstest/generic/068 with inode_cache mount option. The fix is to check if the fs is frozen, if the fs is frozen, just return and waiting for the next transaction. Cc: David Sterba Reported-by: Gui Hecheng Signed-off-by: Qu Wenruo [enhanced comment, changed to SB_FREEZE_WRITE] Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c86fb5438454..6f49b2872a64 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1000,6 +1000,16 @@ int btrfs_sync_fs(struct super_block *sb, int wait) */ if (fs_info->pending_changes == 0) return 0; + /* + * A non-blocking test if the fs is frozen. We must not + * start a new transaction here otherwise a deadlock + * happens. The pending operations are delayed to the + * next commit after thawing. + */ + if (__sb_start_write(sb, SB_FREEZE_WRITE, false)) + __sb_end_write(sb, SB_FREEZE_WRITE); + else + return 0; trans = btrfs_start_transaction(root, 0); } if (IS_ERR(trans))