1
0
Fork 0
alistair23-linux/fs/btrfs/volumes.c

7993 lines
210 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/sched.h>
btrfs: fix lockdep splat in add_missing_dev commit fccc0007b8dc952c6bc0805cdf842eb8ea06a639 upstream. Nikolay reported a lockdep splat in generic/476 that I could reproduce with btrfs/187. ====================================================== WARNING: possible circular locking dependency detected 5.9.0-rc2+ #1 Tainted: G W ------------------------------------------------------ kswapd0/100 is trying to acquire lock: ffff9e8ef38b6268 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 but task is already holding lock: ffffffffa9d74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x65/0x80 slab_pre_alloc_hook.constprop.0+0x20/0x200 kmem_cache_alloc_trace+0x3a/0x1a0 btrfs_alloc_device+0x43/0x210 add_missing_dev+0x20/0x90 read_one_chunk+0x301/0x430 btrfs_read_sys_array+0x17b/0x1b0 open_ctree+0xa62/0x1896 btrfs_mount_root.cold+0x12/0xea legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x10d/0x379 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 path_mount+0x434/0xc00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 btrfs_chunk_alloc+0x125/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_lookup_inode+0x2a/0x8f __btrfs_update_delayed_inode+0x80/0x240 btrfs_commit_inode_delayed_inode+0x119/0x120 btrfs_evict_inode+0x357/0x500 evict+0xcf/0x1f0 vfs_rmdir.part.0+0x149/0x160 do_rmdir+0x136/0x1a0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: __lock_acquire+0x1184/0x1fa0 lock_acquire+0xa4/0x3d0 __mutex_lock+0x7e/0x7e0 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 kthread+0x138/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> &fs_info->chunk_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&fs_info->chunk_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/100: #0: ffffffffa9d74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffffa9d65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 #2: ffff9e8e9da260e0 (&type->s_umount_key#48){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 stack backtrace: CPU: 1 PID: 100 Comm: kswapd0 Tainted: G W 5.9.0-rc2+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x92/0xc8 check_noncircular+0x12d/0x150 __lock_acquire+0x1184/0x1fa0 lock_acquire+0xa4/0x3d0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 __mutex_lock+0x7e/0x7e0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? lock_acquire+0xa4/0x3d0 ? btrfs_evict_inode+0x11e/0x500 ? find_held_lock+0x2b/0x80 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 ? _raw_spin_unlock_irqrestore+0x46/0x60 ? add_wait_queue_exclusive+0x70/0x70 ? balance_pgdat+0x670/0x670 kthread+0x138/0x160 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 This is because we are holding the chunk_mutex when we call btrfs_alloc_device, which does a GFP_KERNEL allocation. We don't want to switch that to a GFP_NOFS lock because this is the only place where it matters. So instead use memalloc_nofs_save() around the allocation in order to avoid the lockdep splat. Reported-by: Nikolay Borisov <nborisov@suse.com> CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-08-31 08:52:42 -06:00
#include <linux/sched/mm.h>
#include <linux/bio.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 02:04:11 -06:00
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
.dev_stripes = 1,
.devs_max = 0, /* 0 == as many as possible */
.devs_min = 4,
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
.nparity = 0,
.raid_name = "raid10",
.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
},
[BTRFS_RAID_RAID1] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 2,
.devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 2,
.ncopies = 2,
.nparity = 0,
.raid_name = "raid1",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
},
[BTRFS_RAID_DUP] = {
.sub_stripes = 1,
.dev_stripes = 2,
.devs_max = 1,
.devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 2,
.nparity = 0,
.raid_name = "dup",
.bg_flag = BTRFS_BLOCK_GROUP_DUP,
.mindev_error = 0,
},
[BTRFS_RAID_RAID0] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 2,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
.nparity = 0,
.raid_name = "raid0",
.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
.mindev_error = 0,
},
[BTRFS_RAID_SINGLE] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 1,
.devs_min = 1,
.tolerated_failures = 0,
.devs_increment = 1,
.ncopies = 1,
.nparity = 0,
.raid_name = "single",
.bg_flag = 0,
.mindev_error = 0,
},
[BTRFS_RAID_RAID5] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 2,
.tolerated_failures = 1,
.devs_increment = 1,
.ncopies = 1,
.nparity = 1,
.raid_name = "raid5",
.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
},
[BTRFS_RAID_RAID6] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 3,
.tolerated_failures = 2,
.devs_increment = 1,
.ncopies = 1,
.nparity = 2,
.raid_name = "raid6",
.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
},
};
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
if (index >= BTRFS_NR_RAID_TYPES)
return NULL;
return btrfs_raid_array[index].raid_name;
}
/*
* Fill @buf with textual description of @bg_flags, no more than @size_buf
* bytes including terminating null byte.
*/
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
int i;
int ret;
char *bp = buf;
u64 flags = bg_flags;
u32 size_bp = size_buf;
if (!flags) {
strcpy(bp, "NONE");
return;
}
#define DESCRIBE_FLAG(flag, desc) \
do { \
if (flags & (flag)) { \
ret = snprintf(bp, size_bp, "%s|", (desc)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
flags &= ~(flag); \
} \
} while (0)
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG
if (flags) {
ret = snprintf(bp, size_bp, "0x%llx|", flags);
size_bp -= ret;
}
if (size_bp < size_buf)
buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
/*
* The text is trimmed, it's up to the caller to provide sufficiently
* large buffer
*/
out_overflow:;
}
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map);
/*
* Device locking
* ==============
*
* There are several mutexes that protect manipulation of devices and low-level
* structures like chunks but not block groups, extents or files
*
* uuid_mutex (global lock)
* ------------------------
* protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
* the SCAN_DEV ioctl registration or from mount either implicitly (the first
* device) or requested by the device= mount option
*
* the mutex can be very coarse and can cover long-running operations
*
* protects: updates to fs_devices counters like missing devices, rw devices,
* seeding, structure cloning, opening/closing devices at mount/umount time
*
* global::fs_devs - add, remove, updates to the global list
*
btrfs: open device without device_list_mutex commit 18c850fdc5a801bad4977b0f1723761d42267e45 upstream. There's long existed a lockdep splat because we open our bdev's under the ->device_list_mutex at mount time, which acquires the bd_mutex. Usually this goes unnoticed, but if you do loopback devices at all suddenly the bd_mutex comes with a whole host of other dependencies, which results in the splat when you mount a btrfs file system. ====================================================== WARNING: possible circular locking dependency detected 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Not tainted ------------------------------------------------------ systemd-journal/509 is trying to acquire lock: ffff970831f84db0 (&fs_info->reloc_mutex){+.+.}-{3:3}, at: btrfs_record_root_in_trans+0x44/0x70 [btrfs] but task is already holding lock: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #6 (sb_pagefaults){.+.+}-{0:0}: __sb_start_write+0x13e/0x220 btrfs_page_mkwrite+0x59/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 -> #5 (&mm->mmap_lock#2){++++}-{3:3}: __might_fault+0x60/0x80 _copy_from_user+0x20/0xb0 get_sg_io_hdr+0x9a/0xb0 scsi_cmd_ioctl+0x1ea/0x2f0 cdrom_ioctl+0x3c/0x12b4 sr_block_ioctl+0xa4/0xd0 block_ioctl+0x3f/0x50 ksys_ioctl+0x82/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #4 (&cd->lock){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 sr_block_open+0xa2/0x180 __blkdev_get+0xdd/0x550 blkdev_get+0x38/0x150 do_dentry_open+0x16b/0x3e0 path_openat+0x3c9/0xa00 do_filp_open+0x75/0x100 do_sys_openat2+0x8a/0x140 __x64_sys_openat+0x46/0x70 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #3 (&bdev->bd_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 __blkdev_get+0x6a/0x550 blkdev_get+0x85/0x150 blkdev_get_by_path+0x2c/0x70 btrfs_get_bdev_and_sb+0x1b/0xb0 [btrfs] open_fs_devices+0x88/0x240 [btrfs] btrfs_open_devices+0x92/0xa0 [btrfs] btrfs_mount_root+0x250/0x490 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x119/0x380 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x8c6/0xca0 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_run_dev_stats+0x36/0x420 [btrfs] commit_cowonly_roots+0x91/0x2d0 [btrfs] btrfs_commit_transaction+0x4e6/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->tree_log_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_commit_transaction+0x48e/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_info->reloc_mutex){+.+.}-{3:3}: __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 __mutex_lock+0x7b/0x820 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 other info that might help us debug this: Chain exists of: &fs_info->reloc_mutex --> &mm->mmap_lock#2 --> sb_pagefaults Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_pagefaults); lock(&mm->mmap_lock#2); lock(sb_pagefaults); lock(&fs_info->reloc_mutex); *** DEADLOCK *** 3 locks held by systemd-journal/509: #0: ffff97083bdec8b8 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x12e/0x4b0 #1: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] #2: ffff97083144d6a8 (sb_internal){.+.+}-{0:0}, at: start_transaction+0x3f8/0x500 [btrfs] stack backtrace: CPU: 0 PID: 509 Comm: systemd-journal Not tainted 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack+0x92/0xc8 check_noncircular+0x134/0x150 __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] __mutex_lock+0x7b/0x820 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? kvm_sched_clock_read+0x14/0x30 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0xc/0xb0 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] ? sched_clock+0x5/0x10 do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x7fa3972fdbfe Code: Bad RIP value. Fix this by not holding the ->device_list_mutex at this point. The device_list_mutex exists to protect us from modifying the device list while the file system is running. However it can also be modified by doing a scan on a device. But this action is specifically protected by the uuid_mutex, which we are holding here. We cannot race with opening at this point because we have the ->s_mount lock held during the mount. Not having the ->device_list_mutex here is perfectly safe as we're not going to change the devices at this point. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> [ add some comments ] Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-07-17 13:12:27 -06:00
* does not protect: manipulation of the fs_devices::devices list in general
* but in mount context it could be used to exclude list modifications by eg.
* scan ioctl
*
* btrfs_device::name - renames (write side), read is RCU
*
* fs_devices::device_list_mutex (per-fs, with RCU)
* ------------------------------------------------
* protects updates to fs_devices::devices, ie. adding and deleting
*
* simple list traversal with read-only actions can be done with RCU protection
*
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
btrfs: open device without device_list_mutex commit 18c850fdc5a801bad4977b0f1723761d42267e45 upstream. There's long existed a lockdep splat because we open our bdev's under the ->device_list_mutex at mount time, which acquires the bd_mutex. Usually this goes unnoticed, but if you do loopback devices at all suddenly the bd_mutex comes with a whole host of other dependencies, which results in the splat when you mount a btrfs file system. ====================================================== WARNING: possible circular locking dependency detected 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Not tainted ------------------------------------------------------ systemd-journal/509 is trying to acquire lock: ffff970831f84db0 (&fs_info->reloc_mutex){+.+.}-{3:3}, at: btrfs_record_root_in_trans+0x44/0x70 [btrfs] but task is already holding lock: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #6 (sb_pagefaults){.+.+}-{0:0}: __sb_start_write+0x13e/0x220 btrfs_page_mkwrite+0x59/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 -> #5 (&mm->mmap_lock#2){++++}-{3:3}: __might_fault+0x60/0x80 _copy_from_user+0x20/0xb0 get_sg_io_hdr+0x9a/0xb0 scsi_cmd_ioctl+0x1ea/0x2f0 cdrom_ioctl+0x3c/0x12b4 sr_block_ioctl+0xa4/0xd0 block_ioctl+0x3f/0x50 ksys_ioctl+0x82/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #4 (&cd->lock){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 sr_block_open+0xa2/0x180 __blkdev_get+0xdd/0x550 blkdev_get+0x38/0x150 do_dentry_open+0x16b/0x3e0 path_openat+0x3c9/0xa00 do_filp_open+0x75/0x100 do_sys_openat2+0x8a/0x140 __x64_sys_openat+0x46/0x70 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #3 (&bdev->bd_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 __blkdev_get+0x6a/0x550 blkdev_get+0x85/0x150 blkdev_get_by_path+0x2c/0x70 btrfs_get_bdev_and_sb+0x1b/0xb0 [btrfs] open_fs_devices+0x88/0x240 [btrfs] btrfs_open_devices+0x92/0xa0 [btrfs] btrfs_mount_root+0x250/0x490 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x119/0x380 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x8c6/0xca0 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_run_dev_stats+0x36/0x420 [btrfs] commit_cowonly_roots+0x91/0x2d0 [btrfs] btrfs_commit_transaction+0x4e6/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->tree_log_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_commit_transaction+0x48e/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_info->reloc_mutex){+.+.}-{3:3}: __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 __mutex_lock+0x7b/0x820 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 other info that might help us debug this: Chain exists of: &fs_info->reloc_mutex --> &mm->mmap_lock#2 --> sb_pagefaults Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_pagefaults); lock(&mm->mmap_lock#2); lock(sb_pagefaults); lock(&fs_info->reloc_mutex); *** DEADLOCK *** 3 locks held by systemd-journal/509: #0: ffff97083bdec8b8 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x12e/0x4b0 #1: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] #2: ffff97083144d6a8 (sb_internal){.+.+}-{0:0}, at: start_transaction+0x3f8/0x500 [btrfs] stack backtrace: CPU: 0 PID: 509 Comm: systemd-journal Not tainted 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack+0x92/0xc8 check_noncircular+0x134/0x150 __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] __mutex_lock+0x7b/0x820 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? kvm_sched_clock_read+0x14/0x30 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0xc/0xb0 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] ? sched_clock+0x5/0x10 do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x7fa3972fdbfe Code: Bad RIP value. Fix this by not holding the ->device_list_mutex at this point. The device_list_mutex exists to protect us from modifying the device list while the file system is running. However it can also be modified by doing a scan on a device. But this action is specifically protected by the uuid_mutex, which we are holding here. We cannot race with opening at this point because we have the ->s_mount lock held during the mount. Not having the ->device_list_mutex here is perfectly safe as we're not going to change the devices at this point. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> [ add some comments ] Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-07-17 13:12:27 -06:00
* Is not required at mount and close times, because our device list is
* protected by the uuid_mutex at that point.
*
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
* several places (internally, ioctl)
*
* chunk_mutex
* -----------
* protects chunks, adding or removing during allocation, trim or when a new
* device is added/removed. Additionally it also protects post_commit_list of
* individual devices, since they can be added to the transaction's
* post_commit_list only with chunk_mutex held.
*
* cleaner_mutex
* -------------
* a big lock that is held by the cleaner thread and prevents running subvolume
* cleaning together with relocation or delayed iputs
*
*
* Lock nesting
* ============
*
* uuid_mutex
* volume_mutex
* device_list_mutex
* chunk_mutex
* balance_mutex
*
*
* Exclusive operations, BTRFS_FS_EXCL_OP
* ======================================
*
* Maintains the exclusivity of the following operations that apply to the
* whole filesystem and cannot run in parallel.
*
* - Balance (*)
* - Device add
* - Device remove
* - Device replace (*)
* - Resize
*
* The device operations (as above) can be in one of the following states:
*
* - Running state
* - Paused state
* - Completed state
*
* Only device operations marked with (*) can go into the Paused state for the
* following reasons:
*
* - ioctl (only Balance can be Paused through ioctl)
* - filesystem remounted as read-only
* - filesystem unmounted and mounted as read-only
* - system power-cycle and filesystem mounted as read-only
* - filesystem or device errors leading to forced read-only
*
* BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
* During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
* A device operation in Paused or Running state can be canceled or resumed
* either by ioctl (Balance only) or when remounted as read-write.
* BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
* completed.
*/
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
return &fs_uuids;
}
/*
* alloc_fs_devices - allocate struct btrfs_fs_devices
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
* @fsid: if not NULL, copy the UUID to fs_devices::fsid
* @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid
*
* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
* The returned struct is not linked onto any lists and can be destroyed with
* kfree() right away.
*/
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
const u8 *metadata_fsid)
{
struct btrfs_fs_devices *fs_devs;
fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
mutex_init(&fs_devs->device_list_mutex);
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
if (metadata_fsid)
memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
else if (fsid)
memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
return fs_devs;
}
void btrfs_free_device(struct btrfs_device *device)
{
WARN_ON(!list_empty(&device->post_commit_list));
rcu_string_free(device->name);
extent_io_tree_release(&device->alloc_state);
bio_put(device->flush_bio);
kfree(device);
}
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
WARN_ON(fs_devices->opened);
while (!list_empty(&fs_devices->devices)) {
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
btrfs_free_device(device);
}
kfree(fs_devices);
}
void __exit btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
while (!list_empty(&fs_uuids)) {
fs_devices = list_entry(fs_uuids.next,
struct btrfs_fs_devices, fs_list);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
/*
* Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
* Returned struct is not linked onto any lists and must be destroyed using
* btrfs_free_device.
*/
static struct btrfs_device *__alloc_device(void)
{
struct btrfs_device *dev;
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
return ERR_PTR(-ENOMEM);
/*
* Preallocate a bio that's always going to be used for flushing device
* barriers and matches the device lifespan
*/
dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
if (!dev->flush_bio) {
kfree(dev);
return ERR_PTR(-ENOMEM);
}
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
spin_lock_init(&dev->io_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
mm, page_alloc: distinguish between being unable to sleep, unwilling to sleep and avoiding waking kswapd __GFP_WAIT has been used to identify atomic context in callers that hold spinlocks or are in interrupts. They are expected to be high priority and have access one of two watermarks lower than "min" which can be referred to as the "atomic reserve". __GFP_HIGH users get access to the first lower watermark and can be called the "high priority reserve". Over time, callers had a requirement to not block when fallback options were available. Some have abused __GFP_WAIT leading to a situation where an optimisitic allocation with a fallback option can access atomic reserves. This patch uses __GFP_ATOMIC to identify callers that are truely atomic, cannot sleep and have no alternative. High priority users continue to use __GFP_HIGH. __GFP_DIRECT_RECLAIM identifies callers that can sleep and are willing to enter direct reclaim. __GFP_KSWAPD_RECLAIM to identify callers that want to wake kswapd for background reclaim. __GFP_WAIT is redefined as a caller that is willing to enter direct reclaim and wake kswapd for background reclaim. This patch then converts a number of sites o __GFP_ATOMIC is used by callers that are high priority and have memory pools for those requests. GFP_ATOMIC uses this flag. o Callers that have a limited mempool to guarantee forward progress clear __GFP_DIRECT_RECLAIM but keep __GFP_KSWAPD_RECLAIM. bio allocations fall into this category where kswapd will still be woken but atomic reserves are not used as there is a one-entry mempool to guarantee progress. o Callers that are checking if they are non-blocking should use the helper gfpflags_allow_blocking() where possible. This is because checking for __GFP_WAIT as was done historically now can trigger false positives. Some exceptions like dm-crypt.c exist where the code intent is clearer if __GFP_DIRECT_RECLAIM is used instead of the helper due to flag manipulations. o Callers that built their own GFP flags instead of starting with GFP_KERNEL and friends now also need to specify __GFP_KSWAPD_RECLAIM. The first key hazard to watch out for is callers that removed __GFP_WAIT and was depending on access to atomic reserves for inconspicuous reasons. In some cases it may be appropriate for them to use __GFP_HIGH. The second key hazard is callers that assembled their own combination of GFP flags instead of starting with something like GFP_KERNEL. They may now wish to specify __GFP_KSWAPD_RECLAIM. It's almost certainly harmless if it's missed in most cases as other activity will wake kswapd. Signed-off-by: Mel Gorman <mgorman@techsingularity.net> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Vitaly Wool <vitalywool@gmail.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 17:28:21 -07:00
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
return dev;
}
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
static noinline struct btrfs_fs_devices *find_fsid(
const u8 *fsid, const u8 *metadata_fsid)
{
struct btrfs_fs_devices *fs_devices;
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
ASSERT(fsid);
btrfs: Handle one more split-brain scenario during fsid change This commit continues hardening the scanning code to handle cases where power loss could have caused disks in a multi-disk filesystem to be in inconsistent state. Namely handle the situation that can occur when some of the disks in multi-disk fs have completed their fsid change i.e they have METADATA_UUID incompat flag set, have cleared the CHANGING_FSID_V2 flag and their fsid/metadata_uuid are different. At the same time the other half of the disks will have their fsid/metadata_uuid unchanged and will only have CHANGING_FSID_V2 flag. This is handled by introducing code in the scan path which: a) Handles the case when a device with CHANGING_FSID_V2 flag is scanned and as a result btrfs_fs_devices is created with matching fsid/metdata_uuid. Subsequently, when a device with completed fsid change is scanned it will detect this via the new code in find_fsid i.e that such an fs_devices exist that fsid_change flag is set to true, it's metadata_uuid/fsid match and the metadata_uuid of the scanned device matches that of the fs_devices. In this case, it's important to note that the devices which has its fsid change completed will have a higher generation number than the device with FSID_CHANGING_V2 flag set, so its superblock block will be used during mount. To prevent an assertion triggering because the sb used for mounting will have differing fsid/metadata_uuid than the ones in the fs_devices struct also add code in device_list_add which overwrites the values in fs_devices. b) Alternatively we can end up with a device that completed its fsid change be scanned first which will create the respective btrfs_fs_devices struct with differing fsid/metadata_uuid. In this case when a device with FSID_CHANGING_V2 flag set is scanned it will call the newly added find_fsid_inprogress function which will return the correct fs_devices. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:27 -06:00
if (metadata_fsid) {
/*
* Handle scanned device having completed its fsid change but
* belonging to a fs_devices that was created by first scanning
* a device which didn't have its fsid/metadata_uuid changed
* at all and the CHANGING_FSID_V2 flag set.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (fs_devices->fsid_change &&
memcmp(metadata_fsid, fs_devices->fsid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) == 0) {
return fs_devices;
}
}
btrfs: Handle final split-brain possibility during fsid change This patch lands the last case which needs to be handled by the fsid change code. Namely, this is the case where a multidisk filesystem has already undergone at least one successful fsid change i.e all disks have the METADATA_UUID incompat bit and power failure occurs as another fsid change is in progress. When such an event occurs, disks could be split in 2 groups. One of the groups will have both METADATA_UUID and CHANGING_FSID_V2 flags set coupled with old fsid/metadata_uuid pairs. The other group of disks will have only METADATA_UUID bit set and their fsid will be different than the one in disks in the first group. Here we look at the following cases: a) A disk from the first group is scanned first, so fs_devices is created with stale fsid/metdata_uuid. Then when a disk from the second group is scanned it needs to first check whether there exists such an fs_devices that has fsid_change set to true (because it was created with a disk having the CHANGING_FSID_V2 flag), the metadata_uuid and fsid of the fs_devices will be different (since it was created by a disk which already has had at least 1 successful fsid change) and finally the metadata_uuid of the fs_devices will equal that of the currently scanned disk (because metadata_uuid never really changes). When the correct fs_devices is found the information from the scanned disk will replace the current one in fs_devices since the scanned disk will have higher generation number. b) A disk from the second group is scanned so fs_devices is created as usual with differing fsid/metdata_uid. Then when a disk from the first group is scanned the code detects that it has both CHANGING_FSID_V2 and METADATA_UUID flags set and will search for fs_devices that has differing metadata_uuid/fsid and whose metadata_uuid is the same as that of the scanned device. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:28 -06:00
/*
* Handle scanned device having completed its fsid change but
* belonging to a fs_devices that was created by a device that
* has an outdated pair of fsid/metadata_uuid and
* CHANGING_FSID_V2 flag set.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (fs_devices->fsid_change &&
memcmp(fs_devices->metadata_uuid,
fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
memcmp(metadata_fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) == 0) {
return fs_devices;
}
}
btrfs: Handle one more split-brain scenario during fsid change This commit continues hardening the scanning code to handle cases where power loss could have caused disks in a multi-disk filesystem to be in inconsistent state. Namely handle the situation that can occur when some of the disks in multi-disk fs have completed their fsid change i.e they have METADATA_UUID incompat flag set, have cleared the CHANGING_FSID_V2 flag and their fsid/metadata_uuid are different. At the same time the other half of the disks will have their fsid/metadata_uuid unchanged and will only have CHANGING_FSID_V2 flag. This is handled by introducing code in the scan path which: a) Handles the case when a device with CHANGING_FSID_V2 flag is scanned and as a result btrfs_fs_devices is created with matching fsid/metdata_uuid. Subsequently, when a device with completed fsid change is scanned it will detect this via the new code in find_fsid i.e that such an fs_devices exist that fsid_change flag is set to true, it's metadata_uuid/fsid match and the metadata_uuid of the scanned device matches that of the fs_devices. In this case, it's important to note that the devices which has its fsid change completed will have a higher generation number than the device with FSID_CHANGING_V2 flag set, so its superblock block will be used during mount. To prevent an assertion triggering because the sb used for mounting will have differing fsid/metadata_uuid than the ones in the fs_devices struct also add code in device_list_add which overwrites the values in fs_devices. b) Alternatively we can end up with a device that completed its fsid change be scanned first which will create the respective btrfs_fs_devices struct with differing fsid/metadata_uuid. In this case when a device with FSID_CHANGING_V2 flag set is scanned it will call the newly added find_fsid_inprogress function which will return the correct fs_devices. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:27 -06:00
}
/* Handle non-split brain cases */
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
if (metadata_fsid) {
if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
&& memcmp(metadata_fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) == 0)
return fs_devices;
} else {
if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
return fs_devices;
}
}
return NULL;
}
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
int flush, struct block_device **bdev,
struct buffer_head **bh)
{
int ret;
*bdev = blkdev_get_by_path(device_path, flags, holder);
if (IS_ERR(*bdev)) {
ret = PTR_ERR(*bdev);
goto error;
}
if (flush)
filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
blkdev_put(*bdev, flags);
goto error;
}
invalidate_bdev(*bdev);
*bh = btrfs_read_dev_super(*bdev);
if (IS_ERR(*bh)) {
ret = PTR_ERR(*bh);
blkdev_put(*bdev, flags);
goto error;
}
return 0;
error:
*bdev = NULL;
*bh = NULL;
return ret;
}
static void requeue_list(struct btrfs_pending_bios *pending_bios,
struct bio *head, struct bio *tail)
{
struct bio *old_head;
old_head = pending_bios->head;
pending_bios->head = head;
if (pending_bios->tail)
tail->bi_next = old_head;
else
pending_bios->tail = tail;
}
/*
* we try to collect pending bios for a device so we don't get a large
* number of procs sending bios down to the same device. This greatly
* improves the schedulers ability to collect and merge the bios.
*
* But, it also turns into a long list of bios to process and that is sure
* to eventually make the worker thread block. The solution here is to
* make some progress and then put this work struct back at the end of
* the list if the block device is congested. This way, multiple devices
* can make progress from a single worker thread.
*/
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct bio *pending;
struct backing_dev_info *bdi;
struct btrfs_pending_bios *pending_bios;
struct bio *tail;
struct bio *cur;
int again = 0;
unsigned long num_run;
unsigned long batch_run = 0;
unsigned long last_waited = 0;
int force_reg = 0;
int sync_pending = 0;
struct blk_plug plug;
/*
* this function runs all the bios we've collected for
* a particular device. We don't want to wander off to
* another device without first sending all of these down.
* So, setup a plug here and finish it off before we return
*/
blk_start_plug(&plug);
bdi = device->bdev->bd_bdi;
loop:
spin_lock(&device->io_lock);
loop_lock:
num_run = 0;
/* take all the bios off the list at once and process them
* later on (without the lock held). But, remember the
* tail and other pointers so the bios can be properly reinserted
* into the list if we hit congestion
*/
if (!force_reg && device->pending_sync_bios.head) {
pending_bios = &device->pending_sync_bios;
force_reg = 1;
} else {
pending_bios = &device->pending_bios;
force_reg = 0;
}
pending = pending_bios->head;
tail = pending_bios->tail;
WARN_ON(pending && !tail);
/*
* if pending was null this time around, no bios need processing
* at all and we can stop. Otherwise it'll loop back up again
* and do an additional check so no bios are missed.
*
* device->running_pending is used to synchronize with the
* schedule_bio code.
*/
if (device->pending_sync_bios.head == NULL &&
device->pending_bios.head == NULL) {
again = 0;
device->running_pending = 0;
} else {
again = 1;
device->running_pending = 1;
}
pending_bios->head = NULL;
pending_bios->tail = NULL;
spin_unlock(&device->io_lock);
while (pending) {
rmb();
/* we want to work on both lists, but do more bios on the
* sync list than the regular list
*/
if ((num_run > 32 &&
pending_bios != &device->pending_sync_bios &&
device->pending_sync_bios.head) ||
(num_run > 64 && pending_bios == &device->pending_sync_bios &&
device->pending_bios.head)) {
spin_lock(&device->io_lock);
requeue_list(pending_bios, pending, tail);
goto loop_lock;
}
cur = pending;
pending = pending->bi_next;
cur->bi_next = NULL;
BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
/*
* if we're doing the sync list, record that our
* plug has some sync requests on it
*
* If we're doing the regular list and there are
* sync requests sitting around, unplug before
* we add more
*/
if (pending_bios == &device->pending_sync_bios) {
sync_pending = 1;
} else if (sync_pending) {
blk_finish_plug(&plug);
blk_start_plug(&plug);
sync_pending = 0;
}
btrfsic_submit_bio(cur);
num_run++;
batch_run++;
cond_resched();
/*
* we made progress, there is more work to do and the bdi
* is now congested. Back off and let other work structs
* run instead
*/
if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
fs_info->fs_devices->open_devices > 1) {
struct io_context *ioc;
ioc = current->io_context;
/*
* the main goal here is that we don't want to
* block if we're going to be able to submit
* more requests without blocking.
*
* This code does two great things, it pokes into
* the elevator code from a filesystem _and_
* it makes assumptions about how batching works.
*/
if (ioc && ioc->nr_batch_requests > 0 &&
time_before(jiffies, ioc->last_waited + HZ/50UL) &&
(last_waited == 0 ||
ioc->last_waited == last_waited)) {
/*
* we want to go through our batch of
* requests and stop. So, we copy out
* the ioc->last_waited time and test
* against it before looping
*/
last_waited = ioc->last_waited;
cond_resched();
continue;
}
spin_lock(&device->io_lock);
requeue_list(pending_bios, pending, tail);
device->running_pending = 1;
spin_unlock(&device->io_lock);
btrfs_queue_work(fs_info->submit_workers,
&device->work);
goto done;
}
}
cond_resched();
if (again)
goto loop;
spin_lock(&device->io_lock);
if (device->pending_bios.head || device->pending_sync_bios.head)
goto loop_lock;
spin_unlock(&device->io_lock);
done:
blk_finish_plug(&plug);
}
static void pending_bios_fn(struct btrfs_work *work)
{
struct btrfs_device *device;
device = container_of(work, struct btrfs_device, work);
run_scheduled_bios(device);
}
static bool device_path_matched(const char *path, struct btrfs_device *device)
{
int found;
rcu_read_lock();
found = strcmp(rcu_str_deref(device->name), path);
rcu_read_unlock();
return found == 0;
}
/*
* Search and remove all stale (devices which are not mounted) devices.
* When both inputs are NULL, it will search and release all stale devices.
* path: Optional. When provided will it release all unmounted devices
* matching this path only.
* skip_dev: Optional. Will skip this device when searching for the stale
* devices.
* Return: 0 for success or if @path is NULL.
* -EBUSY if @path is a mounted device.
* -ENOENT if @path does not match any device in the list.
*/
static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
int ret = 0;
if (path)
ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp_device,
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
if (path && !device->name)
continue;
if (path && !device_path_matched(path, device))
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
if (path && ret != 0)
ret = -EBUSY;
break;
}
/* delete the stale device */
fs_devices->num_devices--;
list_del(&device->dev_list);
btrfs_free_device(device);
ret = 0;
if (fs_devices->num_devices == 0)
break;
}
mutex_unlock(&fs_devices->device_list_mutex);
if (fs_devices->num_devices == 0) {
btrfs_sysfs_remove_fsid(fs_devices);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
}
return ret;
}
btrfs: open device without device_list_mutex commit 18c850fdc5a801bad4977b0f1723761d42267e45 upstream. There's long existed a lockdep splat because we open our bdev's under the ->device_list_mutex at mount time, which acquires the bd_mutex. Usually this goes unnoticed, but if you do loopback devices at all suddenly the bd_mutex comes with a whole host of other dependencies, which results in the splat when you mount a btrfs file system. ====================================================== WARNING: possible circular locking dependency detected 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Not tainted ------------------------------------------------------ systemd-journal/509 is trying to acquire lock: ffff970831f84db0 (&fs_info->reloc_mutex){+.+.}-{3:3}, at: btrfs_record_root_in_trans+0x44/0x70 [btrfs] but task is already holding lock: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #6 (sb_pagefaults){.+.+}-{0:0}: __sb_start_write+0x13e/0x220 btrfs_page_mkwrite+0x59/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 -> #5 (&mm->mmap_lock#2){++++}-{3:3}: __might_fault+0x60/0x80 _copy_from_user+0x20/0xb0 get_sg_io_hdr+0x9a/0xb0 scsi_cmd_ioctl+0x1ea/0x2f0 cdrom_ioctl+0x3c/0x12b4 sr_block_ioctl+0xa4/0xd0 block_ioctl+0x3f/0x50 ksys_ioctl+0x82/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #4 (&cd->lock){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 sr_block_open+0xa2/0x180 __blkdev_get+0xdd/0x550 blkdev_get+0x38/0x150 do_dentry_open+0x16b/0x3e0 path_openat+0x3c9/0xa00 do_filp_open+0x75/0x100 do_sys_openat2+0x8a/0x140 __x64_sys_openat+0x46/0x70 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #3 (&bdev->bd_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 __blkdev_get+0x6a/0x550 blkdev_get+0x85/0x150 blkdev_get_by_path+0x2c/0x70 btrfs_get_bdev_and_sb+0x1b/0xb0 [btrfs] open_fs_devices+0x88/0x240 [btrfs] btrfs_open_devices+0x92/0xa0 [btrfs] btrfs_mount_root+0x250/0x490 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x119/0x380 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x8c6/0xca0 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_run_dev_stats+0x36/0x420 [btrfs] commit_cowonly_roots+0x91/0x2d0 [btrfs] btrfs_commit_transaction+0x4e6/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->tree_log_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_commit_transaction+0x48e/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_info->reloc_mutex){+.+.}-{3:3}: __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 __mutex_lock+0x7b/0x820 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 other info that might help us debug this: Chain exists of: &fs_info->reloc_mutex --> &mm->mmap_lock#2 --> sb_pagefaults Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_pagefaults); lock(&mm->mmap_lock#2); lock(sb_pagefaults); lock(&fs_info->reloc_mutex); *** DEADLOCK *** 3 locks held by systemd-journal/509: #0: ffff97083bdec8b8 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x12e/0x4b0 #1: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] #2: ffff97083144d6a8 (sb_internal){.+.+}-{0:0}, at: start_transaction+0x3f8/0x500 [btrfs] stack backtrace: CPU: 0 PID: 509 Comm: systemd-journal Not tainted 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack+0x92/0xc8 check_noncircular+0x134/0x150 __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] __mutex_lock+0x7b/0x820 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? kvm_sched_clock_read+0x14/0x30 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0xc/0xb0 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] ? sched_clock+0x5/0x10 do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x7fa3972fdbfe Code: Bad RIP value. Fix this by not holding the ->device_list_mutex at this point. The device_list_mutex exists to protect us from modifying the device list while the file system is running. However it can also be modified by doing a scan on a device. But this action is specifically protected by the uuid_mutex, which we are holding here. We cannot race with opening at this point because we have the ->s_mount lock held during the mount. Not having the ->device_list_mutex here is perfectly safe as we're not going to change the devices at this point. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> [ add some comments ] Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-07-17 13:12:27 -06:00
/*
* This is only used on mount, and we are protected from competing things
* messing with our fs_devices by the uuid_mutex, thus we do not need the
* fs_devices->device_list_mutex here.
*/
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
{
struct request_queue *q;
struct block_device *bdev;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
u64 devid;
int ret;
if (device->bdev)
return -EINVAL;
if (!device->name)
return -EINVAL;
ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
&bdev, &bh);
if (ret)
return ret;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
if (devid != device->devid)
goto error_brelse;
if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
goto error_brelse;
device->generation = btrfs_super_generation(disk_super);
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
if (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
pr_err(
"BTRFS: Invalid seeding and uuid-changed device detected\n");
goto error_brelse;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
fs_devices->seeding = 1;
} else {
if (bdev_read_only(bdev))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
else
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
q = bdev_get_queue(bdev);
if (!blk_queue_nonrot(q))
fs_devices->rotating = 1;
device->bdev = bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
device->mode = flags;
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
fs_devices->rw_devices++;
list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
}
brelse(bh);
return 0;
error_brelse:
brelse(bh);
blkdev_put(bdev, flags);
return -EINVAL;
}
btrfs: Handle one more split-brain scenario during fsid change This commit continues hardening the scanning code to handle cases where power loss could have caused disks in a multi-disk filesystem to be in inconsistent state. Namely handle the situation that can occur when some of the disks in multi-disk fs have completed their fsid change i.e they have METADATA_UUID incompat flag set, have cleared the CHANGING_FSID_V2 flag and their fsid/metadata_uuid are different. At the same time the other half of the disks will have their fsid/metadata_uuid unchanged and will only have CHANGING_FSID_V2 flag. This is handled by introducing code in the scan path which: a) Handles the case when a device with CHANGING_FSID_V2 flag is scanned and as a result btrfs_fs_devices is created with matching fsid/metdata_uuid. Subsequently, when a device with completed fsid change is scanned it will detect this via the new code in find_fsid i.e that such an fs_devices exist that fsid_change flag is set to true, it's metadata_uuid/fsid match and the metadata_uuid of the scanned device matches that of the fs_devices. In this case, it's important to note that the devices which has its fsid change completed will have a higher generation number than the device with FSID_CHANGING_V2 flag set, so its superblock block will be used during mount. To prevent an assertion triggering because the sb used for mounting will have differing fsid/metadata_uuid than the ones in the fs_devices struct also add code in device_list_add which overwrites the values in fs_devices. b) Alternatively we can end up with a device that completed its fsid change be scanned first which will create the respective btrfs_fs_devices struct with differing fsid/metadata_uuid. In this case when a device with FSID_CHANGING_V2 flag set is scanned it will call the newly added find_fsid_inprogress function which will return the correct fs_devices. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:27 -06:00
/*
* Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
* being created with a disk that has already completed its fsid change.
*/
static struct btrfs_fs_devices *find_fsid_inprogress(
struct btrfs_super_block *disk_super)
{
struct btrfs_fs_devices *fs_devices;
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->fsid,
BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
return fs_devices;
}
}
return NULL;
}
btrfs: Handle final split-brain possibility during fsid change This patch lands the last case which needs to be handled by the fsid change code. Namely, this is the case where a multidisk filesystem has already undergone at least one successful fsid change i.e all disks have the METADATA_UUID incompat bit and power failure occurs as another fsid change is in progress. When such an event occurs, disks could be split in 2 groups. One of the groups will have both METADATA_UUID and CHANGING_FSID_V2 flags set coupled with old fsid/metadata_uuid pairs. The other group of disks will have only METADATA_UUID bit set and their fsid will be different than the one in disks in the first group. Here we look at the following cases: a) A disk from the first group is scanned first, so fs_devices is created with stale fsid/metdata_uuid. Then when a disk from the second group is scanned it needs to first check whether there exists such an fs_devices that has fsid_change set to true (because it was created with a disk having the CHANGING_FSID_V2 flag), the metadata_uuid and fsid of the fs_devices will be different (since it was created by a disk which already has had at least 1 successful fsid change) and finally the metadata_uuid of the fs_devices will equal that of the currently scanned disk (because metadata_uuid never really changes). When the correct fs_devices is found the information from the scanned disk will replace the current one in fs_devices since the scanned disk will have higher generation number. b) A disk from the second group is scanned so fs_devices is created as usual with differing fsid/metdata_uid. Then when a disk from the first group is scanned the code detects that it has both CHANGING_FSID_V2 and METADATA_UUID flags set and will search for fs_devices that has differing metadata_uuid/fsid and whose metadata_uuid is the same as that of the scanned device. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:28 -06:00
static struct btrfs_fs_devices *find_fsid_changed(
struct btrfs_super_block *disk_super)
{
struct btrfs_fs_devices *fs_devices;
/*
* Handles the case where scanned device is part of an fs that had
* multiple successful changes of FSID but curently device didn't
* observe it. Meaning our fsid will be different than theirs. We need
* to handle two subcases :
* 1 - The fs still continues to have different METADATA/FSID uuids.
* 2 - The fs is switched back to its original FSID (METADATA/FSID
* are equal).
btrfs: Handle final split-brain possibility during fsid change This patch lands the last case which needs to be handled by the fsid change code. Namely, this is the case where a multidisk filesystem has already undergone at least one successful fsid change i.e all disks have the METADATA_UUID incompat bit and power failure occurs as another fsid change is in progress. When such an event occurs, disks could be split in 2 groups. One of the groups will have both METADATA_UUID and CHANGING_FSID_V2 flags set coupled with old fsid/metadata_uuid pairs. The other group of disks will have only METADATA_UUID bit set and their fsid will be different than the one in disks in the first group. Here we look at the following cases: a) A disk from the first group is scanned first, so fs_devices is created with stale fsid/metdata_uuid. Then when a disk from the second group is scanned it needs to first check whether there exists such an fs_devices that has fsid_change set to true (because it was created with a disk having the CHANGING_FSID_V2 flag), the metadata_uuid and fsid of the fs_devices will be different (since it was created by a disk which already has had at least 1 successful fsid change) and finally the metadata_uuid of the fs_devices will equal that of the currently scanned disk (because metadata_uuid never really changes). When the correct fs_devices is found the information from the scanned disk will replace the current one in fs_devices since the scanned disk will have higher generation number. b) A disk from the second group is scanned so fs_devices is created as usual with differing fsid/metdata_uid. Then when a disk from the first group is scanned the code detects that it has both CHANGING_FSID_V2 and METADATA_UUID flags set and will search for fs_devices that has differing metadata_uuid/fsid and whose metadata_uuid is the same as that of the scanned device. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:28 -06:00
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
/* Changed UUIDs */
btrfs: Handle final split-brain possibility during fsid change This patch lands the last case which needs to be handled by the fsid change code. Namely, this is the case where a multidisk filesystem has already undergone at least one successful fsid change i.e all disks have the METADATA_UUID incompat bit and power failure occurs as another fsid change is in progress. When such an event occurs, disks could be split in 2 groups. One of the groups will have both METADATA_UUID and CHANGING_FSID_V2 flags set coupled with old fsid/metadata_uuid pairs. The other group of disks will have only METADATA_UUID bit set and their fsid will be different than the one in disks in the first group. Here we look at the following cases: a) A disk from the first group is scanned first, so fs_devices is created with stale fsid/metdata_uuid. Then when a disk from the second group is scanned it needs to first check whether there exists such an fs_devices that has fsid_change set to true (because it was created with a disk having the CHANGING_FSID_V2 flag), the metadata_uuid and fsid of the fs_devices will be different (since it was created by a disk which already has had at least 1 successful fsid change) and finally the metadata_uuid of the fs_devices will equal that of the currently scanned disk (because metadata_uuid never really changes). When the correct fs_devices is found the information from the scanned disk will replace the current one in fs_devices since the scanned disk will have higher generation number. b) A disk from the second group is scanned so fs_devices is created as usual with differing fsid/metdata_uid. Then when a disk from the first group is scanned the code detects that it has both CHANGING_FSID_V2 and METADATA_UUID flags set and will search for fs_devices that has differing metadata_uuid/fsid and whose metadata_uuid is the same as that of the scanned device. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:28 -06:00
if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE) != 0)
return fs_devices;
/* Unchanged UUIDs */
if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, disk_super->metadata_uuid,
BTRFS_FSID_SIZE) == 0)
btrfs: Handle final split-brain possibility during fsid change This patch lands the last case which needs to be handled by the fsid change code. Namely, this is the case where a multidisk filesystem has already undergone at least one successful fsid change i.e all disks have the METADATA_UUID incompat bit and power failure occurs as another fsid change is in progress. When such an event occurs, disks could be split in 2 groups. One of the groups will have both METADATA_UUID and CHANGING_FSID_V2 flags set coupled with old fsid/metadata_uuid pairs. The other group of disks will have only METADATA_UUID bit set and their fsid will be different than the one in disks in the first group. Here we look at the following cases: a) A disk from the first group is scanned first, so fs_devices is created with stale fsid/metdata_uuid. Then when a disk from the second group is scanned it needs to first check whether there exists such an fs_devices that has fsid_change set to true (because it was created with a disk having the CHANGING_FSID_V2 flag), the metadata_uuid and fsid of the fs_devices will be different (since it was created by a disk which already has had at least 1 successful fsid change) and finally the metadata_uuid of the fs_devices will equal that of the currently scanned disk (because metadata_uuid never really changes). When the correct fs_devices is found the information from the scanned disk will replace the current one in fs_devices since the scanned disk will have higher generation number. b) A disk from the second group is scanned so fs_devices is created as usual with differing fsid/metdata_uid. Then when a disk from the first group is scanned the code detects that it has both CHANGING_FSID_V2 and METADATA_UUID flags set and will search for fs_devices that has differing metadata_uuid/fsid and whose metadata_uuid is the same as that of the scanned device. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:28 -06:00
return fs_devices;
}
return NULL;
}
static struct btrfs_fs_devices *find_fsid_reverted_metadata(
struct btrfs_super_block *disk_super)
{
struct btrfs_fs_devices *fs_devices;
/*
* Handle the case where the scanned device is part of an fs whose last
* metadata UUID change reverted it to the original FSID. At the same
* time * fs_devices was first created by another constitutent device
* which didn't fully observe the operation. This results in an
* btrfs_fs_devices created with metadata/fsid different AND
* btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
* fs_devices equal to the FSID of the disk.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->fsid,
BTRFS_FSID_SIZE) == 0 &&
fs_devices->fsid_change)
return fs_devices;
}
return NULL;
}
/*
* Add new device to list of registered devices
*
* Returns:
* device pointer which was just added or updated when successful
* error pointer when failed
*/
static noinline struct btrfs_device *device_list_add(const char *path,
struct btrfs_super_block *disk_super,
bool *new_device_added)
{
struct btrfs_device *device;
btrfs: Handle one more split-brain scenario during fsid change This commit continues hardening the scanning code to handle cases where power loss could have caused disks in a multi-disk filesystem to be in inconsistent state. Namely handle the situation that can occur when some of the disks in multi-disk fs have completed their fsid change i.e they have METADATA_UUID incompat flag set, have cleared the CHANGING_FSID_V2 flag and their fsid/metadata_uuid are different. At the same time the other half of the disks will have their fsid/metadata_uuid unchanged and will only have CHANGING_FSID_V2 flag. This is handled by introducing code in the scan path which: a) Handles the case when a device with CHANGING_FSID_V2 flag is scanned and as a result btrfs_fs_devices is created with matching fsid/metdata_uuid. Subsequently, when a device with completed fsid change is scanned it will detect this via the new code in find_fsid i.e that such an fs_devices exist that fsid_change flag is set to true, it's metadata_uuid/fsid match and the metadata_uuid of the scanned device matches that of the fs_devices. In this case, it's important to note that the devices which has its fsid change completed will have a higher generation number than the device with FSID_CHANGING_V2 flag set, so its superblock block will be used during mount. To prevent an assertion triggering because the sb used for mounting will have differing fsid/metadata_uuid than the ones in the fs_devices struct also add code in device_list_add which overwrites the values in fs_devices. b) Alternatively we can end up with a device that completed its fsid change be scanned first which will create the respective btrfs_fs_devices struct with differing fsid/metadata_uuid. In this case when a device with FSID_CHANGING_V2 flag set is scanned it will call the newly added find_fsid_inprogress function which will return the correct fs_devices. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:27 -06:00
struct btrfs_fs_devices *fs_devices = NULL;
struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
btrfs: Handle final split-brain possibility during fsid change This patch lands the last case which needs to be handled by the fsid change code. Namely, this is the case where a multidisk filesystem has already undergone at least one successful fsid change i.e all disks have the METADATA_UUID incompat bit and power failure occurs as another fsid change is in progress. When such an event occurs, disks could be split in 2 groups. One of the groups will have both METADATA_UUID and CHANGING_FSID_V2 flags set coupled with old fsid/metadata_uuid pairs. The other group of disks will have only METADATA_UUID bit set and their fsid will be different than the one in disks in the first group. Here we look at the following cases: a) A disk from the first group is scanned first, so fs_devices is created with stale fsid/metdata_uuid. Then when a disk from the second group is scanned it needs to first check whether there exists such an fs_devices that has fsid_change set to true (because it was created with a disk having the CHANGING_FSID_V2 flag), the metadata_uuid and fsid of the fs_devices will be different (since it was created by a disk which already has had at least 1 successful fsid change) and finally the metadata_uuid of the fs_devices will equal that of the currently scanned disk (because metadata_uuid never really changes). When the correct fs_devices is found the information from the scanned disk will replace the current one in fs_devices since the scanned disk will have higher generation number. b) A disk from the second group is scanned so fs_devices is created as usual with differing fsid/metdata_uid. Then when a disk from the first group is scanned the code detects that it has both CHANGING_FSID_V2 and METADATA_UUID flags set and will search for fs_devices that has differing metadata_uuid/fsid and whose metadata_uuid is the same as that of the scanned device. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:28 -06:00
if (fsid_change_in_progress) {
if (!has_metadata_uuid) {
/*
* When we have an image which has CHANGING_FSID_V2 set
* it might belong to either a filesystem which has
* disks with completed fsid change or it might belong
* to fs with no UUID changes in effect, handle both.
*/
fs_devices = find_fsid_inprogress(disk_super);
if (!fs_devices)
fs_devices = find_fsid(disk_super->fsid, NULL);
} else {
fs_devices = find_fsid_changed(disk_super);
}
btrfs: Handle one more split-brain scenario during fsid change This commit continues hardening the scanning code to handle cases where power loss could have caused disks in a multi-disk filesystem to be in inconsistent state. Namely handle the situation that can occur when some of the disks in multi-disk fs have completed their fsid change i.e they have METADATA_UUID incompat flag set, have cleared the CHANGING_FSID_V2 flag and their fsid/metadata_uuid are different. At the same time the other half of the disks will have their fsid/metadata_uuid unchanged and will only have CHANGING_FSID_V2 flag. This is handled by introducing code in the scan path which: a) Handles the case when a device with CHANGING_FSID_V2 flag is scanned and as a result btrfs_fs_devices is created with matching fsid/metdata_uuid. Subsequently, when a device with completed fsid change is scanned it will detect this via the new code in find_fsid i.e that such an fs_devices exist that fsid_change flag is set to true, it's metadata_uuid/fsid match and the metadata_uuid of the scanned device matches that of the fs_devices. In this case, it's important to note that the devices which has its fsid change completed will have a higher generation number than the device with FSID_CHANGING_V2 flag set, so its superblock block will be used during mount. To prevent an assertion triggering because the sb used for mounting will have differing fsid/metadata_uuid than the ones in the fs_devices struct also add code in device_list_add which overwrites the values in fs_devices. b) Alternatively we can end up with a device that completed its fsid change be scanned first which will create the respective btrfs_fs_devices struct with differing fsid/metadata_uuid. In this case when a device with FSID_CHANGING_V2 flag set is scanned it will call the newly added find_fsid_inprogress function which will return the correct fs_devices. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:27 -06:00
} else if (has_metadata_uuid) {
fs_devices = find_fsid(disk_super->fsid,
disk_super->metadata_uuid);
} else {
fs_devices = find_fsid_reverted_metadata(disk_super);
if (!fs_devices)
fs_devices = find_fsid(disk_super->fsid, NULL);
btrfs: Handle one more split-brain scenario during fsid change This commit continues hardening the scanning code to handle cases where power loss could have caused disks in a multi-disk filesystem to be in inconsistent state. Namely handle the situation that can occur when some of the disks in multi-disk fs have completed their fsid change i.e they have METADATA_UUID incompat flag set, have cleared the CHANGING_FSID_V2 flag and their fsid/metadata_uuid are different. At the same time the other half of the disks will have their fsid/metadata_uuid unchanged and will only have CHANGING_FSID_V2 flag. This is handled by introducing code in the scan path which: a) Handles the case when a device with CHANGING_FSID_V2 flag is scanned and as a result btrfs_fs_devices is created with matching fsid/metdata_uuid. Subsequently, when a device with completed fsid change is scanned it will detect this via the new code in find_fsid i.e that such an fs_devices exist that fsid_change flag is set to true, it's metadata_uuid/fsid match and the metadata_uuid of the scanned device matches that of the fs_devices. In this case, it's important to note that the devices which has its fsid change completed will have a higher generation number than the device with FSID_CHANGING_V2 flag set, so its superblock block will be used during mount. To prevent an assertion triggering because the sb used for mounting will have differing fsid/metadata_uuid than the ones in the fs_devices struct also add code in device_list_add which overwrites the values in fs_devices. b) Alternatively we can end up with a device that completed its fsid change be scanned first which will create the respective btrfs_fs_devices struct with differing fsid/metadata_uuid. In this case when a device with FSID_CHANGING_V2 flag set is scanned it will call the newly added find_fsid_inprogress function which will return the correct fs_devices. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:27 -06:00
}
if (!fs_devices) {
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
if (has_metadata_uuid)
fs_devices = alloc_fs_devices(disk_super->fsid,
disk_super->metadata_uuid);
else
fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
if (IS_ERR(fs_devices))
return ERR_CAST(fs_devices);
fs_devices->fsid_change = fsid_change_in_progress;
mutex_lock(&fs_devices->device_list_mutex);
list_add(&fs_devices->fs_list, &fs_uuids);
device = NULL;
} else {
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, devid,
disk_super->dev_item.uuid, NULL, false);
btrfs: Handle one more split-brain scenario during fsid change This commit continues hardening the scanning code to handle cases where power loss could have caused disks in a multi-disk filesystem to be in inconsistent state. Namely handle the situation that can occur when some of the disks in multi-disk fs have completed their fsid change i.e they have METADATA_UUID incompat flag set, have cleared the CHANGING_FSID_V2 flag and their fsid/metadata_uuid are different. At the same time the other half of the disks will have their fsid/metadata_uuid unchanged and will only have CHANGING_FSID_V2 flag. This is handled by introducing code in the scan path which: a) Handles the case when a device with CHANGING_FSID_V2 flag is scanned and as a result btrfs_fs_devices is created with matching fsid/metdata_uuid. Subsequently, when a device with completed fsid change is scanned it will detect this via the new code in find_fsid i.e that such an fs_devices exist that fsid_change flag is set to true, it's metadata_uuid/fsid match and the metadata_uuid of the scanned device matches that of the fs_devices. In this case, it's important to note that the devices which has its fsid change completed will have a higher generation number than the device with FSID_CHANGING_V2 flag set, so its superblock block will be used during mount. To prevent an assertion triggering because the sb used for mounting will have differing fsid/metadata_uuid than the ones in the fs_devices struct also add code in device_list_add which overwrites the values in fs_devices. b) Alternatively we can end up with a device that completed its fsid change be scanned first which will create the respective btrfs_fs_devices struct with differing fsid/metadata_uuid. In this case when a device with FSID_CHANGING_V2 flag set is scanned it will call the newly added find_fsid_inprogress function which will return the correct fs_devices. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:27 -06:00
/*
* If this disk has been pulled into an fs devices created by
* a device which had the CHANGING_FSID_V2 flag then replace the
* metadata_uuid/fsid values of the fs_devices.
*/
if (fs_devices->fsid_change &&
btrfs: Handle one more split-brain scenario during fsid change This commit continues hardening the scanning code to handle cases where power loss could have caused disks in a multi-disk filesystem to be in inconsistent state. Namely handle the situation that can occur when some of the disks in multi-disk fs have completed their fsid change i.e they have METADATA_UUID incompat flag set, have cleared the CHANGING_FSID_V2 flag and their fsid/metadata_uuid are different. At the same time the other half of the disks will have their fsid/metadata_uuid unchanged and will only have CHANGING_FSID_V2 flag. This is handled by introducing code in the scan path which: a) Handles the case when a device with CHANGING_FSID_V2 flag is scanned and as a result btrfs_fs_devices is created with matching fsid/metdata_uuid. Subsequently, when a device with completed fsid change is scanned it will detect this via the new code in find_fsid i.e that such an fs_devices exist that fsid_change flag is set to true, it's metadata_uuid/fsid match and the metadata_uuid of the scanned device matches that of the fs_devices. In this case, it's important to note that the devices which has its fsid change completed will have a higher generation number than the device with FSID_CHANGING_V2 flag set, so its superblock block will be used during mount. To prevent an assertion triggering because the sb used for mounting will have differing fsid/metadata_uuid than the ones in the fs_devices struct also add code in device_list_add which overwrites the values in fs_devices. b) Alternatively we can end up with a device that completed its fsid change be scanned first which will create the respective btrfs_fs_devices struct with differing fsid/metadata_uuid. In this case when a device with FSID_CHANGING_V2 flag set is scanned it will call the newly added find_fsid_inprogress function which will return the correct fs_devices. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:27 -06:00
found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
if (has_metadata_uuid)
memcpy(fs_devices->metadata_uuid,
disk_super->metadata_uuid,
BTRFS_FSID_SIZE);
else
memcpy(fs_devices->metadata_uuid,
disk_super->fsid, BTRFS_FSID_SIZE);
btrfs: Handle one more split-brain scenario during fsid change This commit continues hardening the scanning code to handle cases where power loss could have caused disks in a multi-disk filesystem to be in inconsistent state. Namely handle the situation that can occur when some of the disks in multi-disk fs have completed their fsid change i.e they have METADATA_UUID incompat flag set, have cleared the CHANGING_FSID_V2 flag and their fsid/metadata_uuid are different. At the same time the other half of the disks will have their fsid/metadata_uuid unchanged and will only have CHANGING_FSID_V2 flag. This is handled by introducing code in the scan path which: a) Handles the case when a device with CHANGING_FSID_V2 flag is scanned and as a result btrfs_fs_devices is created with matching fsid/metdata_uuid. Subsequently, when a device with completed fsid change is scanned it will detect this via the new code in find_fsid i.e that such an fs_devices exist that fsid_change flag is set to true, it's metadata_uuid/fsid match and the metadata_uuid of the scanned device matches that of the fs_devices. In this case, it's important to note that the devices which has its fsid change completed will have a higher generation number than the device with FSID_CHANGING_V2 flag set, so its superblock block will be used during mount. To prevent an assertion triggering because the sb used for mounting will have differing fsid/metadata_uuid than the ones in the fs_devices struct also add code in device_list_add which overwrites the values in fs_devices. b) Alternatively we can end up with a device that completed its fsid change be scanned first which will create the respective btrfs_fs_devices struct with differing fsid/metadata_uuid. In this case when a device with FSID_CHANGING_V2 flag set is scanned it will call the newly added find_fsid_inprogress function which will return the correct fs_devices. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:27 -06:00
fs_devices->fsid_change = false;
}
}
if (!device) {
if (fs_devices->opened) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
}
device = btrfs_alloc_device(NULL, &devid,
disk_super->dev_item.uuid);
if (IS_ERR(device)) {
mutex_unlock(&fs_devices->device_list_mutex);
/* we can safely leave the fs_devices entry around */
return device;
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
btrfs_free_device(device);
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
list_add_rcu(&device->dev_list, &fs_devices->devices);
Btrfs: fix race conditions in BTRFS_IOC_FS_INFO ioctl The handler for the ioctl BTRFS_IOC_FS_INFO was reading the number of devices before acquiring the device list mutex. This could lead to inconsistent results because the update of the device list and the number of devices counter (amongst other counters related to the device list) are updated in volumes.c while holding the device list mutex - except for 2 places, one was volumes.c:btrfs_prepare_sprout() and the other was volumes.c:device_list_add(). For example, if we have 2 devices, with IDs 1 and 2 and then add a new device, with ID 3, and while adding the device is in progress an BTRFS_IOC_FS_INFO ioctl arrives, it could return a number of devices of 2 and a max dev id of 3. This would be incorrect. Also, this ioctl handler was reading the fsid while it can be updated concurrently. This can happen when while a new device is being added and the current filesystem is in seeding mode. Example: $ mkfs.btrfs -f /dev/sdb1 $ mkfs.btrfs -f /dev/sdb2 $ btrfstune -S 1 /dev/sdb1 $ mount /dev/sdb1 /mnt/test $ btrfs device add /dev/sdb2 /mnt/test If during the last step a BTRFS_IOC_FS_INFO ioctl was requested, it could read an fsid that was never valid (some bits part of the old fsid and others part of the new fsid). Also, it could read a number of devices that doesn't match the number of devices in the list and the max device id, as explained before. Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-12 13:56:58 -06:00
fs_devices->num_devices++;
device->fs_devices = fs_devices;
*new_device_added = true;
if (disk_super->label[0])
pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
disk_super->label, devid, found_transid, path);
else
pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
disk_super->fsid, devid, found_transid, path);
} else if (!device->name || strcmp(device->name->str, path)) {
Btrfs: device_list_add() should not update list when mounted device_list_add() is called when user runs btrfs dev scan, which would add any btrfs device into the btrfs_fs_devices list. Now think of a mounted btrfs. And a new device which contains the a SB from the mounted btrfs devices. In this situation when user runs btrfs dev scan, the current code would just replace existing device with the new device. Which is to note that old device is neither closed nor gracefully removed from the btrfs. The FS is still operational with the old bdev however the device name is the btrfs_device is new which is provided by the btrfs dev scan. reproducer: devmgt[1] detach /dev/sdc replace the missing disk /dev/sdc btrfs rep start -f 1 /dev/sde /btrfs Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120 Total devices 2 FS bytes used 32.00KiB devid 1 size 958.94MiB used 115.88MiB path /dev/sde devid 2 size 958.94MiB used 103.88MiB path /dev/sdd make /dev/sdc to reappear devmgt attach host2 btrfs dev scan btrfs fi show -m Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120^M Total devices 2 FS bytes used 32.00KiB^M devid 1 size 958.94MiB used 115.88MiB path /dev/sdc <- Wrong. devid 2 size 958.94MiB used 103.88MiB path /dev/sdd since /dev/sdc has been replaced with /dev/sde, the /dev/sdc shouldn't be part of the btrfs-fsid when it reappears. If user want it to be part of it then sys admin should be using btrfs device add instead. [1] github.com/anajain/devmgt.git Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Reviewed-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-07-03 04:22:05 -06:00
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
* means this device was missing at time of FS mount.
* 2. If you are here and if the device->name is different
* from 'path' that means either
* a. The same device disappeared and reappeared with
* different name. or
* b. The missing-disk-which-was-replaced, has
* reappeared now.
*
* We must allow 1 and 2a above. But 2b would be a spurious
* and unintentional.
*
* Further in case of 1 and 2a above, the disk at 'path'
* would have missed some transaction when it was away and
* in case of 2a the stale bdev has to be updated as well.
* 2b must not be allowed at all time.
*/
/*
* For now, we do allow update to btrfs_fs_device through the
* btrfs dev scan cli after FS has been mounted. We're still
* tracking a problem where systems fail mount by subvolume id
* when we reject replacement on a mounted FS.
Btrfs: device_list_add() should not update list when mounted device_list_add() is called when user runs btrfs dev scan, which would add any btrfs device into the btrfs_fs_devices list. Now think of a mounted btrfs. And a new device which contains the a SB from the mounted btrfs devices. In this situation when user runs btrfs dev scan, the current code would just replace existing device with the new device. Which is to note that old device is neither closed nor gracefully removed from the btrfs. The FS is still operational with the old bdev however the device name is the btrfs_device is new which is provided by the btrfs dev scan. reproducer: devmgt[1] detach /dev/sdc replace the missing disk /dev/sdc btrfs rep start -f 1 /dev/sde /btrfs Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120 Total devices 2 FS bytes used 32.00KiB devid 1 size 958.94MiB used 115.88MiB path /dev/sde devid 2 size 958.94MiB used 103.88MiB path /dev/sdd make /dev/sdc to reappear devmgt attach host2 btrfs dev scan btrfs fi show -m Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120^M Total devices 2 FS bytes used 32.00KiB^M devid 1 size 958.94MiB used 115.88MiB path /dev/sdc <- Wrong. devid 2 size 958.94MiB used 103.88MiB path /dev/sdd since /dev/sdc has been replaced with /dev/sde, the /dev/sdc shouldn't be part of the btrfs-fsid when it reappears. If user want it to be part of it then sys admin should be using btrfs device add instead. [1] github.com/anajain/devmgt.git Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Reviewed-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-07-03 04:22:05 -06:00
*/
if (!fs_devices->opened && found_transid < device->generation) {
/*
* That is if the FS is _not_ mounted and if you
* are here, that means there is more than one
* disk with same uuid and devid.We keep the one
* with larger generation number or the last-in if
* generation are equal.
*/
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EEXIST);
}
Btrfs: device_list_add() should not update list when mounted device_list_add() is called when user runs btrfs dev scan, which would add any btrfs device into the btrfs_fs_devices list. Now think of a mounted btrfs. And a new device which contains the a SB from the mounted btrfs devices. In this situation when user runs btrfs dev scan, the current code would just replace existing device with the new device. Which is to note that old device is neither closed nor gracefully removed from the btrfs. The FS is still operational with the old bdev however the device name is the btrfs_device is new which is provided by the btrfs dev scan. reproducer: devmgt[1] detach /dev/sdc replace the missing disk /dev/sdc btrfs rep start -f 1 /dev/sde /btrfs Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120 Total devices 2 FS bytes used 32.00KiB devid 1 size 958.94MiB used 115.88MiB path /dev/sde devid 2 size 958.94MiB used 103.88MiB path /dev/sdd make /dev/sdc to reappear devmgt attach host2 btrfs dev scan btrfs fi show -m Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120^M Total devices 2 FS bytes used 32.00KiB^M devid 1 size 958.94MiB used 115.88MiB path /dev/sdc <- Wrong. devid 2 size 958.94MiB used 103.88MiB path /dev/sdd since /dev/sdc has been replaced with /dev/sde, the /dev/sdc shouldn't be part of the btrfs-fsid when it reappears. If user want it to be part of it then sys admin should be using btrfs device add instead. [1] github.com/anajain/devmgt.git Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Reviewed-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-07-03 04:22:05 -06:00
btrfs: harden agaist duplicate fsid on scanned devices It's not that impossible to imagine that a device OR a btrfs image is copied just by using the dd or the cp command. Which in case both the copies of the btrfs will have the same fsid. If on the system with automount enabled, the copied FS gets scanned. We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after it's been copied. For example: Initially.. /dev/mmcblk0p4 is mounted as / $ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part / |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi $ btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda $ dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. $ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:0 1 14.9G 0 disk |-sda4 8:4 1 4G 0 part / |-sda2 8:2 1 500M 0 part |-sda3 8:3 1 256M 0 part `-sda1 8:1 1 256M 0 part mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi $ btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-14 20:45:17 -06:00
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
*/
if (device->bdev) {
struct block_device *path_bdev;
path_bdev = lookup_bdev(path);
if (IS_ERR(path_bdev)) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_CAST(path_bdev);
}
if (device->bdev != path_bdev) {
bdput(path_bdev);
mutex_unlock(&fs_devices->device_list_mutex);
btrfs: don't access possibly stale fs_info data for printing duplicate device commit 0697d9a610998b8bdee6b2390836cb2391d8fd1a upstream. Syzbot reported a possible use-after-free when printing a duplicate device warning device_list_add(). At this point it can happen that a btrfs_device::fs_info is not correctly setup yet, so we're accessing stale data, when printing the warning message using the btrfs_printk() wrappers. ================================================================== BUG: KASAN: use-after-free in btrfs_printk+0x3eb/0x435 fs/btrfs/super.c:245 Read of size 8 at addr ffff8880878e06a8 by task syz-executor225/7068 CPU: 1 PID: 7068 Comm: syz-executor225 Not tainted 5.9.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1d6/0x29e lib/dump_stack.c:118 print_address_description+0x66/0x620 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report+0x132/0x1d0 mm/kasan/report.c:530 btrfs_printk+0x3eb/0x435 fs/btrfs/super.c:245 device_list_add+0x1a88/0x1d60 fs/btrfs/volumes.c:943 btrfs_scan_one_device+0x196/0x490 fs/btrfs/volumes.c:1359 btrfs_mount_root+0x48f/0xb60 fs/btrfs/super.c:1634 legacy_get_tree+0xea/0x180 fs/fs_context.c:592 vfs_get_tree+0x88/0x270 fs/super.c:1547 fc_mount fs/namespace.c:978 [inline] vfs_kern_mount+0xc9/0x160 fs/namespace.c:1008 btrfs_mount+0x33c/0xae0 fs/btrfs/super.c:1732 legacy_get_tree+0xea/0x180 fs/fs_context.c:592 vfs_get_tree+0x88/0x270 fs/super.c:1547 do_new_mount fs/namespace.c:2875 [inline] path_mount+0x179d/0x29e0 fs/namespace.c:3192 do_mount fs/namespace.c:3205 [inline] __do_sys_mount fs/namespace.c:3413 [inline] __se_sys_mount+0x126/0x180 fs/namespace.c:3390 do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x44840a RSP: 002b:00007ffedfffd608 EFLAGS: 00000293 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 00007ffedfffd670 RCX: 000000000044840a RDX: 0000000020000000 RSI: 0000000020000100 RDI: 00007ffedfffd630 RBP: 00007ffedfffd630 R08: 00007ffedfffd670 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000293 R12: 000000000000001a R13: 0000000000000004 R14: 0000000000000003 R15: 0000000000000003 Allocated by task 6945: kasan_save_stack mm/kasan/common.c:48 [inline] kasan_set_track mm/kasan/common.c:56 [inline] __kasan_kmalloc+0x100/0x130 mm/kasan/common.c:461 kmalloc_node include/linux/slab.h:577 [inline] kvmalloc_node+0x81/0x110 mm/util.c:574 kvmalloc include/linux/mm.h:757 [inline] kvzalloc include/linux/mm.h:765 [inline] btrfs_mount_root+0xd0/0xb60 fs/btrfs/super.c:1613 legacy_get_tree+0xea/0x180 fs/fs_context.c:592 vfs_get_tree+0x88/0x270 fs/super.c:1547 fc_mount fs/namespace.c:978 [inline] vfs_kern_mount+0xc9/0x160 fs/namespace.c:1008 btrfs_mount+0x33c/0xae0 fs/btrfs/super.c:1732 legacy_get_tree+0xea/0x180 fs/fs_context.c:592 vfs_get_tree+0x88/0x270 fs/super.c:1547 do_new_mount fs/namespace.c:2875 [inline] path_mount+0x179d/0x29e0 fs/namespace.c:3192 do_mount fs/namespace.c:3205 [inline] __do_sys_mount fs/namespace.c:3413 [inline] __se_sys_mount+0x126/0x180 fs/namespace.c:3390 do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 6945: kasan_save_stack mm/kasan/common.c:48 [inline] kasan_set_track+0x3d/0x70 mm/kasan/common.c:56 kasan_set_free_info+0x17/0x30 mm/kasan/generic.c:355 __kasan_slab_free+0xdd/0x110 mm/kasan/common.c:422 __cache_free mm/slab.c:3418 [inline] kfree+0x113/0x200 mm/slab.c:3756 deactivate_locked_super+0xa7/0xf0 fs/super.c:335 btrfs_mount_root+0x72b/0xb60 fs/btrfs/super.c:1678 legacy_get_tree+0xea/0x180 fs/fs_context.c:592 vfs_get_tree+0x88/0x270 fs/super.c:1547 fc_mount fs/namespace.c:978 [inline] vfs_kern_mount+0xc9/0x160 fs/namespace.c:1008 btrfs_mount+0x33c/0xae0 fs/btrfs/super.c:1732 legacy_get_tree+0xea/0x180 fs/fs_context.c:592 vfs_get_tree+0x88/0x270 fs/super.c:1547 do_new_mount fs/namespace.c:2875 [inline] path_mount+0x179d/0x29e0 fs/namespace.c:3192 do_mount fs/namespace.c:3205 [inline] __do_sys_mount fs/namespace.c:3413 [inline] __se_sys_mount+0x126/0x180 fs/namespace.c:3390 do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff8880878e0000 which belongs to the cache kmalloc-16k of size 16384 The buggy address is located 1704 bytes inside of 16384-byte region [ffff8880878e0000, ffff8880878e4000) The buggy address belongs to the page: page:0000000060704f30 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x878e0 head:0000000060704f30 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfffe0000010200(slab|head) raw: 00fffe0000010200 ffffea00028e9a08 ffffea00021e3608 ffff8880aa440b00 raw: 0000000000000000 ffff8880878e0000 0000000100000001 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880878e0580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880878e0600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8880878e0680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880878e0700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880878e0780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== The syzkaller reproducer for this use-after-free crafts a filesystem image and loop mounts it twice in a loop. The mount will fail as the crafted image has an invalid chunk tree. When this happens btrfs_mount_root() will call deactivate_locked_super(), which then cleans up fs_info and fs_info::sb. If a second thread now adds the same block-device to the filesystem, it will get detected as a duplicate device and device_list_add() will reject the duplicate and print a warning. But as the fs_info pointer passed in is non-NULL this will result in a use-after-free. Instead of printing possibly uninitialized or already freed memory in btrfs_printk(), explicitly pass in a NULL fs_info so the printing of the device name will be skipped altogether. There was a slightly different approach discussed in https://lore.kernel.org/linux-btrfs/20200114060920.4527-1-anand.jain@oracle.com/t/#u Link: https://lore.kernel.org/linux-btrfs/000000000000c9e14b05afcc41ba@google.com Reported-by: syzbot+582e66e5edf36a22c7b0@syzkaller.appspotmail.com CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-11-18 02:03:26 -07:00
/*
* device->fs_info may not be reliable here, so
* pass in a NULL instead. This avoids a
* possible use-after-free when the fs_info and
* fs_info->sb are already torn down.
*/
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
btrfs: harden agaist duplicate fsid on scanned devices It's not that impossible to imagine that a device OR a btrfs image is copied just by using the dd or the cp command. Which in case both the copies of the btrfs will have the same fsid. If on the system with automount enabled, the copied FS gets scanned. We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after it's been copied. For example: Initially.. /dev/mmcblk0p4 is mounted as / $ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part / |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi $ btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda $ dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. $ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:0 1 14.9G 0 disk |-sda4 8:4 1 4G 0 part / |-sda2 8:2 1 500M 0 part |-sda3 8:3 1 256M 0 part `-sda1 8:1 1 256M 0 part mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi $ btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-14 20:45:17 -06:00
return ERR_PTR(-EEXIST);
}
bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
task_pid_nr(current));
btrfs: harden agaist duplicate fsid on scanned devices It's not that impossible to imagine that a device OR a btrfs image is copied just by using the dd or the cp command. Which in case both the copies of the btrfs will have the same fsid. If on the system with automount enabled, the copied FS gets scanned. We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after it's been copied. For example: Initially.. /dev/mmcblk0p4 is mounted as / $ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part / |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi $ btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda $ dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. $ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:0 1 14.9G 0 disk |-sda4 8:4 1 4G 0 part / |-sda2 8:2 1 500M 0 part |-sda3 8:3 1 256M 0 part `-sda1 8:1 1 256M 0 part mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi $ btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-14 20:45:17 -06:00
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-ENOMEM);
}
rcu_string_free(device->name);
rcu_assign_pointer(device->name, name);
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
}
/*
* Unmount does not free the btrfs_device struct but would zero
* generation along with most of the other members. So just update
* it back. We need it to pick the disk with largest generation
* (as above).
*/
if (!fs_devices->opened) {
device->generation = found_transid;
fs_devices->latest_generation = max_t(u64, found_transid,
fs_devices->latest_generation);
}
fs_devices->total_devices = btrfs_super_num_devices(disk_super);
mutex_unlock(&fs_devices->device_list_mutex);
return device;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
struct btrfs_fs_devices *fs_devices;
struct btrfs_device *device;
struct btrfs_device *orig_dev;
int ret = 0;
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
device = btrfs_alloc_device(NULL, &orig_dev->devid,
orig_dev->uuid);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
/*
* This is ok to do without rcu read locked because we hold the
* uuid mutex so nothing we touch in here is going to disappear.
*/
if (orig_dev->name) {
name = rcu_string_strdup(orig_dev->name->str,
GFP_KERNEL);
if (!name) {
btrfs_free_device(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
}
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(ret);
}
/*
* After we have read the system tree and know devids belonging to
* this filesystem, remove the device which does not belong there.
*/
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
struct btrfs_device *latest_dev = NULL;
mutex_lock(&uuid_mutex);
again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&device->dev_state) &&
btrfs: include non-missing as a qualifier for the latest_bdev commit 998a0671961f66e9fad4990ed75f80ba3088c2f1 upstream. btrfs_free_extra_devids() updates fs_devices::latest_bdev to point to the bdev with greatest device::generation number. For a typical-missing device the generation number is zero so fs_devices::latest_bdev will never point to it. But if the missing device is due to alienation [1], then device::generation is not zero and if it is greater or equal to the rest of device generations in the list, then fs_devices::latest_bdev ends up pointing to the missing device and reports the error like [2]. [1] We maintain devices of a fsid (as in fs_device::fsid) in the fs_devices::devices list, a device is considered as an alien device if its fsid does not match with the fs_device::fsid Consider a working filesystem with raid1: $ mkfs.btrfs -f -d raid1 -m raid1 /dev/sda /dev/sdb $ mount /dev/sda /mnt-raid1 $ umount /mnt-raid1 While mnt-raid1 was unmounted the user force-adds one of its devices to another btrfs filesystem: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt-single $ btrfs dev add -f /dev/sda /mnt-single Now the original mnt-raid1 fails to mount in degraded mode, because fs_devices::latest_bdev is pointing to the alien device. $ mount -o degraded /dev/sdb /mnt-raid1 [2] mount: wrong fs type, bad option, bad superblock on /dev/sdb, missing codepage or helper program, or other error In some cases useful info is found in syslog - try dmesg | tail or so. kernel: BTRFS warning (device sdb): devid 1 uuid 072a0192-675b-4d5a-8640-a5cf2b2c704d is missing kernel: BTRFS error (device sdb): failed to read devices kernel: BTRFS error (device sdb): open_ctree failed Fix the root cause by checking if the device is not missing before it can be considered for the fs_devices::latest_bdev. CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-05-04 12:58:25 -06:00
!test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state) &&
(!latest_dev ||
device->generation > latest_dev->generation)) {
latest_dev = device;
}
continue;
}
btrfs: dev-replace: fail mount if we don't have replace item with target device commit cf89af146b7e62af55470cf5f3ec3c56ec144a5e upstream. If there is a device BTRFS_DEV_REPLACE_DEVID without the device replace item, then it means the filesystem is inconsistent state. This is either corruption or a crafted image. Fail the mount as this needs a closer look what is actually wrong. As of now if BTRFS_DEV_REPLACE_DEVID is present without the replace item, in __btrfs_free_extra_devids() we determine that there is an extra device, and free those extra devices but continue to mount the device. However, we were wrong in keeping tack of the rw_devices so the syzbot testcase failed: WARNING: CPU: 1 PID: 3612 at fs/btrfs/volumes.c:1166 close_fs_devices.part.0+0x607/0x800 fs/btrfs/volumes.c:1166 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 3612 Comm: syz-executor.2 Not tainted 5.9.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x198/0x1fd lib/dump_stack.c:118 panic+0x347/0x7c0 kernel/panic.c:231 __warn.cold+0x20/0x46 kernel/panic.c:600 report_bug+0x1bd/0x210 lib/bug.c:198 handle_bug+0x38/0x90 arch/x86/kernel/traps.c:234 exc_invalid_op+0x14/0x40 arch/x86/kernel/traps.c:254 asm_exc_invalid_op+0x12/0x20 arch/x86/include/asm/idtentry.h:536 RIP: 0010:close_fs_devices.part.0+0x607/0x800 fs/btrfs/volumes.c:1166 RSP: 0018:ffffc900091777e0 EFLAGS: 00010246 RAX: 0000000000040000 RBX: ffffffffffffffff RCX: ffffc9000c8b7000 RDX: 0000000000040000 RSI: ffffffff83097f47 RDI: 0000000000000007 RBP: dffffc0000000000 R08: 0000000000000001 R09: ffff8880988a187f R10: 0000000000000000 R11: 0000000000000001 R12: ffff88809593a130 R13: ffff88809593a1ec R14: ffff8880988a1908 R15: ffff88809593a050 close_fs_devices fs/btrfs/volumes.c:1193 [inline] btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179 open_ctree+0x4984/0x4a2d fs/btrfs/disk-io.c:3434 btrfs_fill_super fs/btrfs/super.c:1316 [inline] btrfs_mount_root.cold+0x14/0x165 fs/btrfs/super.c:1672 The fix here is, when we determine that there isn't a replace item then fail the mount if there is a replace target device (devid 0). CC: stable@vger.kernel.org # 4.19+ Reported-by: syzbot+4cfe71a4da060be47502@syzkaller.appspotmail.com Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-10-29 16:53:56 -06:00
/*
* We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
* in btrfs_init_dev_replace() so just continue.
*/
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
continue;
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
fs_devices->open_devices--;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
btrfs_free_device(device);
}
if (fs_devices->seed) {
fs_devices = fs_devices->seed;
goto again;
}
fs_devices->latest_bdev = latest_dev->bdev;
mutex_unlock(&uuid_mutex);
}
static void btrfs_close_bdev(struct btrfs_device *device)
{
if (!device->bdev)
return;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
sync_blockdev(device->bdev);
invalidate_bdev(device->bdev);
}
blkdev_put(device->bdev, device->mode);
}
static void btrfs_close_one_device(struct btrfs_device *device)
{
struct btrfs_fs_devices *fs_devices = device->fs_devices;
struct btrfs_device *new_device;
struct rcu_string *name;
if (device->bdev)
fs_devices->open_devices--;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
fs_devices->missing_devices--;
btrfs_close_bdev(device);
new_device = btrfs_alloc_device(NULL, &device->devid,
device->uuid);
BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
/* Safe because we are under uuid_mutex */
if (device->name) {
name = rcu_string_strdup(device->name->str, GFP_NOFS);
BUG_ON(!name); /* -ENOMEM */
rcu_assign_pointer(new_device->name, name);
}
list_replace_rcu(&device->dev_list, &new_device->dev_list);
new_device->fs_devices = device->fs_devices;
synchronize_rcu();
btrfs_free_device(device);
}
static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
if (--fs_devices->opened > 0)
return 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
btrfs_close_one_device(device);
}
mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
fs_devices->seeding = 0;
return 0;
}
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_fs_devices *seed_devices = NULL;
int ret;
mutex_lock(&uuid_mutex);
ret = close_fs_devices(fs_devices);
if (!fs_devices->opened) {
seed_devices = fs_devices->seed;
fs_devices->seed = NULL;
}
mutex_unlock(&uuid_mutex);
while (seed_devices) {
fs_devices = seed_devices;
seed_devices = fs_devices->seed;
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
return ret;
}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
int ret = 0;
flags |= FMODE_EXCL;
list_for_each_entry(device, &fs_devices->devices, dev_list) {
/* Just open everything we can; ignore failures here */
if (btrfs_open_one_device(fs_devices, device, flags, holder))
continue;
if (!latest_dev ||
device->generation > latest_dev->generation)
latest_dev = device;
}
if (fs_devices->open_devices == 0) {
ret = -EINVAL;
goto out;
}
fs_devices->opened = 1;
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
out:
return ret;
}
static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
struct btrfs_device *dev1, *dev2;
dev1 = list_entry(a, struct btrfs_device, dev_list);
dev2 = list_entry(b, struct btrfs_device, dev_list);
if (dev1->devid < dev2->devid)
return -1;
else if (dev1->devid > dev2->devid)
return 1;
return 0;
}
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
int ret;
lockdep_assert_held(&uuid_mutex);
btrfs: open device without device_list_mutex commit 18c850fdc5a801bad4977b0f1723761d42267e45 upstream. There's long existed a lockdep splat because we open our bdev's under the ->device_list_mutex at mount time, which acquires the bd_mutex. Usually this goes unnoticed, but if you do loopback devices at all suddenly the bd_mutex comes with a whole host of other dependencies, which results in the splat when you mount a btrfs file system. ====================================================== WARNING: possible circular locking dependency detected 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Not tainted ------------------------------------------------------ systemd-journal/509 is trying to acquire lock: ffff970831f84db0 (&fs_info->reloc_mutex){+.+.}-{3:3}, at: btrfs_record_root_in_trans+0x44/0x70 [btrfs] but task is already holding lock: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #6 (sb_pagefaults){.+.+}-{0:0}: __sb_start_write+0x13e/0x220 btrfs_page_mkwrite+0x59/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 -> #5 (&mm->mmap_lock#2){++++}-{3:3}: __might_fault+0x60/0x80 _copy_from_user+0x20/0xb0 get_sg_io_hdr+0x9a/0xb0 scsi_cmd_ioctl+0x1ea/0x2f0 cdrom_ioctl+0x3c/0x12b4 sr_block_ioctl+0xa4/0xd0 block_ioctl+0x3f/0x50 ksys_ioctl+0x82/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #4 (&cd->lock){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 sr_block_open+0xa2/0x180 __blkdev_get+0xdd/0x550 blkdev_get+0x38/0x150 do_dentry_open+0x16b/0x3e0 path_openat+0x3c9/0xa00 do_filp_open+0x75/0x100 do_sys_openat2+0x8a/0x140 __x64_sys_openat+0x46/0x70 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #3 (&bdev->bd_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 __blkdev_get+0x6a/0x550 blkdev_get+0x85/0x150 blkdev_get_by_path+0x2c/0x70 btrfs_get_bdev_and_sb+0x1b/0xb0 [btrfs] open_fs_devices+0x88/0x240 [btrfs] btrfs_open_devices+0x92/0xa0 [btrfs] btrfs_mount_root+0x250/0x490 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x119/0x380 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x8c6/0xca0 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_run_dev_stats+0x36/0x420 [btrfs] commit_cowonly_roots+0x91/0x2d0 [btrfs] btrfs_commit_transaction+0x4e6/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->tree_log_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_commit_transaction+0x48e/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_info->reloc_mutex){+.+.}-{3:3}: __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 __mutex_lock+0x7b/0x820 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 other info that might help us debug this: Chain exists of: &fs_info->reloc_mutex --> &mm->mmap_lock#2 --> sb_pagefaults Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_pagefaults); lock(&mm->mmap_lock#2); lock(sb_pagefaults); lock(&fs_info->reloc_mutex); *** DEADLOCK *** 3 locks held by systemd-journal/509: #0: ffff97083bdec8b8 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x12e/0x4b0 #1: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] #2: ffff97083144d6a8 (sb_internal){.+.+}-{0:0}, at: start_transaction+0x3f8/0x500 [btrfs] stack backtrace: CPU: 0 PID: 509 Comm: systemd-journal Not tainted 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack+0x92/0xc8 check_noncircular+0x134/0x150 __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] __mutex_lock+0x7b/0x820 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? kvm_sched_clock_read+0x14/0x30 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0xc/0xb0 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] ? sched_clock+0x5/0x10 do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x7fa3972fdbfe Code: Bad RIP value. Fix this by not holding the ->device_list_mutex at this point. The device_list_mutex exists to protect us from modifying the device list while the file system is running. However it can also be modified by doing a scan on a device. But this action is specifically protected by the uuid_mutex, which we are holding here. We cannot race with opening at this point because we have the ->s_mount lock held during the mount. Not having the ->device_list_mutex here is perfectly safe as we're not going to change the devices at this point. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> [ add some comments ] Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-07-17 13:12:27 -06:00
/*
* The device_list_mutex cannot be taken here in case opening the
* underlying device takes further locks like bd_mutex.
*
* We also don't need the lock here as this is called during mount and
* exclusion is provided by uuid_mutex
*/
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
} else {
list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = open_fs_devices(fs_devices, flags, holder);
}
return ret;
}
static void btrfs_release_disk_super(struct page *page)
{
kunmap(page);
put_page(page);
}
static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
struct page **page,
struct btrfs_super_block **disk_super)
{
void *p;
pgoff_t index;
/* make sure our super fits in the device */
if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
return 1;
/* make sure our super fits in the page */
if (sizeof(**disk_super) > PAGE_SIZE)
return 1;
/* make sure our super doesn't straddle pages on disk */
index = bytenr >> PAGE_SHIFT;
if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
return 1;
/* pull in the page with our super */
*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
index, GFP_KERNEL);
if (IS_ERR_OR_NULL(*page))
return 1;
p = kmap(*page);
/* align our pointer to the offset of the super block */
*disk_super = p + offset_in_page(bytenr);
if (btrfs_super_bytenr(*disk_super) != bytenr ||
btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
btrfs_release_disk_super(*page);
return 1;
}
if ((*disk_super)->label[0] &&
(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
return 0;
}
int btrfs_forget_devices(const char *path)
{
int ret;
mutex_lock(&uuid_mutex);
ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
mutex_unlock(&uuid_mutex);
return ret;
}
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
* is read via pagecache
*/
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
void *holder)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct block_device *bdev;
struct page *page;
u64 bytenr;
lockdep_assert_held(&uuid_mutex);
/*
* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
device = ERR_PTR(-EINVAL);
goto error_bdev_put;
}
device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device)) {
if (new_device_added)
btrfs_free_stale_devices(path, device);
}
btrfs_release_disk_super(page);
error_bdev_put:
blkdev_put(bdev, flags);
return device;
}
/*
* Try to find a chunk that intersects [start, start + len] range and when one
* such is found, record the end of it in *start
*/
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
u64 len)
{
u64 physical_start, physical_end;
lockdep_assert_held(&device->fs_info->chunk_mutex);
if (!find_first_extent_bit(&device->alloc_state, *start,
&physical_start, &physical_end,
CHUNK_ALLOCATED, NULL)) {
Btrfs: fix chunk allocation regression leading to transaction abort With commit 1b9845081633 ("Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole") introduced in the kernel 4.1 merge window, we end up using part of a device hole for which there are already pending chunks or pinned chunks. Before that commit we didn't use the hole and would just move on to the next hole in the device. However when we adjust the start offset for the chunk allocation and we have pinned chunks, we set it blindly to the end offset of the pinned chunk we are currently processing, which is dangerous because we can have a pending chunk that has a start offset that matches the end offset of our pinned chunk - leading us to a case where we end up getting two pending chunks that start at the same physical device offset, which makes us later abort the current transaction with -EEXIST when finishing the chunk allocation at btrfs_create_pending_block_groups(): [194737.659017] ------------[ cut here ]------------ [194737.660192] WARNING: CPU: 15 PID: 31111 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]() [194737.662209] BTRFS: Transaction aborted (error -17) [194737.663175] Modules linked in: btrfs dm_snapshot dm_bufio dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse [194737.674015] CPU: 15 PID: 31111 Comm: xfs_io Tainted: G W 4.0.0-rc5-btrfs-next-9+ #2 [194737.675986] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [194737.682999] 0000000000000009 ffff8800564c7a98 ffffffff8142fa46 ffffffff8108b6a2 [194737.684540] ffff8800564c7ae8 ffff8800564c7ad8 ffffffff81045ea5 ffff8800564c7b78 [194737.686017] ffffffffa0383aa7 00000000ffffffef ffff88000c7ba000 ffff8801a1f66f40 [194737.687509] Call Trace: [194737.688068] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b [194737.689027] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad [194737.690095] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb [194737.691198] [<ffffffffa0383aa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs] [194737.693789] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48 [194737.695065] [<ffffffffa0383aa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs] [194737.696806] [<ffffffffa039a3bd>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs] [194737.698683] [<ffffffffa03aa433>] __btrfs_end_transaction+0x84/0x366 [btrfs] [194737.700329] [<ffffffffa03aa725>] btrfs_end_transaction+0x10/0x12 [btrfs] [194737.701924] [<ffffffffa0394b51>] btrfs_check_data_free_space+0x11f/0x27c [btrfs] [194737.703675] [<ffffffffa03b8ba4>] __btrfs_buffered_write+0x16a/0x4c8 [btrfs] [194737.705417] [<ffffffffa03bb502>] ? btrfs_file_write_iter+0x19a/0x431 [btrfs] [194737.707058] [<ffffffffa03bb511>] ? btrfs_file_write_iter+0x1a9/0x431 [btrfs] [194737.708560] [<ffffffffa03bb68d>] btrfs_file_write_iter+0x325/0x431 [btrfs] [194737.710673] [<ffffffff81067d85>] ? get_parent_ip+0xe/0x3e [194737.712076] [<ffffffff811534c3>] new_sync_write+0x7c/0xa0 [194737.713293] [<ffffffff81153b58>] vfs_write+0xb2/0x117 [194737.714443] [<ffffffff81154424>] SyS_pwrite64+0x64/0x82 [194737.715646] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17 [194737.717175] ---[ end trace f2d5dc04e56d7e48 ]--- [194737.718170] BTRFS: error (device sdc) in btrfs_create_pending_block_groups:9524: errno=-17 Object already exists The -EEXIST failure comes from btrfs_finish_chunk_alloc(), called by btrfs_create_pending_block_groups(), when it attempts to insert a duplicated device extent item via btrfs_alloc_dev_extent(). This issue was reproducible with fstests generic/038 running in a loop for several hours (it's very hard to hit) and using MOUNT_OPTIONS="-o discard". Applying Jeff's recent patch titled "btrfs: add missing discards when unpinning extents with -o discard" makes the issue much easier to reproduce (usually within 4 to 5 hours), since it pins chunks for longer periods of time when an unused block group is deleted by the cleaner kthread. Fix this by making sure that we never adjust the start offset to a lower value than it currently has. Fixes: 1b9845081633 ("Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole" Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-05-14 03:46:03 -06:00
if (in_range(physical_start, *start, len) ||
in_range(*start, physical_start,
physical_end - physical_start)) {
*start = physical_end + 1;
return true;
}
}
return false;
}
/*
2015-06-15 07:41:17 -06:00
* find_free_dev_extent_start - find free space in the specified device
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @search_start: the position from which to begin the search
* @start: store the start of the free space.
* @len: the size of the free space. that we find, or the size
* of the max free space if we don't find suitable free space
*
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
*
* @start is used to store the start of the free space if we find. But if we
* don't find suitable free space, it will be used to store the start position
* of the max free space.
*
* @len is used to store the size of the free space that we find.
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*
* NOTE: This function will search *commit* root of device tree, and does extra
* check to ensure dev extents are not double allocated.
* This makes the function safe to allocate dev extents but may not report
* correct usable device space, as device extent freed in current transaction
* is not reported as avaiable.
*/
static int find_free_dev_extent_start(struct btrfs_device *device,
u64 num_bytes, u64 search_start, u64 *start,
u64 *len)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 hole_size;
u64 max_hole_start;
u64 max_hole_size;
u64 extent_end;
u64 search_end = device->total_bytes;
int ret;
int slot;
struct extent_buffer *l;
Btrfs: fix fitrim discarding device area reserved for boot loader's use As of the 4.3 kernel release, the fitrim ioctl can now discard any region of a disk that is not allocated to any chunk/block group, including the first megabyte which is used for our primary superblock and by the boot loader (grub for example). Fix this by not allowing to trim/discard any region in the device starting with an offset not greater than min(alloc_start_mount_option, 1Mb), just as it was not possible before 4.3. A reproducer test case for xfstests follows. seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { cd / rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 # Write to the [0, 64Kb[ and [68Kb, 1Mb[ ranges of the device. These ranges are # reserved for a boot loader to use (GRUB for example) and btrfs should never # use them - neither for allocating metadata/data nor should trim/discard them. # The range [64Kb, 68Kb[ is used for the primary superblock of the filesystem. $XFS_IO_PROG -c "pwrite -S 0xfd 0 64K" $SCRATCH_DEV | _filter_xfs_io $XFS_IO_PROG -c "pwrite -S 0xfd 68K 956K" $SCRATCH_DEV | _filter_xfs_io # Now mount the filesystem and perform a fitrim against it. _scratch_mount _require_batched_discard $SCRATCH_MNT $FSTRIM_PROG $SCRATCH_MNT # Now unmount the filesystem and verify the content of the ranges was not # modified (no trim/discard happened on them). _scratch_unmount echo "Content of the ranges [0, 64Kb] and [68Kb, 1Mb[ after fitrim:" od -t x1 -N $((64 * 1024)) $SCRATCH_DEV od -t x1 -j $((68 * 1024)) -N $((956 * 1024)) $SCRATCH_DEV status=0 exit Reported-by: Vincent Petry <PVince81@yahoo.fr> Reported-by: Andrei Borzenkov <arvidjaar@gmail.com> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=109341 Fixes: 499f377f49f0 (btrfs: iterate over unused chunk space in FITRIM) Cc: stable@vger.kernel.org # 4.3+ Signed-off-by: Filipe Manana <fdmanana@suse.com>
2016-01-06 15:42:35 -07:00
/*
* We don't want to overwrite the superblock on the drive nor any area
* used by the boot loader (grub for example), so we make sure to start
* at an offset of at least 1MB.
*/
search_start = max_t(u64, search_start, SZ_1M);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
max_hole_start = search_start;
max_hole_size = 0;
again:
if (search_start >= search_end ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -ENOSPC;
goto out;
}
path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
goto out;
}
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.objectid < device->devid)
goto next;
if (key.objectid > device->devid)
break;
if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
if (key.offset > search_start) {
hole_size = key.offset - search_start;
/*
* Have to check before we set max_hole_start, otherwise
* we could end up sending back this offset anyway.
*/
if (contains_pending_extent(device, &search_start,
hole_size)) {
if (key.offset >= search_start)
hole_size = key.offset - search_start;
else
hole_size = 0;
}
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
/*
* If this free space is greater than which we need,
* it must be the max free space that we have found
* until now, so max_hole_start must point to the start
* of this free space and the length of this free space
* is stored in max_hole_size. Thus, we return
* max_hole_start and max_hole_size and go back to the
* caller.
*/
if (hole_size >= num_bytes) {
ret = 0;
goto out;
}
}
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
extent_end = key.offset + btrfs_dev_extent_length(l,
dev_extent);
if (extent_end > search_start)
search_start = extent_end;
next:
path->slots[0]++;
cond_resched();
}
/*
* At this point, search_start should be the end of
* allocated dev extents, and when shrinking the device,
* search_end may be smaller than search_start.
*/
if (search_end > search_start) {
hole_size = search_end - search_start;
if (contains_pending_extent(device, &search_start, hole_size)) {
btrfs_release_path(path);
goto again;
}
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
}
/* See above. */
if (max_hole_size < num_bytes)
ret = -ENOSPC;
else
ret = 0;
out:
btrfs_free_path(path);
*start = max_hole_start;
if (len)
*len = max_hole_size;
return ret;
}
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
2015-06-15 07:41:17 -06:00
u64 *start, u64 *len)
{
/* FIXME use last free of some kind */
return find_free_dev_extent_start(device, num_bytes, 0, start, len);
2015-06-15 07:41:17 -06:00
}
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start, u64 *dev_extent_len)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf = NULL;
struct btrfs_dev_extent *extent = NULL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
BTRFS_DEV_EXTENT_KEY);
if (ret)
goto out;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
BUG_ON(found_key.offset > start || found_key.offset +
btrfs_dev_extent_length(leaf, extent) < start);
key = found_key;
btrfs_release_path(path);
goto again;
} else if (ret == 0) {
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
goto out;
}
*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Failed to remove dev extent item");
btrfs: Fix out-of-space bug Btrfs will report NO_SPACE when we create and remove files for several times, and we can't write to filesystem until mount it again. Steps to reproduce: 1: Create a single-dev btrfs fs with default option 2: Write a file into it to take up most fs space 3: Delete above file 4: Wait about 100s to let chunk removed 5: goto 2 Script is like following: #!/bin/bash # Recommend 1.2G space, too large disk will make test slow DEV="/dev/sda16" MNT="/mnt/tmp" dev_size="$(lsblk -bn -o SIZE "$DEV")" || exit 2 file_size_m=$((dev_size * 75 / 100 / 1024 / 1024)) echo "Loop write ${file_size_m}M file on $((dev_size / 1024 / 1024))M dev" for ((i = 0; i < 10; i++)); do umount "$MNT" 2>/dev/null; done echo "mkfs $DEV" mkfs.btrfs -f "$DEV" >/dev/null || exit 2 echo "mount $DEV $MNT" mount "$DEV" "$MNT" || exit 2 for ((loop_i = 0; loop_i < 20; loop_i++)); do echo echo "loop $loop_i" echo "dd file..." cmd=(dd if=/dev/zero of="$MNT"/file0 bs=1M count="$file_size_m") "${cmd[@]}" 2>/dev/null || { # NO_SPACE error triggered echo "dd failed: ${cmd[*]}" exit 1 } echo "rm file..." rm -f "$MNT"/file0 || exit 2 for ((i = 0; i < 10; i++)); do df "$MNT" | tail -1 sleep 10 done done Reason: It is triggered by commit: 47ab2a6c689913db23ccae38349714edf8365e0a which is used to remove empty block groups automatically, but the reason is not in that patch. Code before works well because btrfs don't need to create and delete chunks so many times with high complexity. Above bug is caused by many reason, any of them can trigger it. Reason1: When we remove some continuous chunks but leave other chunks after, these disk space should be used by chunk-recreating, but in current code, only first create will successed. Fixed by Forrest Liu <forrestl@synology.com> in: Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole Reason2: contains_pending_extent() return wrong value in calculation. Fixed by Forrest Liu <forrestl@synology.com> in: Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole Reason3: btrfs_check_data_free_space() try to commit transaction and retry allocating chunk when the first allocating failed, but space_info->full is set in first allocating, and prevent second allocating in retry. Fixed in this patch by clear space_info->full in commit transaction. Tested for severial times by above script. Changelog v3->v4: use light weight int instead of atomic_t to record have_remove_bgs in transaction, suggested by: Josef Bacik <jbacik@fb.com> Changelog v2->v3: v2 fixed the bug by adding more commit-transaction, but we only need to reclaim space when we are really have no space for new chunk, noticed by: Filipe David Manana <fdmanana@gmail.com> Actually, our code already have this type of commit-and-retry, we only need to make it working with removed-bgs. v3 fixed the bug with above way. Changelog v1->v2: v1 will introduce a new bug when delete and create chunk in same disk space in same transaction, noticed by: Filipe David Manana <fdmanana@gmail.com> V2 fix this bug by commit transaction after remove block grops. Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com> Suggested-by: Filipe David Manana <fdmanana@gmail.com> Suggested-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-02-11 23:18:17 -07:00
} else {
set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
}
out:
btrfs_free_path(path);
return ret;
}
static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 chunk_offset, u64 start, u64 num_bytes)
{
int ret;
struct btrfs_path *path;
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*extent));
if (ret)
goto out;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
btrfs_set_dev_extent_chunk_tree(leaf, extent,
BTRFS_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_objectid(leaf, extent,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
}
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
n = rb_last(&em_tree->map.rb_root);
if (n) {
em = rb_entry(n, struct extent_map, rb_node);
ret = em->start + em->len;
}
read_unlock(&em_tree->lock);
return ret;
}
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
u64 *devid_ret)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
if (ret == 0) {
/* Corruption */
btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
ret = -EUCLEAN;
goto error;
}
ret = btrfs_previous_item(fs_info->chunk_root, path,
BTRFS_DEV_ITEMS_OBJECTID,
BTRFS_DEV_ITEM_KEY);
if (ret) {
*devid_ret = 1;
} else {
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
*devid_ret = found_key.offset + 1;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
/*
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
if (ret)
goto out;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
btrfs_set_device_id(leaf, dev_item, device->devid);
btrfs_set_device_generation(leaf, dev_item, 0);
btrfs_set_device_type(leaf, dev_item, device->type);
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
btrfs_set_device_total_bytes(leaf, dev_item,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
btrfs_set_device_group(leaf, dev_item, 0);
btrfs_set_device_seek_speed(leaf, dev_item, 0);
btrfs_set_device_bandwidth(leaf, dev_item, 0);
btrfs_set_device_start_offset(leaf, dev_item, 0);
ptr = btrfs_device_uuid(dev_item);
write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
ptr, BTRFS_FSID_SIZE);
btrfs_mark_buffer_dirty(leaf);
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
*/
static void update_dev_time(const char *path_name)
{
struct file *filp;
filp = filp_open(path_name, O_RDWR, 0);
if (IS_ERR(filp))
return;
file_update_time(filp);
filp_close(filp, NULL);
}
static int btrfs_rm_dev_item(struct btrfs_device *device)
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_trans_handle *trans;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
}
out:
btrfs_free_path(path);
if (!ret)
ret = btrfs_commit_transaction(trans);
return ret;
}
/*
* Verify that @num_devices satisfies the RAID profile constraints in the whole
* filesystem. It's up to the caller to adjust that number regarding eg. device
* replace.
*/
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
u64 num_devices)
{
u64 all_avail;
unsigned seq;
int i;
do {
seq = read_seqbegin(&fs_info->profiles_lock);
all_avail = fs_info->avail_data_alloc_bits |
fs_info->avail_system_alloc_bits |
fs_info->avail_metadata_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
if (!(all_avail & btrfs_raid_array[i].bg_flag))
continue;
if (num_devices < btrfs_raid_array[i].devs_min) {
int ret = btrfs_raid_array[i].mindev_error;
if (ret)
return ret;
}
}
return 0;
}
static struct btrfs_device * btrfs_find_next_active_device(
struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
struct btrfs_device *next_device;
list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
if (next_device != device &&
!test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
&& next_device->bdev)
return next_device;
}
return NULL;
}
/*
* Helper function to check if the given device is part of s_bdev / latest_bdev
* and replace it with the provided or the next active device, in the context
* where this function called, there should be always be another device (or
* this_dev) which is active.
*/
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_device *next_device;
if (this_dev)
next_device = this_dev;
else
next_device = btrfs_find_next_active_device(fs_info->fs_devices,
device);
ASSERT(next_device);
if (fs_info->sb->s_bdev &&
(fs_info->sb->s_bdev == device->bdev))
fs_info->sb->s_bdev = next_device->bdev;
if (fs_info->fs_devices->latest_bdev == device->bdev)
fs_info->fs_devices->latest_bdev = next_device->bdev;
}
/*
* Return btrfs_fs_devices::num_devices excluding the device that's being
* currently replaced.
*/
static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
u64 num_devices = fs_info->fs_devices->num_devices;
down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
ASSERT(num_devices > 1);
num_devices--;
}
up_read(&fs_info->dev_replace.rwsem);
return num_devices;
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 devid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 num_devices;
int ret = 0;
mutex_lock(&uuid_mutex);
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
if (IS_ERR(device)) {
if (PTR_ERR(device) == -ENOENT &&
strcmp(device_path, "missing") == 0)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = PTR_ERR(device);
goto out;
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
rcu_str_deref(device->name), device->devid);
ret = -ETXTBSY;
goto out;
}
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto out;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto out;
}
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
mutex_unlock(&fs_info->chunk_mutex);
}
mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
btrfs: fix readahead hang and use-after-free after removing a device commit 66d204a16c94f24ad08290a7663ab67e7fc04e82 upstream. Very sporadically I had test case btrfs/069 from fstests hanging (for years, it is not a recent regression), with the following traces in dmesg/syslog: [162301.160628] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg started [162301.181196] BTRFS info (device sdc): scrub: finished on devid 4 with status: 0 [162301.287162] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg finished [162513.513792] INFO: task btrfs-transacti:1356167 blocked for more than 120 seconds. [162513.514318] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.514522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.514747] task:btrfs-transacti state:D stack: 0 pid:1356167 ppid: 2 flags:0x00004000 [162513.514751] Call Trace: [162513.514761] __schedule+0x5ce/0xd00 [162513.514765] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.514771] schedule+0x46/0xf0 [162513.514844] wait_current_trans+0xde/0x140 [btrfs] [162513.514850] ? finish_wait+0x90/0x90 [162513.514864] start_transaction+0x37c/0x5f0 [btrfs] [162513.514879] transaction_kthread+0xa4/0x170 [btrfs] [162513.514891] ? btrfs_cleanup_transaction+0x660/0x660 [btrfs] [162513.514894] kthread+0x153/0x170 [162513.514897] ? kthread_stop+0x2c0/0x2c0 [162513.514902] ret_from_fork+0x22/0x30 [162513.514916] INFO: task fsstress:1356184 blocked for more than 120 seconds. [162513.515192] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.515431] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.515680] task:fsstress state:D stack: 0 pid:1356184 ppid:1356177 flags:0x00004000 [162513.515682] Call Trace: [162513.515688] __schedule+0x5ce/0xd00 [162513.515691] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.515697] schedule+0x46/0xf0 [162513.515712] wait_current_trans+0xde/0x140 [btrfs] [162513.515716] ? finish_wait+0x90/0x90 [162513.515729] start_transaction+0x37c/0x5f0 [btrfs] [162513.515743] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs] [162513.515753] btrfs_sync_fs+0x61/0x1c0 [btrfs] [162513.515758] ? __ia32_sys_fdatasync+0x20/0x20 [162513.515761] iterate_supers+0x87/0xf0 [162513.515765] ksys_sync+0x60/0xb0 [162513.515768] __do_sys_sync+0xa/0x10 [162513.515771] do_syscall_64+0x33/0x80 [162513.515774] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.515781] RIP: 0033:0x7f5238f50bd7 [162513.515782] Code: Bad RIP value. [162513.515784] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2 [162513.515786] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7 [162513.515788] RDX: 00000000ffffffff RSI: 000000000daf0e74 RDI: 000000000000003a [162513.515789] RBP: 0000000000000032 R08: 000000000000000a R09: 00007f5239019be0 [162513.515791] R10: fffffffffffff24f R11: 0000000000000206 R12: 000000000000003a [162513.515792] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340 [162513.515804] INFO: task fsstress:1356185 blocked for more than 120 seconds. [162513.516064] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.516329] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.516617] task:fsstress state:D stack: 0 pid:1356185 ppid:1356177 flags:0x00000000 [162513.516620] Call Trace: [162513.516625] __schedule+0x5ce/0xd00 [162513.516628] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.516634] schedule+0x46/0xf0 [162513.516647] wait_current_trans+0xde/0x140 [btrfs] [162513.516650] ? finish_wait+0x90/0x90 [162513.516662] start_transaction+0x4d7/0x5f0 [btrfs] [162513.516679] btrfs_setxattr_trans+0x3c/0x100 [btrfs] [162513.516686] __vfs_setxattr+0x66/0x80 [162513.516691] __vfs_setxattr_noperm+0x70/0x200 [162513.516697] vfs_setxattr+0x6b/0x120 [162513.516703] setxattr+0x125/0x240 [162513.516709] ? lock_acquire+0xb1/0x480 [162513.516712] ? mnt_want_write+0x20/0x50 [162513.516721] ? rcu_read_lock_any_held+0x8e/0xb0 [162513.516723] ? preempt_count_add+0x49/0xa0 [162513.516725] ? __sb_start_write+0x19b/0x290 [162513.516727] ? preempt_count_add+0x49/0xa0 [162513.516732] path_setxattr+0xba/0xd0 [162513.516739] __x64_sys_setxattr+0x27/0x30 [162513.516741] do_syscall_64+0x33/0x80 [162513.516743] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.516745] RIP: 0033:0x7f5238f56d5a [162513.516746] Code: Bad RIP value. [162513.516748] RSP: 002b:00007fff67b97868 EFLAGS: 00000202 ORIG_RAX: 00000000000000bc [162513.516750] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f5238f56d5a [162513.516751] RDX: 000055b1fbb0d5a0 RSI: 00007fff67b978a0 RDI: 000055b1fbb0d470 [162513.516753] RBP: 000055b1fbb0d5a0 R08: 0000000000000001 R09: 00007fff67b97700 [162513.516754] R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000004 [162513.516756] R13: 0000000000000024 R14: 0000000000000001 R15: 00007fff67b978a0 [162513.516767] INFO: task fsstress:1356196 blocked for more than 120 seconds. [162513.517064] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.517365] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.517763] task:fsstress state:D stack: 0 pid:1356196 ppid:1356177 flags:0x00004000 [162513.517780] Call Trace: [162513.517786] __schedule+0x5ce/0xd00 [162513.517789] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.517796] schedule+0x46/0xf0 [162513.517810] wait_current_trans+0xde/0x140 [btrfs] [162513.517814] ? finish_wait+0x90/0x90 [162513.517829] start_transaction+0x37c/0x5f0 [btrfs] [162513.517845] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs] [162513.517857] btrfs_sync_fs+0x61/0x1c0 [btrfs] [162513.517862] ? __ia32_sys_fdatasync+0x20/0x20 [162513.517865] iterate_supers+0x87/0xf0 [162513.517869] ksys_sync+0x60/0xb0 [162513.517872] __do_sys_sync+0xa/0x10 [162513.517875] do_syscall_64+0x33/0x80 [162513.517878] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.517881] RIP: 0033:0x7f5238f50bd7 [162513.517883] Code: Bad RIP value. [162513.517885] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2 [162513.517887] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7 [162513.517889] RDX: 0000000000000000 RSI: 000000007660add2 RDI: 0000000000000053 [162513.517891] RBP: 0000000000000032 R08: 0000000000000067 R09: 00007f5239019be0 [162513.517893] R10: fffffffffffff24f R11: 0000000000000206 R12: 0000000000000053 [162513.517895] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340 [162513.517908] INFO: task fsstress:1356197 blocked for more than 120 seconds. [162513.518298] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.518672] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.519157] task:fsstress state:D stack: 0 pid:1356197 ppid:1356177 flags:0x00000000 [162513.519160] Call Trace: [162513.519165] __schedule+0x5ce/0xd00 [162513.519168] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.519174] schedule+0x46/0xf0 [162513.519190] wait_current_trans+0xde/0x140 [btrfs] [162513.519193] ? finish_wait+0x90/0x90 [162513.519206] start_transaction+0x4d7/0x5f0 [btrfs] [162513.519222] btrfs_create+0x57/0x200 [btrfs] [162513.519230] lookup_open+0x522/0x650 [162513.519246] path_openat+0x2b8/0xa50 [162513.519270] do_filp_open+0x91/0x100 [162513.519275] ? find_held_lock+0x32/0x90 [162513.519280] ? lock_acquired+0x33b/0x470 [162513.519285] ? do_raw_spin_unlock+0x4b/0xc0 [162513.519287] ? _raw_spin_unlock+0x29/0x40 [162513.519295] do_sys_openat2+0x20d/0x2d0 [162513.519300] do_sys_open+0x44/0x80 [162513.519304] do_syscall_64+0x33/0x80 [162513.519307] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.519309] RIP: 0033:0x7f5238f4a903 [162513.519310] Code: Bad RIP value. [162513.519312] RSP: 002b:00007fff67b97758 EFLAGS: 00000246 ORIG_RAX: 0000000000000055 [162513.519314] RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f5238f4a903 [162513.519316] RDX: 0000000000000000 RSI: 00000000000001b6 RDI: 000055b1fbb0d470 [162513.519317] RBP: 00007fff67b978c0 R08: 0000000000000001 R09: 0000000000000002 [162513.519319] R10: 00007fff67b974f7 R11: 0000000000000246 R12: 0000000000000013 [162513.519320] R13: 00000000000001b6 R14: 00007fff67b97906 R15: 000055b1fad1c620 [162513.519332] INFO: task btrfs:1356211 blocked for more than 120 seconds. [162513.519727] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.520115] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.520508] task:btrfs state:D stack: 0 pid:1356211 ppid:1356178 flags:0x00004002 [162513.520511] Call Trace: [162513.520516] __schedule+0x5ce/0xd00 [162513.520519] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.520525] schedule+0x46/0xf0 [162513.520544] btrfs_scrub_pause+0x11f/0x180 [btrfs] [162513.520548] ? finish_wait+0x90/0x90 [162513.520562] btrfs_commit_transaction+0x45a/0xc30 [btrfs] [162513.520574] ? start_transaction+0xe0/0x5f0 [btrfs] [162513.520596] btrfs_dev_replace_finishing+0x6d8/0x711 [btrfs] [162513.520619] btrfs_dev_replace_by_ioctl.cold+0x1cc/0x1fd [btrfs] [162513.520639] btrfs_ioctl+0x2a25/0x36f0 [btrfs] [162513.520643] ? do_sigaction+0xf3/0x240 [162513.520645] ? find_held_lock+0x32/0x90 [162513.520648] ? do_sigaction+0xf3/0x240 [162513.520651] ? lock_acquired+0x33b/0x470 [162513.520655] ? _raw_spin_unlock_irq+0x24/0x50 [162513.520657] ? lockdep_hardirqs_on+0x7d/0x100 [162513.520660] ? _raw_spin_unlock_irq+0x35/0x50 [162513.520662] ? do_sigaction+0xf3/0x240 [162513.520671] ? __x64_sys_ioctl+0x83/0xb0 [162513.520672] __x64_sys_ioctl+0x83/0xb0 [162513.520677] do_syscall_64+0x33/0x80 [162513.520679] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.520681] RIP: 0033:0x7fc3cd307d87 [162513.520682] Code: Bad RIP value. [162513.520684] RSP: 002b:00007ffe30a56bb8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [162513.520686] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fc3cd307d87 [162513.520687] RDX: 00007ffe30a57a30 RSI: 00000000ca289435 RDI: 0000000000000003 [162513.520689] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [162513.520690] R10: 0000000000000008 R11: 0000000000000202 R12: 0000000000000003 [162513.520692] R13: 0000557323a212e0 R14: 00007ffe30a5a520 R15: 0000000000000001 [162513.520703] Showing all locks held in the system: [162513.520712] 1 lock held by khungtaskd/54: [162513.520713] #0: ffffffffb40a91a0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x15/0x197 [162513.520728] 1 lock held by in:imklog/596: [162513.520729] #0: ffff8f3f0d781400 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4d/0x60 [162513.520782] 1 lock held by btrfs-transacti/1356167: [162513.520784] #0: ffff8f3d810cc848 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0x4a/0x170 [btrfs] [162513.520798] 1 lock held by btrfs/1356190: [162513.520800] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0x60 [162513.520805] 1 lock held by fsstress/1356184: [162513.520806] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0 [162513.520811] 3 locks held by fsstress/1356185: [162513.520812] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50 [162513.520815] #1: ffff8f3d80a650b8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: vfs_setxattr+0x50/0x120 [162513.520820] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs] [162513.520833] 1 lock held by fsstress/1356196: [162513.520834] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0 [162513.520838] 3 locks held by fsstress/1356197: [162513.520839] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50 [162513.520843] #1: ffff8f3d506465e8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: path_openat+0x2a7/0xa50 [162513.520846] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs] [162513.520858] 2 locks held by btrfs/1356211: [162513.520859] #0: ffff8f3d810cde30 (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.}-{3:3}, at: btrfs_dev_replace_finishing+0x52/0x711 [btrfs] [162513.520877] #1: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs] This was weird because the stack traces show that a transaction commit, triggered by a device replace operation, is blocking trying to pause any running scrubs but there are no stack traces of blocked tasks doing a scrub. After poking around with drgn, I noticed there was a scrub task that was constantly running and blocking for shorts periods of time: >>> t = find_task(prog, 1356190) >>> prog.stack_trace(t) #0 __schedule+0x5ce/0xcfc #1 schedule+0x46/0xe4 #2 schedule_timeout+0x1df/0x475 #3 btrfs_reada_wait+0xda/0x132 #4 scrub_stripe+0x2a8/0x112f #5 scrub_chunk+0xcd/0x134 #6 scrub_enumerate_chunks+0x29e/0x5ee #7 btrfs_scrub_dev+0x2d5/0x91b #8 btrfs_ioctl+0x7f5/0x36e7 #9 __x64_sys_ioctl+0x83/0xb0 #10 do_syscall_64+0x33/0x77 #11 entry_SYSCALL_64+0x7c/0x156 Which corresponds to: int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; struct btrfs_fs_info *fs_info = rc->fs_info; while (atomic_read(&rc->elems)) { if (!atomic_read(&fs_info->reada_works_cnt)) reada_start_machine(fs_info); wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, (HZ + 9) / 10); } (...) So the counter "rc->elems" was set to 1 and never decreased to 0, causing the scrub task to loop forever in that function. Then I used the following script for drgn to check the readahead requests: $ cat dump_reada.py import sys import drgn from drgn import NULL, Object, cast, container_of, execscript, \ reinterpret, sizeof from drgn.helpers.linux import * mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1" mnt = None for mnt in for_each_mount(prog, dst = mnt_path): pass if mnt is None: sys.stderr.write(f'Error: mount point {mnt_path} not found\n') sys.exit(1) fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info) def dump_re(re): nzones = re.nzones.value_() print(f're at {hex(re.value_())}') print(f'\t logical {re.logical.value_()}') print(f'\t refcnt {re.refcnt.value_()}') print(f'\t nzones {nzones}') for i in range(nzones): dev = re.zones[i].device name = dev.name.str.string_() print(f'\t\t dev id {dev.devid.value_()} name {name}') print() for _, e in radix_tree_for_each(fs_info.reada_tree): re = cast('struct reada_extent *', e) dump_re(re) $ drgn dump_reada.py re at 0xffff8f3da9d25ad8 logical 38928384 refcnt 1 nzones 1 dev id 0 name b'/dev/sdd' $ So there was one readahead extent with a single zone corresponding to the source device of that last device replace operation logged in dmesg/syslog. Also the ID of that zone's device was 0 which is a special value set in the source device of a device replace operation when the operation finishes (constant BTRFS_DEV_REPLACE_DEVID set at btrfs_dev_replace_finishing()), confirming again that device /dev/sdd was the source of a device replace operation. Normally there should be as many zones in the readahead extent as there are devices, and I wasn't expecting the extent to be in a block group with a 'single' profile, so I went and confirmed with the following drgn script that there weren't any single profile block groups: $ cat dump_block_groups.py import sys import drgn from drgn import NULL, Object, cast, container_of, execscript, \ reinterpret, sizeof from drgn.helpers.linux import * mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1" mnt = None for mnt in for_each_mount(prog, dst = mnt_path): pass if mnt is None: sys.stderr.write(f'Error: mount point {mnt_path} not found\n') sys.exit(1) fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info) BTRFS_BLOCK_GROUP_DATA = (1 << 0) BTRFS_BLOCK_GROUP_SYSTEM = (1 << 1) BTRFS_BLOCK_GROUP_METADATA = (1 << 2) BTRFS_BLOCK_GROUP_RAID0 = (1 << 3) BTRFS_BLOCK_GROUP_RAID1 = (1 << 4) BTRFS_BLOCK_GROUP_DUP = (1 << 5) BTRFS_BLOCK_GROUP_RAID10 = (1 << 6) BTRFS_BLOCK_GROUP_RAID5 = (1 << 7) BTRFS_BLOCK_GROUP_RAID6 = (1 << 8) BTRFS_BLOCK_GROUP_RAID1C3 = (1 << 9) BTRFS_BLOCK_GROUP_RAID1C4 = (1 << 10) def bg_flags_string(bg): flags = bg.flags.value_() ret = '' if flags & BTRFS_BLOCK_GROUP_DATA: ret = 'data' if flags & BTRFS_BLOCK_GROUP_METADATA: if len(ret) > 0: ret += '|' ret += 'meta' if flags & BTRFS_BLOCK_GROUP_SYSTEM: if len(ret) > 0: ret += '|' ret += 'system' if flags & BTRFS_BLOCK_GROUP_RAID0: ret += ' raid0' elif flags & BTRFS_BLOCK_GROUP_RAID1: ret += ' raid1' elif flags & BTRFS_BLOCK_GROUP_DUP: ret += ' dup' elif flags & BTRFS_BLOCK_GROUP_RAID10: ret += ' raid10' elif flags & BTRFS_BLOCK_GROUP_RAID5: ret += ' raid5' elif flags & BTRFS_BLOCK_GROUP_RAID6: ret += ' raid6' elif flags & BTRFS_BLOCK_GROUP_RAID1C3: ret += ' raid1c3' elif flags & BTRFS_BLOCK_GROUP_RAID1C4: ret += ' raid1c4' else: ret += ' single' return ret def dump_bg(bg): print() print(f'block group at {hex(bg.value_())}') print(f'\t start {bg.start.value_()} length {bg.length.value_()}') print(f'\t flags {bg.flags.value_()} - {bg_flags_string(bg)}') bg_root = fs_info.block_group_cache_tree.address_of_() for bg in rbtree_inorder_for_each_entry('struct btrfs_block_group', bg_root, 'cache_node'): dump_bg(bg) $ drgn dump_block_groups.py block group at 0xffff8f3d673b0400 start 22020096 length 16777216 flags 258 - system raid6 block group at 0xffff8f3d53ddb400 start 38797312 length 536870912 flags 260 - meta raid6 block group at 0xffff8f3d5f4d9c00 start 575668224 length 2147483648 flags 257 - data raid6 block group at 0xffff8f3d08189000 start 2723151872 length 67108864 flags 258 - system raid6 block group at 0xffff8f3db70ff000 start 2790260736 length 1073741824 flags 260 - meta raid6 block group at 0xffff8f3d5f4dd800 start 3864002560 length 67108864 flags 258 - system raid6 block group at 0xffff8f3d67037000 start 3931111424 length 2147483648 flags 257 - data raid6 $ So there were only 2 reasons left for having a readahead extent with a single zone: reada_find_zone(), called when creating a readahead extent, returned NULL either because we failed to find the corresponding block group or because a memory allocation failed. With some additional and custom tracing I figured out that on every further ocurrence of the problem the block group had just been deleted when we were looping to create the zones for the readahead extent (at reada_find_extent()), so we ended up with only one zone in the readahead extent, corresponding to a device that ends up getting replaced. So after figuring that out it became obvious why the hang happens: 1) Task A starts a scrub on any device of the filesystem, except for device /dev/sdd; 2) Task B starts a device replace with /dev/sdd as the source device; 3) Task A calls btrfs_reada_add() from scrub_stripe() and it is currently starting to scrub a stripe from block group X. This call to btrfs_reada_add() is the one for the extent tree. When btrfs_reada_add() calls reada_add_block(), it passes the logical address of the extent tree's root node as its 'logical' argument - a value of 38928384; 4) Task A then enters reada_find_extent(), called from reada_add_block(). It finds there isn't any existing readahead extent for the logical address 38928384, so it proceeds to the path of creating a new one. It calls btrfs_map_block() to find out which stripes exist for the block group X. On the first iteration of the for loop that iterates over the stripes, it finds the stripe for device /dev/sdd, so it creates one zone for that device and adds it to the readahead extent. Before getting into the second iteration of the loop, the cleanup kthread deletes block group X because it was empty. So in the iterations for the remaining stripes it does not add more zones to the readahead extent, because the calls to reada_find_zone() returned NULL because they couldn't find block group X anymore. As a result the new readahead extent has a single zone, corresponding to the device /dev/sdd; 4) Before task A returns to btrfs_reada_add() and queues the readahead job for the readahead work queue, task B finishes the device replace and at btrfs_dev_replace_finishing() swaps the device /dev/sdd with the new device /dev/sdg; 5) Task A returns to reada_add_block(), which increments the counter "->elems" of the reada_control structure allocated at btrfs_reada_add(). Then it returns back to btrfs_reada_add() and calls reada_start_machine(). This queues a job in the readahead work queue to run the function reada_start_machine_worker(), which calls __reada_start_machine(). At __reada_start_machine() we take the device list mutex and for each device found in the current device list, we call reada_start_machine_dev() to start the readahead work. However at this point the device /dev/sdd was already freed and is not in the device list anymore. This means the corresponding readahead for the extent at 38928384 is never started, and therefore the "->elems" counter of the reada_control structure allocated at btrfs_reada_add() never goes down to 0, causing the call to btrfs_reada_wait(), done by the scrub task, to wait forever. Note that the readahead request can be made either after the device replace started or before it started, however in pratice it is very unlikely that a device replace is able to start after a readahead request is made and is able to complete before the readahead request completes - maybe only on a very small and nearly empty filesystem. This hang however is not the only problem we can have with readahead and device removals. When the readahead extent has other zones other than the one corresponding to the device that is being removed (either by a device replace or a device remove operation), we risk having a use-after-free on the device when dropping the last reference of the readahead extent. For example if we create a readahead extent with two zones, one for the device /dev/sdd and one for the device /dev/sde: 1) Before the readahead worker starts, the device /dev/sdd is removed, and the corresponding btrfs_device structure is freed. However the readahead extent still has the zone pointing to the device structure; 2) When the readahead worker starts, it only finds device /dev/sde in the current device list of the filesystem; 3) It starts the readahead work, at reada_start_machine_dev(), using the device /dev/sde; 4) Then when it finishes reading the extent from device /dev/sde, it calls __readahead_hook() which ends up dropping the last reference on the readahead extent through the last call to reada_extent_put(); 5) At reada_extent_put() it iterates over each zone of the readahead extent and attempts to delete an element from the device's 'reada_extents' radix tree, resulting in a use-after-free, as the device pointer of the zone for /dev/sdd is now stale. We can also access the device after dropping the last reference of a zone, through reada_zone_release(), also called by reada_extent_put(). And a device remove suffers the same problem, however since it shrinks the device size down to zero before removing the device, it is very unlikely to still have readahead requests not completed by the time we free the device, the only possibility is if the device has a very little space allocated. While the hang problem is exclusive to scrub, since it is currently the only user of btrfs_reada_add() and btrfs_reada_wait(), the use-after-free problem affects any path that triggers readhead, which includes btree_readahead_hook() and __readahead_hook() (a readahead worker can trigger readahed for the children of a node) for example - any path that ends up calling reada_add_block() can trigger the use-after-free after a device is removed. So fix this by waiting for any readahead requests for a device to complete before removing a device, ensuring that while waiting for existing ones no new ones can be made. This problem has been around for a very long time - the readahead code was added in 2011, device remove exists since 2008 and device replace was introduced in 2013, hard to pick a specific commit for a git Fixes tag. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-10-12 04:55:24 -06:00
if (!ret)
btrfs_reada_remove_dev(device);
mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
/*
* TODO: the superblock still includes this device in its num_devices
* counter although write_all_supers() is not locked out. This
* could give a filesystem state which requires a degraded mount.
*/
ret = btrfs_rm_dev_item(device);
if (ret)
goto error_undo;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
/*
* the device list mutex makes sure that we don't change
* the device list while someone else is writing out all
Btrfs: fix race between removing a dev and writing sbs This change fixes an issue when removing a device and writing all super blocks run simultaneously. Here's the steps necessary for the issue to happen: 1) disk-io.c:write_all_supers() gets a number of N devices from the super_copy, so it will not panic if it fails to write super blocks for N - 1 devices; 2) Then it tries to acquire the device_list_mutex, but blocks because volumes.c:btrfs_rm_device() got it first; 3) btrfs_rm_device() removes the device from the list, then unlocks the mutex and after the unlock it updates the number of devices in super_copy to N - 1. 4) write_all_supers() finally acquires the mutex, iterates over all the devices in the list and gets N - 1 errors, that is, it failed to write super blocks to all the devices; 5) Because write_all_supers() thinks there are a total of N devices, it considers N - 1 errors to be ok, and therefore won't panic. So this change just makes sure that write_all_supers() reads the number of devices from super_copy after it acquires the device_list_mutex. Conversely, it changes btrfs_rm_device() to update the number of devices in super_copy before it releases the device list mutex. The code path to add a new device (volumes.c:btrfs_init_new_device), already has the right behaviour: it updates the number of devices in super_copy while holding the device_list_mutex. The only code path that doesn't lock the device list mutex before updating the number of devices in the super copy is disk-io.c:next_root_backup(), called by open_ctree() during mount time where concurrency issues can't happen. Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-09 08:41:36 -06:00
* the device supers. Whoever is writing all supers, should
* lock the device list mutex before getting the number of
* devices in the super block (super_copy). Conversely,
* whoever updates the number of devices in the super block
* (super_copy) should hold the device list mutex.
*/
/*
* In normal cases the cur_devices == fs_devices. But in case
* of deleting a seed device, the cur_devices should point to
* its own fs_devices listed under the fs_devices->seed.
*/
cur_devices = device->fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
list_del_rcu(&device->dev_list);
cur_devices->num_devices--;
cur_devices->total_devices--;
/* Update total_devices of the parent fs_devices if it's seed */
if (cur_devices != fs_devices)
fs_devices->total_devices--;
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
cur_devices->missing_devices--;
btrfs_assign_next_active_device(device, NULL);
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
btrfs_sysfs_rm_device_link(fs_devices, device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
mutex_unlock(&fs_devices->device_list_mutex);
/*
* at this point, the device is zero sized and detached from
* the devices list. All that's left is to zero out the old
* supers and free the device.
*/
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
btrfs_scratch_superblocks(device->bdev, device->name->str);
btrfs_close_bdev(device);
synchronize_rcu();
btrfs_free_device(device);
if (cur_devices->open_devices == 0) {
while (fs_devices) {
if (fs_devices->seed == cur_devices) {
fs_devices->seed = cur_devices->seed;
break;
}
fs_devices = fs_devices->seed;
}
cur_devices->seed = NULL;
close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
out:
mutex_unlock(&uuid_mutex);
return ret;
error_undo:
btrfs: fix readahead hang and use-after-free after removing a device commit 66d204a16c94f24ad08290a7663ab67e7fc04e82 upstream. Very sporadically I had test case btrfs/069 from fstests hanging (for years, it is not a recent regression), with the following traces in dmesg/syslog: [162301.160628] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg started [162301.181196] BTRFS info (device sdc): scrub: finished on devid 4 with status: 0 [162301.287162] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg finished [162513.513792] INFO: task btrfs-transacti:1356167 blocked for more than 120 seconds. [162513.514318] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.514522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.514747] task:btrfs-transacti state:D stack: 0 pid:1356167 ppid: 2 flags:0x00004000 [162513.514751] Call Trace: [162513.514761] __schedule+0x5ce/0xd00 [162513.514765] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.514771] schedule+0x46/0xf0 [162513.514844] wait_current_trans+0xde/0x140 [btrfs] [162513.514850] ? finish_wait+0x90/0x90 [162513.514864] start_transaction+0x37c/0x5f0 [btrfs] [162513.514879] transaction_kthread+0xa4/0x170 [btrfs] [162513.514891] ? btrfs_cleanup_transaction+0x660/0x660 [btrfs] [162513.514894] kthread+0x153/0x170 [162513.514897] ? kthread_stop+0x2c0/0x2c0 [162513.514902] ret_from_fork+0x22/0x30 [162513.514916] INFO: task fsstress:1356184 blocked for more than 120 seconds. [162513.515192] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.515431] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.515680] task:fsstress state:D stack: 0 pid:1356184 ppid:1356177 flags:0x00004000 [162513.515682] Call Trace: [162513.515688] __schedule+0x5ce/0xd00 [162513.515691] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.515697] schedule+0x46/0xf0 [162513.515712] wait_current_trans+0xde/0x140 [btrfs] [162513.515716] ? finish_wait+0x90/0x90 [162513.515729] start_transaction+0x37c/0x5f0 [btrfs] [162513.515743] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs] [162513.515753] btrfs_sync_fs+0x61/0x1c0 [btrfs] [162513.515758] ? __ia32_sys_fdatasync+0x20/0x20 [162513.515761] iterate_supers+0x87/0xf0 [162513.515765] ksys_sync+0x60/0xb0 [162513.515768] __do_sys_sync+0xa/0x10 [162513.515771] do_syscall_64+0x33/0x80 [162513.515774] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.515781] RIP: 0033:0x7f5238f50bd7 [162513.515782] Code: Bad RIP value. [162513.515784] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2 [162513.515786] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7 [162513.515788] RDX: 00000000ffffffff RSI: 000000000daf0e74 RDI: 000000000000003a [162513.515789] RBP: 0000000000000032 R08: 000000000000000a R09: 00007f5239019be0 [162513.515791] R10: fffffffffffff24f R11: 0000000000000206 R12: 000000000000003a [162513.515792] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340 [162513.515804] INFO: task fsstress:1356185 blocked for more than 120 seconds. [162513.516064] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.516329] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.516617] task:fsstress state:D stack: 0 pid:1356185 ppid:1356177 flags:0x00000000 [162513.516620] Call Trace: [162513.516625] __schedule+0x5ce/0xd00 [162513.516628] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.516634] schedule+0x46/0xf0 [162513.516647] wait_current_trans+0xde/0x140 [btrfs] [162513.516650] ? finish_wait+0x90/0x90 [162513.516662] start_transaction+0x4d7/0x5f0 [btrfs] [162513.516679] btrfs_setxattr_trans+0x3c/0x100 [btrfs] [162513.516686] __vfs_setxattr+0x66/0x80 [162513.516691] __vfs_setxattr_noperm+0x70/0x200 [162513.516697] vfs_setxattr+0x6b/0x120 [162513.516703] setxattr+0x125/0x240 [162513.516709] ? lock_acquire+0xb1/0x480 [162513.516712] ? mnt_want_write+0x20/0x50 [162513.516721] ? rcu_read_lock_any_held+0x8e/0xb0 [162513.516723] ? preempt_count_add+0x49/0xa0 [162513.516725] ? __sb_start_write+0x19b/0x290 [162513.516727] ? preempt_count_add+0x49/0xa0 [162513.516732] path_setxattr+0xba/0xd0 [162513.516739] __x64_sys_setxattr+0x27/0x30 [162513.516741] do_syscall_64+0x33/0x80 [162513.516743] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.516745] RIP: 0033:0x7f5238f56d5a [162513.516746] Code: Bad RIP value. [162513.516748] RSP: 002b:00007fff67b97868 EFLAGS: 00000202 ORIG_RAX: 00000000000000bc [162513.516750] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f5238f56d5a [162513.516751] RDX: 000055b1fbb0d5a0 RSI: 00007fff67b978a0 RDI: 000055b1fbb0d470 [162513.516753] RBP: 000055b1fbb0d5a0 R08: 0000000000000001 R09: 00007fff67b97700 [162513.516754] R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000004 [162513.516756] R13: 0000000000000024 R14: 0000000000000001 R15: 00007fff67b978a0 [162513.516767] INFO: task fsstress:1356196 blocked for more than 120 seconds. [162513.517064] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.517365] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.517763] task:fsstress state:D stack: 0 pid:1356196 ppid:1356177 flags:0x00004000 [162513.517780] Call Trace: [162513.517786] __schedule+0x5ce/0xd00 [162513.517789] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.517796] schedule+0x46/0xf0 [162513.517810] wait_current_trans+0xde/0x140 [btrfs] [162513.517814] ? finish_wait+0x90/0x90 [162513.517829] start_transaction+0x37c/0x5f0 [btrfs] [162513.517845] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs] [162513.517857] btrfs_sync_fs+0x61/0x1c0 [btrfs] [162513.517862] ? __ia32_sys_fdatasync+0x20/0x20 [162513.517865] iterate_supers+0x87/0xf0 [162513.517869] ksys_sync+0x60/0xb0 [162513.517872] __do_sys_sync+0xa/0x10 [162513.517875] do_syscall_64+0x33/0x80 [162513.517878] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.517881] RIP: 0033:0x7f5238f50bd7 [162513.517883] Code: Bad RIP value. [162513.517885] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2 [162513.517887] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7 [162513.517889] RDX: 0000000000000000 RSI: 000000007660add2 RDI: 0000000000000053 [162513.517891] RBP: 0000000000000032 R08: 0000000000000067 R09: 00007f5239019be0 [162513.517893] R10: fffffffffffff24f R11: 0000000000000206 R12: 0000000000000053 [162513.517895] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340 [162513.517908] INFO: task fsstress:1356197 blocked for more than 120 seconds. [162513.518298] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.518672] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.519157] task:fsstress state:D stack: 0 pid:1356197 ppid:1356177 flags:0x00000000 [162513.519160] Call Trace: [162513.519165] __schedule+0x5ce/0xd00 [162513.519168] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.519174] schedule+0x46/0xf0 [162513.519190] wait_current_trans+0xde/0x140 [btrfs] [162513.519193] ? finish_wait+0x90/0x90 [162513.519206] start_transaction+0x4d7/0x5f0 [btrfs] [162513.519222] btrfs_create+0x57/0x200 [btrfs] [162513.519230] lookup_open+0x522/0x650 [162513.519246] path_openat+0x2b8/0xa50 [162513.519270] do_filp_open+0x91/0x100 [162513.519275] ? find_held_lock+0x32/0x90 [162513.519280] ? lock_acquired+0x33b/0x470 [162513.519285] ? do_raw_spin_unlock+0x4b/0xc0 [162513.519287] ? _raw_spin_unlock+0x29/0x40 [162513.519295] do_sys_openat2+0x20d/0x2d0 [162513.519300] do_sys_open+0x44/0x80 [162513.519304] do_syscall_64+0x33/0x80 [162513.519307] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.519309] RIP: 0033:0x7f5238f4a903 [162513.519310] Code: Bad RIP value. [162513.519312] RSP: 002b:00007fff67b97758 EFLAGS: 00000246 ORIG_RAX: 0000000000000055 [162513.519314] RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f5238f4a903 [162513.519316] RDX: 0000000000000000 RSI: 00000000000001b6 RDI: 000055b1fbb0d470 [162513.519317] RBP: 00007fff67b978c0 R08: 0000000000000001 R09: 0000000000000002 [162513.519319] R10: 00007fff67b974f7 R11: 0000000000000246 R12: 0000000000000013 [162513.519320] R13: 00000000000001b6 R14: 00007fff67b97906 R15: 000055b1fad1c620 [162513.519332] INFO: task btrfs:1356211 blocked for more than 120 seconds. [162513.519727] Not tainted 5.9.0-rc6-btrfs-next-69 #1 [162513.520115] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [162513.520508] task:btrfs state:D stack: 0 pid:1356211 ppid:1356178 flags:0x00004002 [162513.520511] Call Trace: [162513.520516] __schedule+0x5ce/0xd00 [162513.520519] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [162513.520525] schedule+0x46/0xf0 [162513.520544] btrfs_scrub_pause+0x11f/0x180 [btrfs] [162513.520548] ? finish_wait+0x90/0x90 [162513.520562] btrfs_commit_transaction+0x45a/0xc30 [btrfs] [162513.520574] ? start_transaction+0xe0/0x5f0 [btrfs] [162513.520596] btrfs_dev_replace_finishing+0x6d8/0x711 [btrfs] [162513.520619] btrfs_dev_replace_by_ioctl.cold+0x1cc/0x1fd [btrfs] [162513.520639] btrfs_ioctl+0x2a25/0x36f0 [btrfs] [162513.520643] ? do_sigaction+0xf3/0x240 [162513.520645] ? find_held_lock+0x32/0x90 [162513.520648] ? do_sigaction+0xf3/0x240 [162513.520651] ? lock_acquired+0x33b/0x470 [162513.520655] ? _raw_spin_unlock_irq+0x24/0x50 [162513.520657] ? lockdep_hardirqs_on+0x7d/0x100 [162513.520660] ? _raw_spin_unlock_irq+0x35/0x50 [162513.520662] ? do_sigaction+0xf3/0x240 [162513.520671] ? __x64_sys_ioctl+0x83/0xb0 [162513.520672] __x64_sys_ioctl+0x83/0xb0 [162513.520677] do_syscall_64+0x33/0x80 [162513.520679] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [162513.520681] RIP: 0033:0x7fc3cd307d87 [162513.520682] Code: Bad RIP value. [162513.520684] RSP: 002b:00007ffe30a56bb8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [162513.520686] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fc3cd307d87 [162513.520687] RDX: 00007ffe30a57a30 RSI: 00000000ca289435 RDI: 0000000000000003 [162513.520689] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [162513.520690] R10: 0000000000000008 R11: 0000000000000202 R12: 0000000000000003 [162513.520692] R13: 0000557323a212e0 R14: 00007ffe30a5a520 R15: 0000000000000001 [162513.520703] Showing all locks held in the system: [162513.520712] 1 lock held by khungtaskd/54: [162513.520713] #0: ffffffffb40a91a0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x15/0x197 [162513.520728] 1 lock held by in:imklog/596: [162513.520729] #0: ffff8f3f0d781400 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4d/0x60 [162513.520782] 1 lock held by btrfs-transacti/1356167: [162513.520784] #0: ffff8f3d810cc848 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0x4a/0x170 [btrfs] [162513.520798] 1 lock held by btrfs/1356190: [162513.520800] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0x60 [162513.520805] 1 lock held by fsstress/1356184: [162513.520806] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0 [162513.520811] 3 locks held by fsstress/1356185: [162513.520812] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50 [162513.520815] #1: ffff8f3d80a650b8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: vfs_setxattr+0x50/0x120 [162513.520820] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs] [162513.520833] 1 lock held by fsstress/1356196: [162513.520834] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0 [162513.520838] 3 locks held by fsstress/1356197: [162513.520839] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50 [162513.520843] #1: ffff8f3d506465e8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: path_openat+0x2a7/0xa50 [162513.520846] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs] [162513.520858] 2 locks held by btrfs/1356211: [162513.520859] #0: ffff8f3d810cde30 (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.}-{3:3}, at: btrfs_dev_replace_finishing+0x52/0x711 [btrfs] [162513.520877] #1: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs] This was weird because the stack traces show that a transaction commit, triggered by a device replace operation, is blocking trying to pause any running scrubs but there are no stack traces of blocked tasks doing a scrub. After poking around with drgn, I noticed there was a scrub task that was constantly running and blocking for shorts periods of time: >>> t = find_task(prog, 1356190) >>> prog.stack_trace(t) #0 __schedule+0x5ce/0xcfc #1 schedule+0x46/0xe4 #2 schedule_timeout+0x1df/0x475 #3 btrfs_reada_wait+0xda/0x132 #4 scrub_stripe+0x2a8/0x112f #5 scrub_chunk+0xcd/0x134 #6 scrub_enumerate_chunks+0x29e/0x5ee #7 btrfs_scrub_dev+0x2d5/0x91b #8 btrfs_ioctl+0x7f5/0x36e7 #9 __x64_sys_ioctl+0x83/0xb0 #10 do_syscall_64+0x33/0x77 #11 entry_SYSCALL_64+0x7c/0x156 Which corresponds to: int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; struct btrfs_fs_info *fs_info = rc->fs_info; while (atomic_read(&rc->elems)) { if (!atomic_read(&fs_info->reada_works_cnt)) reada_start_machine(fs_info); wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, (HZ + 9) / 10); } (...) So the counter "rc->elems" was set to 1 and never decreased to 0, causing the scrub task to loop forever in that function. Then I used the following script for drgn to check the readahead requests: $ cat dump_reada.py import sys import drgn from drgn import NULL, Object, cast, container_of, execscript, \ reinterpret, sizeof from drgn.helpers.linux import * mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1" mnt = None for mnt in for_each_mount(prog, dst = mnt_path): pass if mnt is None: sys.stderr.write(f'Error: mount point {mnt_path} not found\n') sys.exit(1) fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info) def dump_re(re): nzones = re.nzones.value_() print(f're at {hex(re.value_())}') print(f'\t logical {re.logical.value_()}') print(f'\t refcnt {re.refcnt.value_()}') print(f'\t nzones {nzones}') for i in range(nzones): dev = re.zones[i].device name = dev.name.str.string_() print(f'\t\t dev id {dev.devid.value_()} name {name}') print() for _, e in radix_tree_for_each(fs_info.reada_tree): re = cast('struct reada_extent *', e) dump_re(re) $ drgn dump_reada.py re at 0xffff8f3da9d25ad8 logical 38928384 refcnt 1 nzones 1 dev id 0 name b'/dev/sdd' $ So there was one readahead extent with a single zone corresponding to the source device of that last device replace operation logged in dmesg/syslog. Also the ID of that zone's device was 0 which is a special value set in the source device of a device replace operation when the operation finishes (constant BTRFS_DEV_REPLACE_DEVID set at btrfs_dev_replace_finishing()), confirming again that device /dev/sdd was the source of a device replace operation. Normally there should be as many zones in the readahead extent as there are devices, and I wasn't expecting the extent to be in a block group with a 'single' profile, so I went and confirmed with the following drgn script that there weren't any single profile block groups: $ cat dump_block_groups.py import sys import drgn from drgn import NULL, Object, cast, container_of, execscript, \ reinterpret, sizeof from drgn.helpers.linux import * mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1" mnt = None for mnt in for_each_mount(prog, dst = mnt_path): pass if mnt is None: sys.stderr.write(f'Error: mount point {mnt_path} not found\n') sys.exit(1) fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info) BTRFS_BLOCK_GROUP_DATA = (1 << 0) BTRFS_BLOCK_GROUP_SYSTEM = (1 << 1) BTRFS_BLOCK_GROUP_METADATA = (1 << 2) BTRFS_BLOCK_GROUP_RAID0 = (1 << 3) BTRFS_BLOCK_GROUP_RAID1 = (1 << 4) BTRFS_BLOCK_GROUP_DUP = (1 << 5) BTRFS_BLOCK_GROUP_RAID10 = (1 << 6) BTRFS_BLOCK_GROUP_RAID5 = (1 << 7) BTRFS_BLOCK_GROUP_RAID6 = (1 << 8) BTRFS_BLOCK_GROUP_RAID1C3 = (1 << 9) BTRFS_BLOCK_GROUP_RAID1C4 = (1 << 10) def bg_flags_string(bg): flags = bg.flags.value_() ret = '' if flags & BTRFS_BLOCK_GROUP_DATA: ret = 'data' if flags & BTRFS_BLOCK_GROUP_METADATA: if len(ret) > 0: ret += '|' ret += 'meta' if flags & BTRFS_BLOCK_GROUP_SYSTEM: if len(ret) > 0: ret += '|' ret += 'system' if flags & BTRFS_BLOCK_GROUP_RAID0: ret += ' raid0' elif flags & BTRFS_BLOCK_GROUP_RAID1: ret += ' raid1' elif flags & BTRFS_BLOCK_GROUP_DUP: ret += ' dup' elif flags & BTRFS_BLOCK_GROUP_RAID10: ret += ' raid10' elif flags & BTRFS_BLOCK_GROUP_RAID5: ret += ' raid5' elif flags & BTRFS_BLOCK_GROUP_RAID6: ret += ' raid6' elif flags & BTRFS_BLOCK_GROUP_RAID1C3: ret += ' raid1c3' elif flags & BTRFS_BLOCK_GROUP_RAID1C4: ret += ' raid1c4' else: ret += ' single' return ret def dump_bg(bg): print() print(f'block group at {hex(bg.value_())}') print(f'\t start {bg.start.value_()} length {bg.length.value_()}') print(f'\t flags {bg.flags.value_()} - {bg_flags_string(bg)}') bg_root = fs_info.block_group_cache_tree.address_of_() for bg in rbtree_inorder_for_each_entry('struct btrfs_block_group', bg_root, 'cache_node'): dump_bg(bg) $ drgn dump_block_groups.py block group at 0xffff8f3d673b0400 start 22020096 length 16777216 flags 258 - system raid6 block group at 0xffff8f3d53ddb400 start 38797312 length 536870912 flags 260 - meta raid6 block group at 0xffff8f3d5f4d9c00 start 575668224 length 2147483648 flags 257 - data raid6 block group at 0xffff8f3d08189000 start 2723151872 length 67108864 flags 258 - system raid6 block group at 0xffff8f3db70ff000 start 2790260736 length 1073741824 flags 260 - meta raid6 block group at 0xffff8f3d5f4dd800 start 3864002560 length 67108864 flags 258 - system raid6 block group at 0xffff8f3d67037000 start 3931111424 length 2147483648 flags 257 - data raid6 $ So there were only 2 reasons left for having a readahead extent with a single zone: reada_find_zone(), called when creating a readahead extent, returned NULL either because we failed to find the corresponding block group or because a memory allocation failed. With some additional and custom tracing I figured out that on every further ocurrence of the problem the block group had just been deleted when we were looping to create the zones for the readahead extent (at reada_find_extent()), so we ended up with only one zone in the readahead extent, corresponding to a device that ends up getting replaced. So after figuring that out it became obvious why the hang happens: 1) Task A starts a scrub on any device of the filesystem, except for device /dev/sdd; 2) Task B starts a device replace with /dev/sdd as the source device; 3) Task A calls btrfs_reada_add() from scrub_stripe() and it is currently starting to scrub a stripe from block group X. This call to btrfs_reada_add() is the one for the extent tree. When btrfs_reada_add() calls reada_add_block(), it passes the logical address of the extent tree's root node as its 'logical' argument - a value of 38928384; 4) Task A then enters reada_find_extent(), called from reada_add_block(). It finds there isn't any existing readahead extent for the logical address 38928384, so it proceeds to the path of creating a new one. It calls btrfs_map_block() to find out which stripes exist for the block group X. On the first iteration of the for loop that iterates over the stripes, it finds the stripe for device /dev/sdd, so it creates one zone for that device and adds it to the readahead extent. Before getting into the second iteration of the loop, the cleanup kthread deletes block group X because it was empty. So in the iterations for the remaining stripes it does not add more zones to the readahead extent, because the calls to reada_find_zone() returned NULL because they couldn't find block group X anymore. As a result the new readahead extent has a single zone, corresponding to the device /dev/sdd; 4) Before task A returns to btrfs_reada_add() and queues the readahead job for the readahead work queue, task B finishes the device replace and at btrfs_dev_replace_finishing() swaps the device /dev/sdd with the new device /dev/sdg; 5) Task A returns to reada_add_block(), which increments the counter "->elems" of the reada_control structure allocated at btrfs_reada_add(). Then it returns back to btrfs_reada_add() and calls reada_start_machine(). This queues a job in the readahead work queue to run the function reada_start_machine_worker(), which calls __reada_start_machine(). At __reada_start_machine() we take the device list mutex and for each device found in the current device list, we call reada_start_machine_dev() to start the readahead work. However at this point the device /dev/sdd was already freed and is not in the device list anymore. This means the corresponding readahead for the extent at 38928384 is never started, and therefore the "->elems" counter of the reada_control structure allocated at btrfs_reada_add() never goes down to 0, causing the call to btrfs_reada_wait(), done by the scrub task, to wait forever. Note that the readahead request can be made either after the device replace started or before it started, however in pratice it is very unlikely that a device replace is able to start after a readahead request is made and is able to complete before the readahead request completes - maybe only on a very small and nearly empty filesystem. This hang however is not the only problem we can have with readahead and device removals. When the readahead extent has other zones other than the one corresponding to the device that is being removed (either by a device replace or a device remove operation), we risk having a use-after-free on the device when dropping the last reference of the readahead extent. For example if we create a readahead extent with two zones, one for the device /dev/sdd and one for the device /dev/sde: 1) Before the readahead worker starts, the device /dev/sdd is removed, and the corresponding btrfs_device structure is freed. However the readahead extent still has the zone pointing to the device structure; 2) When the readahead worker starts, it only finds device /dev/sde in the current device list of the filesystem; 3) It starts the readahead work, at reada_start_machine_dev(), using the device /dev/sde; 4) Then when it finishes reading the extent from device /dev/sde, it calls __readahead_hook() which ends up dropping the last reference on the readahead extent through the last call to reada_extent_put(); 5) At reada_extent_put() it iterates over each zone of the readahead extent and attempts to delete an element from the device's 'reada_extents' radix tree, resulting in a use-after-free, as the device pointer of the zone for /dev/sdd is now stale. We can also access the device after dropping the last reference of a zone, through reada_zone_release(), also called by reada_extent_put(). And a device remove suffers the same problem, however since it shrinks the device size down to zero before removing the device, it is very unlikely to still have readahead requests not completed by the time we free the device, the only possibility is if the device has a very little space allocated. While the hang problem is exclusive to scrub, since it is currently the only user of btrfs_reada_add() and btrfs_reada_wait(), the use-after-free problem affects any path that triggers readhead, which includes btree_readahead_hook() and __readahead_hook() (a readahead worker can trigger readahed for the children of a node) for example - any path that ends up calling reada_add_block() can trigger the use-after-free after a device is removed. So fix this by waiting for any readahead requests for a device to complete before removing a device, ensuring that while waiting for existing ones no new ones can be made. This problem has been around for a very long time - the readahead code was added in 2011, device remove exists since 2008 and device replace was introduced in 2013, hard to pick a specific commit for a git Fixes tag. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-10-12 04:55:24 -06:00
btrfs_reada_undo_remove_dev(device);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
list_add(&device->dev_alloc_list,
&fs_devices->alloc_list);
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
goto out;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
struct btrfs_fs_devices *fs_devices;
lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
/*
* in case of fs with no seed, srcdev->fs_devices will point
* to fs_devices of fs_info. However when the dev being replaced is
* a seed dev it will point to the seed's local fs_devices. In short
* srcdev will have its correct fs_devices in both the cases.
*/
fs_devices = srcdev->fs_devices;
list_del_rcu(&srcdev->dev_list);
list_del(&srcdev->dev_alloc_list);
fs_devices->num_devices--;
if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
fs_devices->missing_devices--;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
fs_devices->rw_devices--;
if (srcdev->bdev)
fs_devices->open_devices--;
btrfs: Fix a lockdep warning when running xfstest. The following lockdep warning is triggered during xfstests: [ 1702.980872] ========================================================= [ 1702.981181] [ INFO: possible irq lock inversion dependency detected ] [ 1702.981482] 3.18.0-rc1 #27 Not tainted [ 1702.981781] --------------------------------------------------------- [ 1702.982095] kswapd0/77 just changed the state of lock: [ 1702.982415] (&delayed_node->mutex){+.+.-.}, at: [<ffffffffa03b0b51>] __btrfs_release_delayed_node+0x41/0x1f0 [btrfs] [ 1702.982794] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1702.983160] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1702.984675] other info that might help us debug this: [ 1702.985524] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1702.986799] Possible interrupt unsafe locking scenario: [ 1702.987681] CPU0 CPU1 [ 1702.988137] ---- ---- [ 1702.988598] lock(&fs_info->dev_replace.lock); [ 1702.989069] local_irq_disable(); [ 1702.989534] lock(&delayed_node->mutex); [ 1702.990038] lock(&found->groups_sem); [ 1702.990494] <Interrupt> [ 1702.990938] lock(&delayed_node->mutex); [ 1702.991407] *** DEADLOCK *** It is because the btrfs_kobj_{add/rm}_device() will call memory allocation with GFP_KERNEL, which may flush fs page cache to free space, waiting for it self to do the commit, causing the deadlock. To solve the problem, move btrfs_kobj_{add/rm}_device() out of the dev_replace lock range, also involing split the btrfs_rm_dev_replace_srcdev() function into remove and free parts. Now only btrfs_rm_dev_replace_remove_srcdev() is called in dev_replace lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are called out of the lock range. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-10-30 02:52:31 -06:00
}
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
btrfs: Fix a lockdep warning when running xfstest. The following lockdep warning is triggered during xfstests: [ 1702.980872] ========================================================= [ 1702.981181] [ INFO: possible irq lock inversion dependency detected ] [ 1702.981482] 3.18.0-rc1 #27 Not tainted [ 1702.981781] --------------------------------------------------------- [ 1702.982095] kswapd0/77 just changed the state of lock: [ 1702.982415] (&delayed_node->mutex){+.+.-.}, at: [<ffffffffa03b0b51>] __btrfs_release_delayed_node+0x41/0x1f0 [btrfs] [ 1702.982794] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1702.983160] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1702.984675] other info that might help us debug this: [ 1702.985524] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1702.986799] Possible interrupt unsafe locking scenario: [ 1702.987681] CPU0 CPU1 [ 1702.988137] ---- ---- [ 1702.988598] lock(&fs_info->dev_replace.lock); [ 1702.989069] local_irq_disable(); [ 1702.989534] lock(&delayed_node->mutex); [ 1702.990038] lock(&found->groups_sem); [ 1702.990494] <Interrupt> [ 1702.990938] lock(&delayed_node->mutex); [ 1702.991407] *** DEADLOCK *** It is because the btrfs_kobj_{add/rm}_device() will call memory allocation with GFP_KERNEL, which may flush fs page cache to free space, waiting for it self to do the commit, causing the deadlock. To solve the problem, move btrfs_kobj_{add/rm}_device() out of the dev_replace lock range, also involing split the btrfs_rm_dev_replace_srcdev() function into remove and free parts. Now only btrfs_rm_dev_replace_remove_srcdev() is called in dev_replace lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are called out of the lock range. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-10-30 02:52:31 -06:00
{
struct btrfs_fs_info *fs_info = srcdev->fs_info;
btrfs: Fix a lockdep warning when running xfstest. The following lockdep warning is triggered during xfstests: [ 1702.980872] ========================================================= [ 1702.981181] [ INFO: possible irq lock inversion dependency detected ] [ 1702.981482] 3.18.0-rc1 #27 Not tainted [ 1702.981781] --------------------------------------------------------- [ 1702.982095] kswapd0/77 just changed the state of lock: [ 1702.982415] (&delayed_node->mutex){+.+.-.}, at: [<ffffffffa03b0b51>] __btrfs_release_delayed_node+0x41/0x1f0 [btrfs] [ 1702.982794] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1702.983160] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1702.984675] other info that might help us debug this: [ 1702.985524] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1702.986799] Possible interrupt unsafe locking scenario: [ 1702.987681] CPU0 CPU1 [ 1702.988137] ---- ---- [ 1702.988598] lock(&fs_info->dev_replace.lock); [ 1702.989069] local_irq_disable(); [ 1702.989534] lock(&delayed_node->mutex); [ 1702.990038] lock(&found->groups_sem); [ 1702.990494] <Interrupt> [ 1702.990938] lock(&delayed_node->mutex); [ 1702.991407] *** DEADLOCK *** It is because the btrfs_kobj_{add/rm}_device() will call memory allocation with GFP_KERNEL, which may flush fs page cache to free space, waiting for it self to do the commit, causing the deadlock. To solve the problem, move btrfs_kobj_{add/rm}_device() out of the dev_replace lock range, also involing split the btrfs_rm_dev_replace_srcdev() function into remove and free parts. Now only btrfs_rm_dev_replace_remove_srcdev() is called in dev_replace lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are called out of the lock range. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-10-30 02:52:31 -06:00
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
btrfs: fix lock dep warning move scratch super outside of chunk_mutex Move scratch super outside of the chunk lock to avoid below lockdep warning. The better place to scratch super is in the function btrfs_rm_dev_replace_free_srcdev() just before free_device, which is outside of the chunk lock as well. To reproduce: (fresh boot) mkfs.btrfs -f -draid5 -mraid5 /dev/sdc /dev/sdd /dev/sde mount /dev/sdc /btrfs dd if=/dev/zero of=/btrfs/tf1 bs=4096 count=100 (get devmgt from https://github.com/asj/devmgt.git) devmgt detach /dev/sde dd if=/dev/zero of=/btrfs/tf1 bs=4096 count=100 sync btrfs replace start -Brf 3 /dev/sdf /btrfs <-- devmgt attach host7 ====================================================== [ INFO: possible circular locking dependency detected ] 4.6.0-rc2asj+ #1 Not tainted --------------------------------------------------- btrfs/2174 is trying to acquire lock: (sb_writers){.+.+.+}, at: [<ffffffff812449b4>] __sb_start_write+0xb4/0xf0 but task is already holding lock: (&fs_info->chunk_mutex){+.+.+.}, at: [<ffffffffa05c5f55>] btrfs_dev_replace_finishing+0x145/0x980 [btrfs] which lock already depends on the new lock. Chain exists of: sb_writers --> &fs_devs->device_list_mutex --> &fs_info->chunk_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_info->chunk_mutex); lock(&fs_devs->device_list_mutex); lock(&fs_info->chunk_mutex); lock(sb_writers); *** DEADLOCK *** -> #0 (sb_writers){.+.+.+}: [<ffffffff810e6415>] __lock_acquire+0x1bc5/0x1ee0 [<ffffffff810e707e>] lock_acquire+0xbe/0x210 [<ffffffff810df49a>] percpu_down_read+0x4a/0xa0 [<ffffffff812449b4>] __sb_start_write+0xb4/0xf0 [<ffffffff81265534>] mnt_want_write+0x24/0x50 [<ffffffff812508a2>] path_openat+0x952/0x1190 [<ffffffff81252451>] do_filp_open+0x91/0x100 [<ffffffff8123f5cc>] file_open_name+0xfc/0x140 [<ffffffff8123f643>] filp_open+0x33/0x60 [<ffffffffa0572bb6>] update_dev_time+0x16/0x40 [btrfs] [<ffffffffa057f60d>] btrfs_scratch_superblocks+0x5d/0xb0 [btrfs] [<ffffffffa057f70e>] btrfs_rm_dev_replace_remove_srcdev+0xae/0xd0 [btrfs] [<ffffffffa05c62c5>] btrfs_dev_replace_finishing+0x4b5/0x980 [btrfs] [<ffffffffa05c6ae8>] btrfs_dev_replace_start+0x358/0x530 [btrfs] Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: David Sterba <dsterba@suse.com>
2016-04-12 07:36:16 -06:00
/* zero out the old super if it is writable */
btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
btrfs_close_bdev(srcdev);
synchronize_rcu();
btrfs_free_device(srcdev);
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
struct btrfs_fs_devices *tmp_fs_devices;
/*
* On a mounted FS, num_devices can't be zero unless it's a
* seed. In case of a seed device being replaced, the replace
* target added to the sprout FS, so there will be no more
* device left under the seed FS.
*/
ASSERT(fs_devices->seeding);
tmp_fs_devices = fs_info->fs_devices;
while (tmp_fs_devices) {
if (tmp_fs_devices->seed == fs_devices) {
tmp_fs_devices->seed = fs_devices->seed;
break;
}
tmp_fs_devices = tmp_fs_devices->seed;
}
fs_devices->seed = NULL;
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
WARN_ON(!tgtdev);
mutex_lock(&fs_devices->device_list_mutex);
btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
btrfs: fix lock dep warning, move scratch dev out of device_list_mutex and uuid_mutex When the replace target fails, the target device will be taken out of fs device list, scratch + update_dev_time and freed. However we could do the scratch + update_dev_time and free part after the device has been taken out of device list, so that we don't have to hold the device_list_mutex and uuid_mutex locks. Reported issue: [ 5375.718845] ====================================================== [ 5375.718846] [ INFO: possible circular locking dependency detected ] [ 5375.718849] 4.4.5-scst31x-debug-11+ #40 Not tainted [ 5375.718849] ------------------------------------------------------- [ 5375.718851] btrfs-health/4662 is trying to acquire lock: [ 5375.718861] (sb_writers){.+.+.+}, at: [<ffffffff812214f7>] __sb_start_write+0xb7/0xf0 [ 5375.718862] [ 5375.718862] but task is already holding lock: [ 5375.718907] (&fs_devs->device_list_mutex){+.+.+.}, at: [<ffffffffa028263c>] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.718907] [ 5375.718907] which lock already depends on the new lock. [ 5375.718907] [ 5375.718908] [ 5375.718908] the existing dependency chain (in reverse order) is: [ 5375.718911] [ 5375.718911] -> #3 (&fs_devs->device_list_mutex){+.+.+.}: [ 5375.718917] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.718921] [<ffffffff81633949>] mutex_lock_nested+0x69/0x3c0 [ 5375.718940] [<ffffffffa0219bf6>] btrfs_show_devname+0x36/0x210 [btrfs] [ 5375.718945] [<ffffffff81267079>] show_vfsmnt+0x49/0x150 [ 5375.718948] [<ffffffff81240b07>] m_show+0x17/0x20 [ 5375.718951] [<ffffffff81246868>] seq_read+0x2d8/0x3b0 [ 5375.718955] [<ffffffff8121df28>] __vfs_read+0x28/0xd0 [ 5375.718959] [<ffffffff8121e806>] vfs_read+0x86/0x130 [ 5375.718962] [<ffffffff8121f4c9>] SyS_read+0x49/0xa0 [ 5375.718966] [<ffffffff81637976>] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718968] [ 5375.718968] -> #2 (namespace_sem){+++++.}: [ 5375.718971] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.718974] [<ffffffff81635199>] down_write+0x49/0x80 [ 5375.718977] [<ffffffff81243593>] lock_mount+0x43/0x1c0 [ 5375.718979] [<ffffffff81243c13>] do_add_mount+0x23/0xd0 [ 5375.718982] [<ffffffff81244afb>] do_mount+0x27b/0xe30 [ 5375.718985] [<ffffffff812459dc>] SyS_mount+0x8c/0xd0 [ 5375.718988] [<ffffffff81637976>] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718991] [ 5375.718991] -> #1 (&sb->s_type->i_mutex_key#5){+.+.+.}: [ 5375.718994] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.718996] [<ffffffff81633949>] mutex_lock_nested+0x69/0x3c0 [ 5375.719001] [<ffffffff8122d608>] path_openat+0x468/0x1360 [ 5375.719004] [<ffffffff8122f86e>] do_filp_open+0x7e/0xe0 [ 5375.719007] [<ffffffff8121da7b>] do_sys_open+0x12b/0x210 [ 5375.719010] [<ffffffff8121db7e>] SyS_open+0x1e/0x20 [ 5375.719013] [<ffffffff81637976>] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.719015] [ 5375.719015] -> #0 (sb_writers){.+.+.+}: [ 5375.719018] [<ffffffff810d97ca>] __lock_acquire+0x17ba/0x1ae0 [ 5375.719021] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.719026] [<ffffffff810d3bef>] percpu_down_read+0x4f/0xa0 [ 5375.719028] [<ffffffff812214f7>] __sb_start_write+0xb7/0xf0 [ 5375.719031] [<ffffffff81242eb4>] mnt_want_write+0x24/0x50 [ 5375.719035] [<ffffffff8122ded2>] path_openat+0xd32/0x1360 [ 5375.719037] [<ffffffff8122f86e>] do_filp_open+0x7e/0xe0 [ 5375.719040] [<ffffffff8121d8a4>] file_open_name+0xe4/0x130 [ 5375.719043] [<ffffffff8121d923>] filp_open+0x33/0x60 [ 5375.719073] [<ffffffffa02776a6>] update_dev_time+0x16/0x40 [btrfs] [ 5375.719099] [<ffffffffa02825be>] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719123] [<ffffffffa0282665>] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719150] [<ffffffffa02c6c80>] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719175] [<ffffffffa02c729e>] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719199] [<ffffffffa02c7f58>] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719222] [<ffffffffa02464e6>] health_kthread+0x246/0x490 [btrfs] [ 5375.719225] [<ffffffff810a70df>] kthread+0xef/0x110 [ 5375.719229] [<ffffffff81637d2f>] ret_from_fork+0x3f/0x70 [ 5375.719230] [ 5375.719230] other info that might help us debug this: [ 5375.719230] [ 5375.719233] Chain exists of: [ 5375.719233] sb_writers --> namespace_sem --> &fs_devs->device_list_mutex [ 5375.719233] [ 5375.719234] Possible unsafe locking scenario: [ 5375.719234] [ 5375.719234] CPU0 CPU1 [ 5375.719235] ---- ---- [ 5375.719236] lock(&fs_devs->device_list_mutex); [ 5375.719238] lock(namespace_sem); [ 5375.719239] lock(&fs_devs->device_list_mutex); [ 5375.719241] lock(sb_writers); [ 5375.719241] [ 5375.719241] *** DEADLOCK *** [ 5375.719241] [ 5375.719243] 4 locks held by btrfs-health/4662: [ 5375.719266] #0: (&fs_info->health_mutex){+.+.+.}, at: [<ffffffffa0246303>] health_kthread+0x63/0x490 [btrfs] [ 5375.719293] #1: (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.+.}, at: [<ffffffffa02c6611>] btrfs_dev_replace_finishing+0x41/0x990 [btrfs] [ 5375.719319] #2: (uuid_mutex){+.+.+.}, at: [<ffffffffa0282620>] btrfs_destroy_dev_replace_tgtdev+0x20/0x150 [btrfs] [ 5375.719343] #3: (&fs_devs->device_list_mutex){+.+.+.}, at: [<ffffffffa028263c>] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.719343] [ 5375.719343] stack backtrace: [ 5375.719347] CPU: 2 PID: 4662 Comm: btrfs-health Not tainted 4.4.5-scst31x-debug-11+ #40 [ 5375.719348] Hardware name: Supermicro SYS-6018R-WTRT/X10DRW-iT, BIOS 1.0c 01/07/2015 [ 5375.719352] 0000000000000000 ffff880856f73880 ffffffff813529e3 ffffffff826182a0 [ 5375.719354] ffffffff8260c090 ffff880856f738c0 ffffffff810d667c ffff880856f73930 [ 5375.719357] ffff880861f32b40 ffff880861f32b68 0000000000000003 0000000000000004 [ 5375.719357] Call Trace: [ 5375.719363] [<ffffffff813529e3>] dump_stack+0x85/0xc2 [ 5375.719366] [<ffffffff810d667c>] print_circular_bug+0x1ec/0x260 [ 5375.719369] [<ffffffff810d97ca>] __lock_acquire+0x17ba/0x1ae0 [ 5375.719373] [<ffffffff810f606d>] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719376] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.719378] [<ffffffff812214f7>] ? __sb_start_write+0xb7/0xf0 [ 5375.719383] [<ffffffff810d3bef>] percpu_down_read+0x4f/0xa0 [ 5375.719385] [<ffffffff812214f7>] ? __sb_start_write+0xb7/0xf0 [ 5375.719387] [<ffffffff812214f7>] __sb_start_write+0xb7/0xf0 [ 5375.719389] [<ffffffff81242eb4>] mnt_want_write+0x24/0x50 [ 5375.719393] [<ffffffff8122ded2>] path_openat+0xd32/0x1360 [ 5375.719415] [<ffffffffa02462a0>] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719418] [<ffffffff810f606d>] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719420] [<ffffffff8122f86e>] do_filp_open+0x7e/0xe0 [ 5375.719423] [<ffffffff810f615d>] ? rcu_read_lock_sched_held+0x6d/0x80 [ 5375.719426] [<ffffffff81201a9b>] ? kmem_cache_alloc+0x26b/0x5d0 [ 5375.719430] [<ffffffff8122e7d4>] ? getname_kernel+0x34/0x120 [ 5375.719433] [<ffffffff8121d8a4>] file_open_name+0xe4/0x130 [ 5375.719436] [<ffffffff8121d923>] filp_open+0x33/0x60 [ 5375.719462] [<ffffffffa02776a6>] update_dev_time+0x16/0x40 [btrfs] [ 5375.719485] [<ffffffffa02825be>] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719506] [<ffffffffa0282665>] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719530] [<ffffffffa02c6c80>] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719554] [<ffffffffa02c6b23>] ? btrfs_dev_replace_finishing+0x553/0x990 [btrfs] [ 5375.719576] [<ffffffffa02c729e>] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719598] [<ffffffffa02c7f58>] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719621] [<ffffffffa02464e6>] health_kthread+0x246/0x490 [btrfs] [ 5375.719641] [<ffffffffa02463d8>] ? health_kthread+0x138/0x490 [btrfs] [ 5375.719661] [<ffffffffa02462a0>] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719663] [<ffffffff810a70df>] kthread+0xef/0x110 [ 5375.719666] [<ffffffff810a6ff0>] ? kthread_create_on_node+0x200/0x200 [ 5375.719669] [<ffffffff81637d2f>] ret_from_fork+0x3f/0x70 [ 5375.719672] [<ffffffff810a6ff0>] ? kthread_create_on_node+0x200/0x200 [ 5375.719697] ------------[ cut here ]------------ Signed-off-by: Anand Jain <anand.jain@oracle.com> Reported-by: Yauhen Kharuzhy <yauhen.kharuzhy@zavadatar.com> Signed-off-by: David Sterba <dsterba@suse.com>
2016-04-18 02:51:23 -06:00
if (tgtdev->bdev)
fs_devices->open_devices--;
btrfs: fix lock dep warning, move scratch dev out of device_list_mutex and uuid_mutex When the replace target fails, the target device will be taken out of fs device list, scratch + update_dev_time and freed. However we could do the scratch + update_dev_time and free part after the device has been taken out of device list, so that we don't have to hold the device_list_mutex and uuid_mutex locks. Reported issue: [ 5375.718845] ====================================================== [ 5375.718846] [ INFO: possible circular locking dependency detected ] [ 5375.718849] 4.4.5-scst31x-debug-11+ #40 Not tainted [ 5375.718849] ------------------------------------------------------- [ 5375.718851] btrfs-health/4662 is trying to acquire lock: [ 5375.718861] (sb_writers){.+.+.+}, at: [<ffffffff812214f7>] __sb_start_write+0xb7/0xf0 [ 5375.718862] [ 5375.718862] but task is already holding lock: [ 5375.718907] (&fs_devs->device_list_mutex){+.+.+.}, at: [<ffffffffa028263c>] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.718907] [ 5375.718907] which lock already depends on the new lock. [ 5375.718907] [ 5375.718908] [ 5375.718908] the existing dependency chain (in reverse order) is: [ 5375.718911] [ 5375.718911] -> #3 (&fs_devs->device_list_mutex){+.+.+.}: [ 5375.718917] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.718921] [<ffffffff81633949>] mutex_lock_nested+0x69/0x3c0 [ 5375.718940] [<ffffffffa0219bf6>] btrfs_show_devname+0x36/0x210 [btrfs] [ 5375.718945] [<ffffffff81267079>] show_vfsmnt+0x49/0x150 [ 5375.718948] [<ffffffff81240b07>] m_show+0x17/0x20 [ 5375.718951] [<ffffffff81246868>] seq_read+0x2d8/0x3b0 [ 5375.718955] [<ffffffff8121df28>] __vfs_read+0x28/0xd0 [ 5375.718959] [<ffffffff8121e806>] vfs_read+0x86/0x130 [ 5375.718962] [<ffffffff8121f4c9>] SyS_read+0x49/0xa0 [ 5375.718966] [<ffffffff81637976>] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718968] [ 5375.718968] -> #2 (namespace_sem){+++++.}: [ 5375.718971] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.718974] [<ffffffff81635199>] down_write+0x49/0x80 [ 5375.718977] [<ffffffff81243593>] lock_mount+0x43/0x1c0 [ 5375.718979] [<ffffffff81243c13>] do_add_mount+0x23/0xd0 [ 5375.718982] [<ffffffff81244afb>] do_mount+0x27b/0xe30 [ 5375.718985] [<ffffffff812459dc>] SyS_mount+0x8c/0xd0 [ 5375.718988] [<ffffffff81637976>] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718991] [ 5375.718991] -> #1 (&sb->s_type->i_mutex_key#5){+.+.+.}: [ 5375.718994] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.718996] [<ffffffff81633949>] mutex_lock_nested+0x69/0x3c0 [ 5375.719001] [<ffffffff8122d608>] path_openat+0x468/0x1360 [ 5375.719004] [<ffffffff8122f86e>] do_filp_open+0x7e/0xe0 [ 5375.719007] [<ffffffff8121da7b>] do_sys_open+0x12b/0x210 [ 5375.719010] [<ffffffff8121db7e>] SyS_open+0x1e/0x20 [ 5375.719013] [<ffffffff81637976>] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.719015] [ 5375.719015] -> #0 (sb_writers){.+.+.+}: [ 5375.719018] [<ffffffff810d97ca>] __lock_acquire+0x17ba/0x1ae0 [ 5375.719021] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.719026] [<ffffffff810d3bef>] percpu_down_read+0x4f/0xa0 [ 5375.719028] [<ffffffff812214f7>] __sb_start_write+0xb7/0xf0 [ 5375.719031] [<ffffffff81242eb4>] mnt_want_write+0x24/0x50 [ 5375.719035] [<ffffffff8122ded2>] path_openat+0xd32/0x1360 [ 5375.719037] [<ffffffff8122f86e>] do_filp_open+0x7e/0xe0 [ 5375.719040] [<ffffffff8121d8a4>] file_open_name+0xe4/0x130 [ 5375.719043] [<ffffffff8121d923>] filp_open+0x33/0x60 [ 5375.719073] [<ffffffffa02776a6>] update_dev_time+0x16/0x40 [btrfs] [ 5375.719099] [<ffffffffa02825be>] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719123] [<ffffffffa0282665>] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719150] [<ffffffffa02c6c80>] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719175] [<ffffffffa02c729e>] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719199] [<ffffffffa02c7f58>] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719222] [<ffffffffa02464e6>] health_kthread+0x246/0x490 [btrfs] [ 5375.719225] [<ffffffff810a70df>] kthread+0xef/0x110 [ 5375.719229] [<ffffffff81637d2f>] ret_from_fork+0x3f/0x70 [ 5375.719230] [ 5375.719230] other info that might help us debug this: [ 5375.719230] [ 5375.719233] Chain exists of: [ 5375.719233] sb_writers --> namespace_sem --> &fs_devs->device_list_mutex [ 5375.719233] [ 5375.719234] Possible unsafe locking scenario: [ 5375.719234] [ 5375.719234] CPU0 CPU1 [ 5375.719235] ---- ---- [ 5375.719236] lock(&fs_devs->device_list_mutex); [ 5375.719238] lock(namespace_sem); [ 5375.719239] lock(&fs_devs->device_list_mutex); [ 5375.719241] lock(sb_writers); [ 5375.719241] [ 5375.719241] *** DEADLOCK *** [ 5375.719241] [ 5375.719243] 4 locks held by btrfs-health/4662: [ 5375.719266] #0: (&fs_info->health_mutex){+.+.+.}, at: [<ffffffffa0246303>] health_kthread+0x63/0x490 [btrfs] [ 5375.719293] #1: (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.+.}, at: [<ffffffffa02c6611>] btrfs_dev_replace_finishing+0x41/0x990 [btrfs] [ 5375.719319] #2: (uuid_mutex){+.+.+.}, at: [<ffffffffa0282620>] btrfs_destroy_dev_replace_tgtdev+0x20/0x150 [btrfs] [ 5375.719343] #3: (&fs_devs->device_list_mutex){+.+.+.}, at: [<ffffffffa028263c>] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.719343] [ 5375.719343] stack backtrace: [ 5375.719347] CPU: 2 PID: 4662 Comm: btrfs-health Not tainted 4.4.5-scst31x-debug-11+ #40 [ 5375.719348] Hardware name: Supermicro SYS-6018R-WTRT/X10DRW-iT, BIOS 1.0c 01/07/2015 [ 5375.719352] 0000000000000000 ffff880856f73880 ffffffff813529e3 ffffffff826182a0 [ 5375.719354] ffffffff8260c090 ffff880856f738c0 ffffffff810d667c ffff880856f73930 [ 5375.719357] ffff880861f32b40 ffff880861f32b68 0000000000000003 0000000000000004 [ 5375.719357] Call Trace: [ 5375.719363] [<ffffffff813529e3>] dump_stack+0x85/0xc2 [ 5375.719366] [<ffffffff810d667c>] print_circular_bug+0x1ec/0x260 [ 5375.719369] [<ffffffff810d97ca>] __lock_acquire+0x17ba/0x1ae0 [ 5375.719373] [<ffffffff810f606d>] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719376] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.719378] [<ffffffff812214f7>] ? __sb_start_write+0xb7/0xf0 [ 5375.719383] [<ffffffff810d3bef>] percpu_down_read+0x4f/0xa0 [ 5375.719385] [<ffffffff812214f7>] ? __sb_start_write+0xb7/0xf0 [ 5375.719387] [<ffffffff812214f7>] __sb_start_write+0xb7/0xf0 [ 5375.719389] [<ffffffff81242eb4>] mnt_want_write+0x24/0x50 [ 5375.719393] [<ffffffff8122ded2>] path_openat+0xd32/0x1360 [ 5375.719415] [<ffffffffa02462a0>] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719418] [<ffffffff810f606d>] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719420] [<ffffffff8122f86e>] do_filp_open+0x7e/0xe0 [ 5375.719423] [<ffffffff810f615d>] ? rcu_read_lock_sched_held+0x6d/0x80 [ 5375.719426] [<ffffffff81201a9b>] ? kmem_cache_alloc+0x26b/0x5d0 [ 5375.719430] [<ffffffff8122e7d4>] ? getname_kernel+0x34/0x120 [ 5375.719433] [<ffffffff8121d8a4>] file_open_name+0xe4/0x130 [ 5375.719436] [<ffffffff8121d923>] filp_open+0x33/0x60 [ 5375.719462] [<ffffffffa02776a6>] update_dev_time+0x16/0x40 [btrfs] [ 5375.719485] [<ffffffffa02825be>] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719506] [<ffffffffa0282665>] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719530] [<ffffffffa02c6c80>] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719554] [<ffffffffa02c6b23>] ? btrfs_dev_replace_finishing+0x553/0x990 [btrfs] [ 5375.719576] [<ffffffffa02c729e>] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719598] [<ffffffffa02c7f58>] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719621] [<ffffffffa02464e6>] health_kthread+0x246/0x490 [btrfs] [ 5375.719641] [<ffffffffa02463d8>] ? health_kthread+0x138/0x490 [btrfs] [ 5375.719661] [<ffffffffa02462a0>] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719663] [<ffffffff810a70df>] kthread+0xef/0x110 [ 5375.719666] [<ffffffff810a6ff0>] ? kthread_create_on_node+0x200/0x200 [ 5375.719669] [<ffffffff81637d2f>] ret_from_fork+0x3f/0x70 [ 5375.719672] [<ffffffff810a6ff0>] ? kthread_create_on_node+0x200/0x200 [ 5375.719697] ------------[ cut here ]------------ Signed-off-by: Anand Jain <anand.jain@oracle.com> Reported-by: Yauhen Kharuzhy <yauhen.kharuzhy@zavadatar.com> Signed-off-by: David Sterba <dsterba@suse.com>
2016-04-18 02:51:23 -06:00
fs_devices->num_devices--;
btrfs_assign_next_active_device(tgtdev, NULL);
list_del_rcu(&tgtdev->dev_list);
mutex_unlock(&fs_devices->device_list_mutex);
btrfs: fix lock dep warning, move scratch dev out of device_list_mutex and uuid_mutex When the replace target fails, the target device will be taken out of fs device list, scratch + update_dev_time and freed. However we could do the scratch + update_dev_time and free part after the device has been taken out of device list, so that we don't have to hold the device_list_mutex and uuid_mutex locks. Reported issue: [ 5375.718845] ====================================================== [ 5375.718846] [ INFO: possible circular locking dependency detected ] [ 5375.718849] 4.4.5-scst31x-debug-11+ #40 Not tainted [ 5375.718849] ------------------------------------------------------- [ 5375.718851] btrfs-health/4662 is trying to acquire lock: [ 5375.718861] (sb_writers){.+.+.+}, at: [<ffffffff812214f7>] __sb_start_write+0xb7/0xf0 [ 5375.718862] [ 5375.718862] but task is already holding lock: [ 5375.718907] (&fs_devs->device_list_mutex){+.+.+.}, at: [<ffffffffa028263c>] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.718907] [ 5375.718907] which lock already depends on the new lock. [ 5375.718907] [ 5375.718908] [ 5375.718908] the existing dependency chain (in reverse order) is: [ 5375.718911] [ 5375.718911] -> #3 (&fs_devs->device_list_mutex){+.+.+.}: [ 5375.718917] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.718921] [<ffffffff81633949>] mutex_lock_nested+0x69/0x3c0 [ 5375.718940] [<ffffffffa0219bf6>] btrfs_show_devname+0x36/0x210 [btrfs] [ 5375.718945] [<ffffffff81267079>] show_vfsmnt+0x49/0x150 [ 5375.718948] [<ffffffff81240b07>] m_show+0x17/0x20 [ 5375.718951] [<ffffffff81246868>] seq_read+0x2d8/0x3b0 [ 5375.718955] [<ffffffff8121df28>] __vfs_read+0x28/0xd0 [ 5375.718959] [<ffffffff8121e806>] vfs_read+0x86/0x130 [ 5375.718962] [<ffffffff8121f4c9>] SyS_read+0x49/0xa0 [ 5375.718966] [<ffffffff81637976>] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718968] [ 5375.718968] -> #2 (namespace_sem){+++++.}: [ 5375.718971] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.718974] [<ffffffff81635199>] down_write+0x49/0x80 [ 5375.718977] [<ffffffff81243593>] lock_mount+0x43/0x1c0 [ 5375.718979] [<ffffffff81243c13>] do_add_mount+0x23/0xd0 [ 5375.718982] [<ffffffff81244afb>] do_mount+0x27b/0xe30 [ 5375.718985] [<ffffffff812459dc>] SyS_mount+0x8c/0xd0 [ 5375.718988] [<ffffffff81637976>] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718991] [ 5375.718991] -> #1 (&sb->s_type->i_mutex_key#5){+.+.+.}: [ 5375.718994] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.718996] [<ffffffff81633949>] mutex_lock_nested+0x69/0x3c0 [ 5375.719001] [<ffffffff8122d608>] path_openat+0x468/0x1360 [ 5375.719004] [<ffffffff8122f86e>] do_filp_open+0x7e/0xe0 [ 5375.719007] [<ffffffff8121da7b>] do_sys_open+0x12b/0x210 [ 5375.719010] [<ffffffff8121db7e>] SyS_open+0x1e/0x20 [ 5375.719013] [<ffffffff81637976>] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.719015] [ 5375.719015] -> #0 (sb_writers){.+.+.+}: [ 5375.719018] [<ffffffff810d97ca>] __lock_acquire+0x17ba/0x1ae0 [ 5375.719021] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.719026] [<ffffffff810d3bef>] percpu_down_read+0x4f/0xa0 [ 5375.719028] [<ffffffff812214f7>] __sb_start_write+0xb7/0xf0 [ 5375.719031] [<ffffffff81242eb4>] mnt_want_write+0x24/0x50 [ 5375.719035] [<ffffffff8122ded2>] path_openat+0xd32/0x1360 [ 5375.719037] [<ffffffff8122f86e>] do_filp_open+0x7e/0xe0 [ 5375.719040] [<ffffffff8121d8a4>] file_open_name+0xe4/0x130 [ 5375.719043] [<ffffffff8121d923>] filp_open+0x33/0x60 [ 5375.719073] [<ffffffffa02776a6>] update_dev_time+0x16/0x40 [btrfs] [ 5375.719099] [<ffffffffa02825be>] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719123] [<ffffffffa0282665>] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719150] [<ffffffffa02c6c80>] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719175] [<ffffffffa02c729e>] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719199] [<ffffffffa02c7f58>] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719222] [<ffffffffa02464e6>] health_kthread+0x246/0x490 [btrfs] [ 5375.719225] [<ffffffff810a70df>] kthread+0xef/0x110 [ 5375.719229] [<ffffffff81637d2f>] ret_from_fork+0x3f/0x70 [ 5375.719230] [ 5375.719230] other info that might help us debug this: [ 5375.719230] [ 5375.719233] Chain exists of: [ 5375.719233] sb_writers --> namespace_sem --> &fs_devs->device_list_mutex [ 5375.719233] [ 5375.719234] Possible unsafe locking scenario: [ 5375.719234] [ 5375.719234] CPU0 CPU1 [ 5375.719235] ---- ---- [ 5375.719236] lock(&fs_devs->device_list_mutex); [ 5375.719238] lock(namespace_sem); [ 5375.719239] lock(&fs_devs->device_list_mutex); [ 5375.719241] lock(sb_writers); [ 5375.719241] [ 5375.719241] *** DEADLOCK *** [ 5375.719241] [ 5375.719243] 4 locks held by btrfs-health/4662: [ 5375.719266] #0: (&fs_info->health_mutex){+.+.+.}, at: [<ffffffffa0246303>] health_kthread+0x63/0x490 [btrfs] [ 5375.719293] #1: (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.+.}, at: [<ffffffffa02c6611>] btrfs_dev_replace_finishing+0x41/0x990 [btrfs] [ 5375.719319] #2: (uuid_mutex){+.+.+.}, at: [<ffffffffa0282620>] btrfs_destroy_dev_replace_tgtdev+0x20/0x150 [btrfs] [ 5375.719343] #3: (&fs_devs->device_list_mutex){+.+.+.}, at: [<ffffffffa028263c>] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.719343] [ 5375.719343] stack backtrace: [ 5375.719347] CPU: 2 PID: 4662 Comm: btrfs-health Not tainted 4.4.5-scst31x-debug-11+ #40 [ 5375.719348] Hardware name: Supermicro SYS-6018R-WTRT/X10DRW-iT, BIOS 1.0c 01/07/2015 [ 5375.719352] 0000000000000000 ffff880856f73880 ffffffff813529e3 ffffffff826182a0 [ 5375.719354] ffffffff8260c090 ffff880856f738c0 ffffffff810d667c ffff880856f73930 [ 5375.719357] ffff880861f32b40 ffff880861f32b68 0000000000000003 0000000000000004 [ 5375.719357] Call Trace: [ 5375.719363] [<ffffffff813529e3>] dump_stack+0x85/0xc2 [ 5375.719366] [<ffffffff810d667c>] print_circular_bug+0x1ec/0x260 [ 5375.719369] [<ffffffff810d97ca>] __lock_acquire+0x17ba/0x1ae0 [ 5375.719373] [<ffffffff810f606d>] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719376] [<ffffffff810da4be>] lock_acquire+0xce/0x1e0 [ 5375.719378] [<ffffffff812214f7>] ? __sb_start_write+0xb7/0xf0 [ 5375.719383] [<ffffffff810d3bef>] percpu_down_read+0x4f/0xa0 [ 5375.719385] [<ffffffff812214f7>] ? __sb_start_write+0xb7/0xf0 [ 5375.719387] [<ffffffff812214f7>] __sb_start_write+0xb7/0xf0 [ 5375.719389] [<ffffffff81242eb4>] mnt_want_write+0x24/0x50 [ 5375.719393] [<ffffffff8122ded2>] path_openat+0xd32/0x1360 [ 5375.719415] [<ffffffffa02462a0>] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719418] [<ffffffff810f606d>] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719420] [<ffffffff8122f86e>] do_filp_open+0x7e/0xe0 [ 5375.719423] [<ffffffff810f615d>] ? rcu_read_lock_sched_held+0x6d/0x80 [ 5375.719426] [<ffffffff81201a9b>] ? kmem_cache_alloc+0x26b/0x5d0 [ 5375.719430] [<ffffffff8122e7d4>] ? getname_kernel+0x34/0x120 [ 5375.719433] [<ffffffff8121d8a4>] file_open_name+0xe4/0x130 [ 5375.719436] [<ffffffff8121d923>] filp_open+0x33/0x60 [ 5375.719462] [<ffffffffa02776a6>] update_dev_time+0x16/0x40 [btrfs] [ 5375.719485] [<ffffffffa02825be>] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719506] [<ffffffffa0282665>] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719530] [<ffffffffa02c6c80>] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719554] [<ffffffffa02c6b23>] ? btrfs_dev_replace_finishing+0x553/0x990 [btrfs] [ 5375.719576] [<ffffffffa02c729e>] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719598] [<ffffffffa02c7f58>] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719621] [<ffffffffa02464e6>] health_kthread+0x246/0x490 [btrfs] [ 5375.719641] [<ffffffffa02463d8>] ? health_kthread+0x138/0x490 [btrfs] [ 5375.719661] [<ffffffffa02462a0>] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719663] [<ffffffff810a70df>] kthread+0xef/0x110 [ 5375.719666] [<ffffffff810a6ff0>] ? kthread_create_on_node+0x200/0x200 [ 5375.719669] [<ffffffff81637d2f>] ret_from_fork+0x3f/0x70 [ 5375.719672] [<ffffffff810a6ff0>] ? kthread_create_on_node+0x200/0x200 [ 5375.719697] ------------[ cut here ]------------ Signed-off-by: Anand Jain <anand.jain@oracle.com> Reported-by: Yauhen Kharuzhy <yauhen.kharuzhy@zavadatar.com> Signed-off-by: David Sterba <dsterba@suse.com>
2016-04-18 02:51:23 -06:00
/*
* The update_dev_time() with in btrfs_scratch_superblocks()
* may lead to a call to btrfs_show_devname() which will try
* to hold device_list_mutex. And here this device
* is already out of device list, so we don't have to hold
* the device_list_mutex lock.
*/
btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
btrfs_close_bdev(tgtdev);
synchronize_rcu();
btrfs_free_device(tgtdev);
}
static struct btrfs_device *btrfs_find_device_by_path(
struct btrfs_fs_info *fs_info, const char *device_path)
{
int ret = 0;
struct btrfs_super_block *disk_super;
u64 devid;
u8 *dev_uuid;
struct block_device *bdev;
struct buffer_head *bh;
struct btrfs_device *device;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
fs_info->bdev_holder, 0, &bdev, &bh);
if (ret)
return ERR_PTR(ret);
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
disk_super->metadata_uuid, true);
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
else
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
disk_super->fsid, true);
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
brelse(bh);
if (!device)
device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
return device;
}
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
struct btrfs_device *btrfs_find_device_by_devspec(
struct btrfs_fs_info *fs_info, u64 devid,
const char *device_path)
{
struct btrfs_device *device;
if (devid) {
device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
NULL, true);
if (!device)
return ERR_PTR(-ENOENT);
return device;
}
if (!device_path || !device_path[0])
return ERR_PTR(-EINVAL);
if (strcmp(device_path, "missing") == 0) {
/* Find first missing device */
list_for_each_entry(device, &fs_info->fs_devices->devices,
dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&device->dev_state) && !device->bdev)
return device;
}
return ERR_PTR(-ENOENT);
}
return btrfs_find_device_by_path(fs_info, device_path);
}
/*
* does all the dirty work required for changing file system's UUID.
*/
static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
struct btrfs_super_block *disk_super = fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
lockdep_assert_held(&uuid_mutex);
if (!fs_devices->seeding)
return -EINVAL;
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
return PTR_ERR(old_devices);
}
list_add(&old_devices->fs_list, &fs_uuids);
memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
seed_devices->opened = 1;
INIT_LIST_HEAD(&seed_devices->devices);
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
mutex_lock(&fs_devices->device_list_mutex);
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
mutex_lock(&fs_info->chunk_mutex);
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
mutex_unlock(&fs_info->chunk_mutex);
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
fs_devices->rotating = 0;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
mutex_unlock(&fs_devices->device_list_mutex);
Btrfs: fix race conditions in BTRFS_IOC_FS_INFO ioctl The handler for the ioctl BTRFS_IOC_FS_INFO was reading the number of devices before acquiring the device list mutex. This could lead to inconsistent results because the update of the device list and the number of devices counter (amongst other counters related to the device list) are updated in volumes.c while holding the device list mutex - except for 2 places, one was volumes.c:btrfs_prepare_sprout() and the other was volumes.c:device_list_add(). For example, if we have 2 devices, with IDs 1 and 2 and then add a new device, with ID 3, and while adding the device is in progress an BTRFS_IOC_FS_INFO ioctl arrives, it could return a number of devices of 2 and a max dev id of 3. This would be incorrect. Also, this ioctl handler was reading the fsid while it can be updated concurrently. This can happen when while a new device is being added and the current filesystem is in seeding mode. Example: $ mkfs.btrfs -f /dev/sdb1 $ mkfs.btrfs -f /dev/sdb2 $ btrfstune -S 1 /dev/sdb1 $ mount /dev/sdb1 /mnt/test $ btrfs device add /dev/sdb2 /mnt/test If during the last step a BTRFS_IOC_FS_INFO ioctl was requested, it could read an fsid that was never valid (some bits part of the old fsid and others part of the new fsid). Also, it could read a number of devices that doesn't match the number of devices in the list and the max device id, as explained before. Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-12 13:56:58 -06:00
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
btrfs_set_super_flags(disk_super, super_flags);
return 0;
}
/*
* Store the expected generation for seed devices in device items.
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dev_item *dev_item;
struct btrfs_device *device;
struct btrfs_key key;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
u64 devid;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
goto error;
leaf = path->nodes[0];
next_slot:
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret > 0)
break;
if (ret < 0)
goto error;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
continue;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
key.type != BTRFS_DEV_ITEM_KEY)
break;
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
fs_uuid, true);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
device->generation);
btrfs_mark_buffer_dirty(leaf);
}
path->slots[0]++;
goto next_slot;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
struct btrfs_root *root = fs_info->dev_root;
struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
struct super_block *sb = fs_info->sb;
struct rcu_string *name;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 orig_super_total_bytes;
u64 orig_super_num_devices;
int seeding_dev = 0;
int ret = 0;
bool unlocked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
if (fs_devices->seeding) {
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
mutex_unlock(
&fs_devices->device_list_mutex);
goto error;
}
}
mutex_unlock(&fs_devices->device_list_mutex);
device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
goto error_free_device;
}
rcu_assign_pointer(device->name, name);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto error_free_device;
}
q = bdev_get_queue(bdev);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
device->total_bytes = round_down(i_size_read(bdev->bd_inode),
fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
device->fs_info = fs_info;
device->bdev = bdev;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-27 14:05:09 -07:00
sb->s_flags &= ~SB_RDONLY;
ret = btrfs_prepare_sprout(fs_info);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
}
device->fs_devices = fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
fs_devices->num_devices++;
fs_devices->open_devices++;
fs_devices->rw_devices++;
fs_devices->total_devices++;
fs_devices->total_rw_bytes += device->total_bytes;
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
if (!blk_queue_nonrot(q))
fs_devices->rotating = 1;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
btrfs_set_super_total_bytes(fs_info->super_copy,
round_down(orig_super_total_bytes + device->total_bytes,
fs_info->sectorsize));
orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices + 1);
/*
* we've got more storage, clear any full flags on the space
* infos
*/
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
btrfs: sysfs: init devices outside of the chunk_mutex [ Upstream commit ca10845a56856fff4de3804c85e6424d0f6d0cde ] While running btrfs/061, btrfs/073, btrfs/078, or btrfs/178 we hit the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.9.0-rc3+ #4 Not tainted ------------------------------------------------------ kswapd0/100 is trying to acquire lock: ffff96ecc22ef4a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 but task is already holding lock: ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x65/0x80 slab_pre_alloc_hook.constprop.0+0x20/0x200 kmem_cache_alloc+0x37/0x270 alloc_inode+0x82/0xb0 iget_locked+0x10d/0x2c0 kernfs_get_inode+0x1b/0x130 kernfs_get_tree+0x136/0x240 sysfs_get_tree+0x16/0x40 vfs_get_tree+0x28/0xc0 path_mount+0x434/0xc00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (kernfs_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 kernfs_add_one+0x23/0x150 kernfs_create_link+0x63/0xa0 sysfs_do_create_link_sd+0x5e/0xd0 btrfs_sysfs_add_devices_dir+0x81/0x130 btrfs_init_new_device+0x67f/0x1250 btrfs_ioctl+0x1ef/0x2e20 __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 btrfs_chunk_alloc+0x125/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_insert_empty_items+0x64/0xb0 btrfs_insert_delayed_items+0x90/0x4f0 btrfs_commit_inode_delayed_items+0x93/0x140 btrfs_log_inode+0x5de/0x2020 btrfs_log_inode_parent+0x429/0xc90 btrfs_log_new_name+0x95/0x9b btrfs_rename2+0xbb9/0x1800 vfs_rename+0x64f/0x9f0 do_renameat2+0x320/0x4e0 __x64_sys_rename+0x1f/0x30 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 __mutex_lock+0x7e/0x7e0 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 kthread+0x138/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> kernfs_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(kernfs_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/100: #0: ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffff8dd65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 #2: ffff96ed2ade30e0 (&type->s_umount_key#36){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 stack backtrace: CPU: 0 PID: 100 Comm: kswapd0 Not tainted 5.9.0-rc3+ #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x8b/0xb8 check_noncircular+0x12d/0x150 __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 __mutex_lock+0x7e/0x7e0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? lock_acquire+0xa7/0x3d0 ? find_held_lock+0x2b/0x80 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 ? _raw_spin_unlock_irqrestore+0x41/0x50 ? add_wait_queue_exclusive+0x70/0x70 ? balance_pgdat+0x670/0x670 kthread+0x138/0x160 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 This happens because we are holding the chunk_mutex at the time of adding in a new device. However we only need to hold the device_list_mutex, as we're going to iterate over the fs_devices devices. Move the sysfs init stuff outside of the chunk_mutex to get rid of this lockdep splat. CC: stable@vger.kernel.org # 4.4.x: f3cd2c58110dad14e: btrfs: sysfs, rename device_link add/remove functions CC: stable@vger.kernel.org # 4.4.x Reported-by: David Sterba <dsterba@suse.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-09-01 06:09:01 -06:00
/* Add sysfs device entry */
btrfs_sysfs_add_device_link(fs_devices, device);
mutex_unlock(&fs_devices->device_list_mutex);
if (seeding_dev) {
mutex_lock(&fs_info->chunk_mutex);
ret = init_first_rw_device(trans);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
}
ret = btrfs_add_dev_item(trans, device);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
if (seeding_dev) {
ret = btrfs_finish_sprout(trans);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto error_sysfs;
}
btrfs_sysfs_update_sprout_fsid(fs_devices,
fs_info->fs_devices->fsid);
}
ret = btrfs_commit_transaction(trans);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
unlocked = true;
if (ret) /* transaction commit */
return ret;
ret = btrfs_relocate_sys_chunks(fs_info);
if (ret < 0)
btrfs_handle_fs_error(fs_info, ret,
"Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) == -ENOENT)
return 0;
ret = PTR_ERR(trans);
trans = NULL;
goto error_sysfs;
}
ret = btrfs_commit_transaction(trans);
}
/*
* Now that we have written a new super block to this device, check all
* other fs_devices list if device_path alienates any other scanned
* device.
* We can ignore the return value as it typically returns -EINVAL and
* only succeeds if the device was an alien.
*/
btrfs_forget_devices(device_path);
/* Update ctime/mtime for blkid or udev */
update_dev_time(device_path);
return ret;
error_sysfs:
btrfs_sysfs_rm_device_link(fs_devices, device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
list_del(&device->dev_alloc_list);
fs_info->fs_devices->num_devices--;
fs_info->fs_devices->open_devices--;
fs_info->fs_devices->rw_devices--;
fs_info->fs_devices->total_devices--;
fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
btrfs_set_super_total_bytes(fs_info->super_copy,
orig_super_total_bytes);
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
if (seeding_dev)
Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-27 14:05:09 -07:00
sb->s_flags |= SB_RDONLY;
if (trans)
btrfs_end_transaction(trans);
error_free_device:
btrfs_free_device(device);
error:
block: make blkdev_get/put() handle exclusive access Over time, block layer has accumulated a set of APIs dealing with bdev open, close, claim and release. * blkdev_get/put() are the primary open and close functions. * bd_claim/release() deal with exclusive open. * open/close_bdev_exclusive() are combination of open and claim and the other way around, respectively. * bd_link/unlink_disk_holder() to create and remove holder/slave symlinks. * open_by_devnum() wraps bdget() + blkdev_get(). The interface is a bit confusing and the decoupling of open and claim makes it impossible to properly guarantee exclusive access as in-kernel open + claim sequence can disturb the existing exclusive open even before the block layer knows the current open if for another exclusive access. Reorganize the interface such that, * blkdev_get() is extended to include exclusive access management. @holder argument is added and, if is @FMODE_EXCL specified, it will gain exclusive access atomically w.r.t. other exclusive accesses. * blkdev_put() is similarly extended. It now takes @mode argument and if @FMODE_EXCL is set, it releases an exclusive access. Also, when the last exclusive claim is released, the holder/slave symlinks are removed automatically. * bd_claim/release() and close_bdev_exclusive() are no longer necessary and either made static or removed. * bd_link_disk_holder() remains the same but bd_unlink_disk_holder() is no longer necessary and removed. * open_bdev_exclusive() becomes a simple wrapper around lookup_bdev() and blkdev_get(). It also has an unexpected extra bdev_read_only() test which probably should be moved into blkdev_get(). * open_by_devnum() is modified to take @holder argument and pass it to blkdev_get(). Most of bdev open/close operations are unified into blkdev_get/put() and most exclusive accesses are tested atomically at the open time (as it should). This cleans up code and removes some, both valid and invalid, but unnecessary all the same, corner cases. open_bdev_exclusive() and open_by_devnum() can use further cleanup - rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop special features. Well, let's leave them for another day. Most conversions are straight-forward. drbd conversion is a bit more involved as there was some reordering, but the logic should stay the same. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Neil Brown <neilb@suse.de> Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Acked-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Philipp Reisner <philipp.reisner@linbit.com> Cc: Peter Osterlund <petero2@telia.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Jan Kara <jack@suse.cz> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <joel.becker@oracle.com> Cc: Alex Elder <aelder@sgi.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: dm-devel@redhat.com Cc: drbd-dev@lists.linbit.com Cc: Leo Chen <leochen@broadcom.com> Cc: Scott Branden <sbranden@broadcom.com> Cc: Chris Mason <chris.mason@oracle.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@linux.vnet.ibm.com> Cc: Joern Engel <joern@logfs.org> Cc: reiserfs-devel@vger.kernel.org Cc: Alexander Viro <viro@zeniv.linux.org.uk>
2010-11-13 03:55:17 -07:00
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev && !unlocked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
return ret;
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_root *root = device->fs_info->chunk_root;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
btrfs_set_device_id(leaf, dev_item, device->devid);
btrfs_set_device_type(leaf, dev_item, device->type);
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
btrfs_set_device_total_bytes(leaf, dev_item,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
}
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
new_size = round_down(new_size, fs_info->sectorsize);
mutex_lock(&fs_info->chunk_mutex);
old_total = btrfs_super_total_bytes(super_copy);
diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
if (new_size <= device->total_bytes ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
mutex_unlock(&fs_info->chunk_mutex);
return -EINVAL;
}
btrfs_set_super_total_bytes(super_copy,
round_down(old_total + diff, fs_info->sectorsize));
device->fs_devices->total_rw_bytes += diff;
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
btrfs_clear_space_info_full(device->fs_info);
if (list_empty(&device->post_commit_list))
list_add_tail(&device->post_commit_list,
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
return btrfs_update_device(trans, device);
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = chunk_offset;
key.type = BTRFS_CHUNK_ITEM_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
btrfs_handle_fs_error(fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
btrfs_handle_fs_error(fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
return ret;
}
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
u32 cur;
struct btrfs_key key;
mutex_lock(&fs_info->chunk_mutex);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
cur = 0;
while (cur < array_size) {
disk_key = (struct btrfs_disk_key *)ptr;
btrfs_disk_key_to_cpu(&key, disk_key);
len = sizeof(*disk_key);
if (key.type == BTRFS_CHUNK_ITEM_KEY) {
chunk = (struct btrfs_chunk *)(ptr + len);
num_stripes = btrfs_stack_chunk_num_stripes(chunk);
len += btrfs_chunk_item_size(num_stripes);
} else {
ret = -EIO;
break;
}
if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
key.offset == chunk_offset) {
memmove(ptr, ptr + len, array_size - (cur + len));
array_size -= len;
btrfs_set_super_sys_array_size(super_copy, array_size);
} else {
ptr += len;
cur += len;
}
}
mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
/*
* btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
* @logical: Logical block offset in bytes.
* @length: Length of extent in bytes.
*
* Return: Chunk mapping or ERR_PTR.
*/
struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, length);
read_unlock(&em_tree->lock);
if (!em) {
btrfs_crit(fs_info, "unable to find logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info,
"found a bad mapping, wanted %llu-%llu, found %llu-%llu",
logical, length, em->start, em->start + em->len);
free_extent_map(em);
return ERR_PTR(-EINVAL);
}
/* callers are responsible for dropping em's ref. */
return em;
}
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_map *em;
struct map_lookup *map;
u64 dev_extent_len = 0;
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
return PTR_ERR(em);
}
map = em->map_lookup;
mutex_lock(&fs_info->chunk_mutex);
check_system_chunk(trans, map->type);
mutex_unlock(&fs_info->chunk_mutex);
Btrfs: fix race between device replace and block group removal When it's finishing, the device replace code iterates all extent maps representing block group and for each one that has a stripe that refers to the source device, it replaces its device with the target device. However when it replaces the source device with the target device it, the target device still has an ID of 0ULL (BTRFS_DEV_REPLACE_DEVID), only after its ID is changed to match the one from the source device. This leads to races with the chunk removal code that can temporarly see a device with an ID of 0ULL and then attempt to use that ID to remove items from the device tree and fail, causing a transaction abort: [ 9238.594364] BTRFS info (device sdf): dev_replace from /dev/sdf (devid 3) to /dev/sde finished [ 9238.594377] ------------[ cut here ]------------ [ 9238.594402] WARNING: CPU: 14 PID: 21566 at fs/btrfs/volumes.c:2771 btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594403] BTRFS: Transaction aborted (error 1) [ 9238.594416] Modules linked in: btrfs crc32c_generic acpi_cpufreq xor tpm_tis tpm raid6_pq ppdev parport_pc processor psmouse parport i2c_piix4 evdev sg i2c_core se rio_raw pcspkr button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix virtio_pci libata virtio_ring virtio e1000 scsi_mod fl oppy [last unloaded: btrfs] [ 9238.594418] CPU: 14 PID: 21566 Comm: btrfs-cleaner Not tainted 4.6.0-rc7-btrfs-next-29+ #1 [ 9238.594419] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 9238.594421] 0000000000000000 ffff88017f1dbc60 ffffffff8126b42c ffff88017f1dbcb0 [ 9238.594422] 0000000000000000 ffff88017f1dbca0 ffffffff81052b14 00000ad37f1dbd18 [ 9238.594423] 0000000000000001 ffff88018068a558 ffff88005c4b9c00 ffff880233f60db0 [ 9238.594424] Call Trace: [ 9238.594428] [<ffffffff8126b42c>] dump_stack+0x67/0x90 [ 9238.594430] [<ffffffff81052b14>] __warn+0xc2/0xdd [ 9238.594432] [<ffffffff81052b7a>] warn_slowpath_fmt+0x4b/0x53 [ 9238.594434] [<ffffffff8116c311>] ? kmem_cache_free+0x128/0x188 [ 9238.594450] [<ffffffffa04d43f5>] btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594452] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc [ 9238.594464] [<ffffffffa04a26fa>] btrfs_delete_unused_bgs+0x317/0x382 [btrfs] [ 9238.594476] [<ffffffffa04a961d>] cleaner_kthread+0x1ad/0x1c7 [btrfs] [ 9238.594489] [<ffffffffa04a9470>] ? btree_invalidatepage+0x8e/0x8e [btrfs] [ 9238.594490] [<ffffffff8106f403>] kthread+0xd4/0xdc [ 9238.594494] [<ffffffff8149e242>] ret_from_fork+0x22/0x40 [ 9238.594495] [<ffffffff8106f32f>] ? kthread_stop+0x286/0x286 [ 9238.594496] ---[ end trace 183efbe50275f059 ]--- The sequence of steps leading to this is like the following: CPU 1 CPU 2 btrfs_dev_replace_finishing() at this point dev_replace->tgtdev->devid == BTRFS_DEV_REPLACE_DEVID (0ULL) ... btrfs_start_transaction() btrfs_commit_transaction() btrfs_delete_unused_bgs() btrfs_remove_chunk() looks up for the extent map corresponding to the chunk lock_chunks() (chunk_mutex) check_system_chunk() unlock_chunks() (chunk_mutex) locks fs_info->chunk_mutex btrfs_dev_replace_update_device_in_mapping_tree() --> iterates fs_info->mapping_tree and replaces the device in every extent map's map->stripes[] with dev_replace->tgtdev, which still has an id of 0ULL (BTRFS_DEV_REPLACE_DEVID) iterates over all stripes from the extent map --> calls btrfs_free_dev_extent() passing it the target device that still has an ID of 0ULL --> btrfs_free_dev_extent() fails --> aborts current transaction finishes setting up the target device, namely it sets tgtdev->devid to the value of srcdev->devid (which is necessarily > 0) frees the srcdev unlocks fs_info->chunk_mutex So fix this by taking the device list mutex while processing the stripes for the chunk's extent map. This is similar to the race between device replace and block group creation that was fixed by commit 50460e37186a ("Btrfs: fix race when finishing dev replace leading to transaction abort"). Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Josef Bacik <jbacik@fb.com>
2016-05-19 21:34:23 -06:00
/*
* Take the device list mutex to prevent races with the final phase of
* a device replace operation that replaces the device object associated
* with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
*/
mutex_lock(&fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
if (ret) {
Btrfs: fix race between device replace and block group removal When it's finishing, the device replace code iterates all extent maps representing block group and for each one that has a stripe that refers to the source device, it replaces its device with the target device. However when it replaces the source device with the target device it, the target device still has an ID of 0ULL (BTRFS_DEV_REPLACE_DEVID), only after its ID is changed to match the one from the source device. This leads to races with the chunk removal code that can temporarly see a device with an ID of 0ULL and then attempt to use that ID to remove items from the device tree and fail, causing a transaction abort: [ 9238.594364] BTRFS info (device sdf): dev_replace from /dev/sdf (devid 3) to /dev/sde finished [ 9238.594377] ------------[ cut here ]------------ [ 9238.594402] WARNING: CPU: 14 PID: 21566 at fs/btrfs/volumes.c:2771 btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594403] BTRFS: Transaction aborted (error 1) [ 9238.594416] Modules linked in: btrfs crc32c_generic acpi_cpufreq xor tpm_tis tpm raid6_pq ppdev parport_pc processor psmouse parport i2c_piix4 evdev sg i2c_core se rio_raw pcspkr button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix virtio_pci libata virtio_ring virtio e1000 scsi_mod fl oppy [last unloaded: btrfs] [ 9238.594418] CPU: 14 PID: 21566 Comm: btrfs-cleaner Not tainted 4.6.0-rc7-btrfs-next-29+ #1 [ 9238.594419] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 9238.594421] 0000000000000000 ffff88017f1dbc60 ffffffff8126b42c ffff88017f1dbcb0 [ 9238.594422] 0000000000000000 ffff88017f1dbca0 ffffffff81052b14 00000ad37f1dbd18 [ 9238.594423] 0000000000000001 ffff88018068a558 ffff88005c4b9c00 ffff880233f60db0 [ 9238.594424] Call Trace: [ 9238.594428] [<ffffffff8126b42c>] dump_stack+0x67/0x90 [ 9238.594430] [<ffffffff81052b14>] __warn+0xc2/0xdd [ 9238.594432] [<ffffffff81052b7a>] warn_slowpath_fmt+0x4b/0x53 [ 9238.594434] [<ffffffff8116c311>] ? kmem_cache_free+0x128/0x188 [ 9238.594450] [<ffffffffa04d43f5>] btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594452] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc [ 9238.594464] [<ffffffffa04a26fa>] btrfs_delete_unused_bgs+0x317/0x382 [btrfs] [ 9238.594476] [<ffffffffa04a961d>] cleaner_kthread+0x1ad/0x1c7 [btrfs] [ 9238.594489] [<ffffffffa04a9470>] ? btree_invalidatepage+0x8e/0x8e [btrfs] [ 9238.594490] [<ffffffff8106f403>] kthread+0xd4/0xdc [ 9238.594494] [<ffffffff8149e242>] ret_from_fork+0x22/0x40 [ 9238.594495] [<ffffffff8106f32f>] ? kthread_stop+0x286/0x286 [ 9238.594496] ---[ end trace 183efbe50275f059 ]--- The sequence of steps leading to this is like the following: CPU 1 CPU 2 btrfs_dev_replace_finishing() at this point dev_replace->tgtdev->devid == BTRFS_DEV_REPLACE_DEVID (0ULL) ... btrfs_start_transaction() btrfs_commit_transaction() btrfs_delete_unused_bgs() btrfs_remove_chunk() looks up for the extent map corresponding to the chunk lock_chunks() (chunk_mutex) check_system_chunk() unlock_chunks() (chunk_mutex) locks fs_info->chunk_mutex btrfs_dev_replace_update_device_in_mapping_tree() --> iterates fs_info->mapping_tree and replaces the device in every extent map's map->stripes[] with dev_replace->tgtdev, which still has an id of 0ULL (BTRFS_DEV_REPLACE_DEVID) iterates over all stripes from the extent map --> calls btrfs_free_dev_extent() passing it the target device that still has an ID of 0ULL --> btrfs_free_dev_extent() fails --> aborts current transaction finishes setting up the target device, namely it sets tgtdev->devid to the value of srcdev->devid (which is necessarily > 0) frees the srcdev unlocks fs_info->chunk_mutex So fix this by taking the device list mutex while processing the stripes for the chunk's extent map. This is similar to the race between device replace and block group creation that was fixed by commit 50460e37186a ("Btrfs: fix race when finishing dev replace leading to transaction abort"). Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Josef Bacik <jbacik@fb.com>
2016-05-19 21:34:23 -06:00
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
}
if (device->bytes_used > 0) {
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
}
ret = btrfs_update_device(trans, device);
if (ret) {
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
goto out;
}
}
Btrfs: fix race between device replace and block group removal When it's finishing, the device replace code iterates all extent maps representing block group and for each one that has a stripe that refers to the source device, it replaces its device with the target device. However when it replaces the source device with the target device it, the target device still has an ID of 0ULL (BTRFS_DEV_REPLACE_DEVID), only after its ID is changed to match the one from the source device. This leads to races with the chunk removal code that can temporarly see a device with an ID of 0ULL and then attempt to use that ID to remove items from the device tree and fail, causing a transaction abort: [ 9238.594364] BTRFS info (device sdf): dev_replace from /dev/sdf (devid 3) to /dev/sde finished [ 9238.594377] ------------[ cut here ]------------ [ 9238.594402] WARNING: CPU: 14 PID: 21566 at fs/btrfs/volumes.c:2771 btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594403] BTRFS: Transaction aborted (error 1) [ 9238.594416] Modules linked in: btrfs crc32c_generic acpi_cpufreq xor tpm_tis tpm raid6_pq ppdev parport_pc processor psmouse parport i2c_piix4 evdev sg i2c_core se rio_raw pcspkr button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix virtio_pci libata virtio_ring virtio e1000 scsi_mod fl oppy [last unloaded: btrfs] [ 9238.594418] CPU: 14 PID: 21566 Comm: btrfs-cleaner Not tainted 4.6.0-rc7-btrfs-next-29+ #1 [ 9238.594419] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 9238.594421] 0000000000000000 ffff88017f1dbc60 ffffffff8126b42c ffff88017f1dbcb0 [ 9238.594422] 0000000000000000 ffff88017f1dbca0 ffffffff81052b14 00000ad37f1dbd18 [ 9238.594423] 0000000000000001 ffff88018068a558 ffff88005c4b9c00 ffff880233f60db0 [ 9238.594424] Call Trace: [ 9238.594428] [<ffffffff8126b42c>] dump_stack+0x67/0x90 [ 9238.594430] [<ffffffff81052b14>] __warn+0xc2/0xdd [ 9238.594432] [<ffffffff81052b7a>] warn_slowpath_fmt+0x4b/0x53 [ 9238.594434] [<ffffffff8116c311>] ? kmem_cache_free+0x128/0x188 [ 9238.594450] [<ffffffffa04d43f5>] btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594452] [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc [ 9238.594464] [<ffffffffa04a26fa>] btrfs_delete_unused_bgs+0x317/0x382 [btrfs] [ 9238.594476] [<ffffffffa04a961d>] cleaner_kthread+0x1ad/0x1c7 [btrfs] [ 9238.594489] [<ffffffffa04a9470>] ? btree_invalidatepage+0x8e/0x8e [btrfs] [ 9238.594490] [<ffffffff8106f403>] kthread+0xd4/0xdc [ 9238.594494] [<ffffffff8149e242>] ret_from_fork+0x22/0x40 [ 9238.594495] [<ffffffff8106f32f>] ? kthread_stop+0x286/0x286 [ 9238.594496] ---[ end trace 183efbe50275f059 ]--- The sequence of steps leading to this is like the following: CPU 1 CPU 2 btrfs_dev_replace_finishing() at this point dev_replace->tgtdev->devid == BTRFS_DEV_REPLACE_DEVID (0ULL) ... btrfs_start_transaction() btrfs_commit_transaction() btrfs_delete_unused_bgs() btrfs_remove_chunk() looks up for the extent map corresponding to the chunk lock_chunks() (chunk_mutex) check_system_chunk() unlock_chunks() (chunk_mutex) locks fs_info->chunk_mutex btrfs_dev_replace_update_device_in_mapping_tree() --> iterates fs_info->mapping_tree and replaces the device in every extent map's map->stripes[] with dev_replace->tgtdev, which still has an id of 0ULL (BTRFS_DEV_REPLACE_DEVID) iterates over all stripes from the extent map --> calls btrfs_free_dev_extent() passing it the target device that still has an ID of 0ULL --> btrfs_free_dev_extent() fails --> aborts current transaction finishes setting up the target device, namely it sets tgtdev->devid to the value of srcdev->devid (which is necessarily > 0) frees the srcdev unlocks fs_info->chunk_mutex So fix this by taking the device list mutex while processing the stripes for the chunk's extent map. This is similar to the race between device replace and block group creation that was fixed by commit 50460e37186a ("Btrfs: fix race when finishing dev replace leading to transaction abort"). Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Josef Bacik <jbacik@fb.com>
2016-05-19 21:34:23 -06:00
mutex_unlock(&fs_devices->device_list_mutex);
ret = btrfs_free_chunk(trans, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
Btrfs: add initial tracepoint support for btrfs Tracepoints can provide insight into why btrfs hits bugs and be greatly helpful for debugging, e.g dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0 dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0 btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0) btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0) btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8 flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0) flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0) flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0) Here is what I have added: 1) ordere_extent: btrfs_ordered_extent_add btrfs_ordered_extent_remove btrfs_ordered_extent_start btrfs_ordered_extent_put These provide critical information to understand how ordered_extents are updated. 2) extent_map: btrfs_get_extent extent_map is used in both read and write cases, and it is useful for tracking how btrfs specific IO is running. 3) writepage: __extent_writepage btrfs_writepage_end_io_hook Pages are cirtical resourses and produce a lot of corner cases during writeback, so it is valuable to know how page is written to disk. 4) inode: btrfs_inode_new btrfs_inode_request btrfs_inode_evict These can show where and when a inode is created, when a inode is evicted. 5) sync: btrfs_sync_file btrfs_sync_fs These show sync arguments. 6) transaction: btrfs_transaction_commit In transaction based filesystem, it will be useful to know the generation and who does commit. 7) back reference and cow: btrfs_delayed_tree_ref btrfs_delayed_data_ref btrfs_delayed_ref_head btrfs_cow_block Btrfs natively supports back references, these tracepoints are helpful on understanding btrfs's COW mechanism. 8) chunk: btrfs_chunk_alloc btrfs_chunk_free Chunk is a link between physical offset and logical offset, and stands for space infomation in btrfs, and these are helpful on tracing space things. 9) reserved_extent: btrfs_reserved_extent_alloc btrfs_reserved_extent_free These can show how btrfs uses its space. Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 05:18:59 -06:00
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_remove_block_group(trans, chunk_offset, em);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
out:
/* once for us */
free_extent_map(em);
return ret;
}
static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
int ret;
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
/*
* Prevent races with automatic removal of unused block groups.
* After we relocate and before we remove the chunk with offset
* chunk_offset, automatic removal of the block group can kick in,
* resulting in a failure when calling btrfs_remove_chunk() below.
*
* Make sure to acquire this mutex before doing a tree search (dev
* or chunk trees) to find chunks. Otherwise the cleaner kthread might
* call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
ret = btrfs_relocate_block_group(fs_info, chunk_offset);
btrfs_scrub_continue(fs_info);
if (ret)
return ret;
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_handle_fs_error(root->fs_info, ret, NULL);
return ret;
}
/*
* step two, delete the device extents and the
* chunk tree entries
*/
ret = btrfs_remove_chunk(trans, chunk_offset);
btrfs_end_transaction(trans);
return ret;
}
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_chunk *chunk;
struct btrfs_key key;
struct btrfs_key found_key;
u64 chunk_type;
bool retried = false;
int failed = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
mutex_lock(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
}
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
if (ret)
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto error;
if (ret > 0)
break;
2008-09-26 08:09:34 -06:00
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2008-09-26 08:09:34 -06:00
chunk = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
if (ret == -ENOSPC)
failed++;
else
BUG_ON(ret);
}
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
}
ret = 0;
if (failed && !retried) {
failed = 0;
retried = true;
goto again;
} else if (WARN_ON(failed && retried)) {
ret = -ENOSPC;
}
error:
btrfs_free_path(path);
return ret;
}
/*
* return 1 : allocate a data chunk successfully,
* return <0: errors during allocating a data chunk,
* return 0 : no need to allocate a data chunk.
*/
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
u64 chunk_offset)
{
struct btrfs_block_group_cache *cache;
u64 bytes_used;
u64 chunk_type;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
ASSERT(cache);
chunk_type = cache->flags;
btrfs_put_block_group(cache);
if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
spin_lock(&fs_info->data_sinfo->lock);
bytes_used = fs_info->data_sinfo->bytes_used;
spin_unlock(&fs_info->data_sinfo->lock);
if (!bytes_used) {
struct btrfs_trans_handle *trans;
int ret;
trans = btrfs_join_transaction(fs_info->tree_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_force_chunk_alloc(trans,
BTRFS_BLOCK_GROUP_DATA);
btrfs_end_transaction(trans);
if (ret < 0)
return ret;
return 1;
}
}
return 0;
}
static int insert_balance_item(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
int ret, err;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_TEMPORARY_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*item));
if (ret)
goto out;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
btrfs_set_balance_data(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
btrfs_set_balance_meta(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
btrfs_set_balance_sys(leaf, item, &disk_bargs);
btrfs_set_balance_flags(leaf, item, bctl->flags);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
if (err && !ret)
ret = err;
return ret;
}
static int del_balance_item(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_key key;
int ret, err;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_TEMPORARY_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans);
if (err && !ret)
ret = err;
return ret;
}
/*
* This is a heuristic used to reduce the number of chunks balanced on
* resume after balance was interrupted.
*/
static void update_balance_args(struct btrfs_balance_control *bctl)
{
/*
* Turn on soft mode for chunk types that were being converted.
*/
if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
/*
* Turn on usage filter if is not already used. The idea is
* that chunks that we have already balanced should be
* reasonably full. Don't do it for chunks that are being
* converted - that will keep us from relocating unconverted
* (albeit full) chunks.
*/
if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->data.usage = 90;
}
if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->sys.usage = 90;
}
if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->meta.usage = 90;
}
}
/*
* Clear the balance status in fs_info and delete the balance item from disk.
*/
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
int ret;
BUG_ON(!fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = NULL;
spin_unlock(&fs_info->balance_lock);
kfree(bctl);
ret = del_balance_item(fs_info);
if (ret)
btrfs_handle_fs_error(fs_info, ret, NULL);
}
/*
* Balance filters. Return 1 if chunk should be filtered out
* (should not be balanced).
*/
static int chunk_profiles_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->profiles & chunk_type)
return 0;
return 1;
}
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
if (bargs->usage_min == 0)
user_thresh_min = 0;
else
user_thresh_min = div_factor_fine(cache->key.offset,
bargs->usage_min);
if (bargs->usage_max == 0)
user_thresh_max = 1;
else if (bargs->usage_max > 100)
user_thresh_max = cache->key.offset;
else
user_thresh_max = div_factor_fine(cache->key.offset,
bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
ret = 0;
btrfs_put_block_group(cache);
return ret;
}
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
u64 chunk_offset, struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
if (bargs->usage_min == 0)
user_thresh = 1;
else if (bargs->usage > 100)
user_thresh = cache->key.offset;
else
user_thresh = div_factor_fine(cache->key.offset,
bargs->usage);
if (chunk_used < user_thresh)
ret = 0;
btrfs_put_block_group(cache);
return ret;
}
static int chunk_devid_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
int i;
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
return 0;
}
return 1;
}
static u64 calc_data_stripes(u64 type, int num_stripes)
{
const int index = btrfs_bg_flags_to_raid_index(type);
const int ncopies = btrfs_raid_array[index].ncopies;
const int nparity = btrfs_raid_array[index].nparity;
if (nparity)
return num_stripes - nparity;
else
return num_stripes / ncopies;
}
/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
u64 stripe_offset;
u64 stripe_length;
u64 type;
int factor;
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
return 0;
type = btrfs_chunk_type(leaf, chunk);
factor = calc_data_stripes(type, num_stripes);
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
continue;
stripe_offset = btrfs_stripe_offset(leaf, stripe);
stripe_length = btrfs_chunk_length(leaf, chunk);
stripe_length = div_u64(stripe_length, factor);
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
return 0;
}
return 1;
}
/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
if (chunk_offset < bargs->vend &&
chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
/* at least part of the chunk is inside this vrange */
return 0;
return 1;
}
static int chunk_stripes_range_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_balance_args *bargs)
{
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
if (bargs->stripes_min <= num_stripes
&& num_stripes <= bargs->stripes_max)
return 0;
return 1;
}
static int chunk_soft_convert_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return 0;
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->target == chunk_type)
return 1;
return 0;
}
static int should_balance_chunk(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_balance_args *bargs = NULL;
u64 chunk_type = btrfs_chunk_type(leaf, chunk);
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
return 0;
}
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
bargs = &bctl->data;
else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
bargs = &bctl->sys;
else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
bargs = &bctl->meta;
/* profiles filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
chunk_profiles_filter(chunk_type, bargs)) {
return 0;
}
/* usage filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(fs_info, chunk_offset, bargs)) {
return 0;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
return 0;
}
/* devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
chunk_devid_filter(leaf, chunk, bargs)) {
return 0;
}
/* drange filter, makes sense only with devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
chunk_drange_filter(leaf, chunk, bargs)) {
return 0;
}
/* vrange filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
return 0;
}
/* stripes filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
chunk_stripes_range_filter(leaf, chunk, bargs)) {
return 0;
}
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
return 0;
}
/*
* limited by count, must be the last filter
*/
if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
if (bargs->limit == 0)
return 0;
else
bargs->limit--;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
/*
* Same logic as the 'limit' filter; the minimum cannot be
* determined here because we do not have the global information
* about the count of all chunks that satisfy the filters.
*/
if (bargs->limit_max == 0)
return 0;
else
bargs->limit_max--;
}
return 1;
}
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_root *chunk_root = fs_info->chunk_root;
u64 chunk_type;
struct btrfs_chunk *chunk;
struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf;
int slot;
int ret;
int enospc_errors = 0;
bool counting = true;
/* The single value limit and min/max limits use the same bytes in the */
u64 limit_data = bctl->data.limit;
u64 limit_meta = bctl->meta.limit;
u64 limit_sys = bctl->sys.limit;
u32 count_data = 0;
u32 count_meta = 0;
u32 count_sys = 0;
btrfs: Fix lost-data-profile caused by balance bg Reproduce: (In integration-4.3 branch) TEST_DEV=(/dev/vdg /dev/vdh) TEST_DIR=/mnt/tmp umount "$TEST_DEV" >/dev/null mkfs.btrfs -f -d raid1 "${TEST_DEV[@]}" mount -o nospace_cache "$TEST_DEV" "$TEST_DIR" btrfs balance start -dusage=0 $TEST_DIR btrfs filesystem usage $TEST_DIR dd if=/dev/zero of="$TEST_DIR"/file count=100 btrfs filesystem usage $TEST_DIR Result: We can see "no data chunk" in first "btrfs filesystem usage": # btrfs filesystem usage $TEST_DIR Overall: ... Metadata,single: Size:8.00MiB, Used:0.00B /dev/vdg 8.00MiB Metadata,RAID1: Size:122.88MiB, Used:112.00KiB /dev/vdg 122.88MiB /dev/vdh 122.88MiB System,single: Size:4.00MiB, Used:0.00B /dev/vdg 4.00MiB System,RAID1: Size:8.00MiB, Used:16.00KiB /dev/vdg 8.00MiB /dev/vdh 8.00MiB Unallocated: /dev/vdg 1.06GiB /dev/vdh 1.07GiB And "data chunks changed from raid1 to single" in second "btrfs filesystem usage": # btrfs filesystem usage $TEST_DIR Overall: ... Data,single: Size:256.00MiB, Used:0.00B /dev/vdh 256.00MiB Metadata,single: Size:8.00MiB, Used:0.00B /dev/vdg 8.00MiB Metadata,RAID1: Size:122.88MiB, Used:112.00KiB /dev/vdg 122.88MiB /dev/vdh 122.88MiB System,single: Size:4.00MiB, Used:0.00B /dev/vdg 4.00MiB System,RAID1: Size:8.00MiB, Used:16.00KiB /dev/vdg 8.00MiB /dev/vdh 8.00MiB Unallocated: /dev/vdg 1.06GiB /dev/vdh 841.92MiB Reason: btrfs balance delete last data chunk in case of no data in the filesystem, then we can see "no data chunk" by "fi usage" command. And when we do write operation to fs, the only available data profile is 0x0, result is all new chunks are allocated single type. Fix: Allocate a data chunk explicitly to ensure we don't lose the raid profile for data. Test: Test by above script, and confirmed the logic by debug output. Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-11-08 20:51:32 -07:00
int chunk_reserved = 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto error;
}
/* zero out stat counters */
spin_lock(&fs_info->balance_lock);
memset(&bctl->stat, 0, sizeof(bctl->stat));
spin_unlock(&fs_info->balance_lock);
again:
if (!counting) {
/*
* The single value limit and min/max limits use the same bytes
* in the
*/
bctl->data.limit = limit_data;
bctl->meta.limit = limit_meta;
bctl->sys.limit = limit_sys;
}
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
atomic_read(&fs_info->balance_cancel_req)) {
ret = -ECANCELED;
goto error;
}
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
mutex_lock(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
}
/*
* this shouldn't happen, it means the last relocate
* failed
*/
if (ret == 0)
BUG(); /* FIXME break ? */
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ret = 0;
break;
}
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
if (found_key.objectid != key.objectid) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
break;
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
}
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
if (!counting) {
spin_lock(&fs_info->balance_lock);
bctl->stat.considered++;
spin_unlock(&fs_info->balance_lock);
}
ret = should_balance_chunk(leaf, chunk, found_key.offset);
btrfs: Fix lost-data-profile caused by balance bg Reproduce: (In integration-4.3 branch) TEST_DEV=(/dev/vdg /dev/vdh) TEST_DIR=/mnt/tmp umount "$TEST_DEV" >/dev/null mkfs.btrfs -f -d raid1 "${TEST_DEV[@]}" mount -o nospace_cache "$TEST_DEV" "$TEST_DIR" btrfs balance start -dusage=0 $TEST_DIR btrfs filesystem usage $TEST_DIR dd if=/dev/zero of="$TEST_DIR"/file count=100 btrfs filesystem usage $TEST_DIR Result: We can see "no data chunk" in first "btrfs filesystem usage": # btrfs filesystem usage $TEST_DIR Overall: ... Metadata,single: Size:8.00MiB, Used:0.00B /dev/vdg 8.00MiB Metadata,RAID1: Size:122.88MiB, Used:112.00KiB /dev/vdg 122.88MiB /dev/vdh 122.88MiB System,single: Size:4.00MiB, Used:0.00B /dev/vdg 4.00MiB System,RAID1: Size:8.00MiB, Used:16.00KiB /dev/vdg 8.00MiB /dev/vdh 8.00MiB Unallocated: /dev/vdg 1.06GiB /dev/vdh 1.07GiB And "data chunks changed from raid1 to single" in second "btrfs filesystem usage": # btrfs filesystem usage $TEST_DIR Overall: ... Data,single: Size:256.00MiB, Used:0.00B /dev/vdh 256.00MiB Metadata,single: Size:8.00MiB, Used:0.00B /dev/vdg 8.00MiB Metadata,RAID1: Size:122.88MiB, Used:112.00KiB /dev/vdg 122.88MiB /dev/vdh 122.88MiB System,single: Size:4.00MiB, Used:0.00B /dev/vdg 4.00MiB System,RAID1: Size:8.00MiB, Used:16.00KiB /dev/vdg 8.00MiB /dev/vdh 8.00MiB Unallocated: /dev/vdg 1.06GiB /dev/vdh 841.92MiB Reason: btrfs balance delete last data chunk in case of no data in the filesystem, then we can see "no data chunk" by "fi usage" command. And when we do write operation to fs, the only available data profile is 0x0, result is all new chunks are allocated single type. Fix: Allocate a data chunk explicitly to ensure we don't lose the raid profile for data. Test: Test by above script, and confirmed the logic by debug output. Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-11-08 20:51:32 -07:00
btrfs_release_path(path);
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
if (!ret) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto loop;
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
}
if (counting) {
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
count_data++;
else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
count_sys++;
else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
count_meta++;
goto loop;
}
/*
* Apply limit_min filter, no need to check if the LIMITS
* filter is used, limit_min is 0 by default
*/
if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
count_data < bctl->data.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
count_meta < bctl->meta.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
count_sys < bctl->sys.limit_min)) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto loop;
}
if (!chunk_reserved) {
/*
* We may be relocating the only data chunk we have,
* which could potentially end up with losing data's
* raid profile, so lets allocate an empty one in
* advance.
*/
ret = btrfs_may_alloc_data_chunk(fs_info,
found_key.offset);
btrfs: Fix lost-data-profile caused by balance bg Reproduce: (In integration-4.3 branch) TEST_DEV=(/dev/vdg /dev/vdh) TEST_DIR=/mnt/tmp umount "$TEST_DEV" >/dev/null mkfs.btrfs -f -d raid1 "${TEST_DEV[@]}" mount -o nospace_cache "$TEST_DEV" "$TEST_DIR" btrfs balance start -dusage=0 $TEST_DIR btrfs filesystem usage $TEST_DIR dd if=/dev/zero of="$TEST_DIR"/file count=100 btrfs filesystem usage $TEST_DIR Result: We can see "no data chunk" in first "btrfs filesystem usage": # btrfs filesystem usage $TEST_DIR Overall: ... Metadata,single: Size:8.00MiB, Used:0.00B /dev/vdg 8.00MiB Metadata,RAID1: Size:122.88MiB, Used:112.00KiB /dev/vdg 122.88MiB /dev/vdh 122.88MiB System,single: Size:4.00MiB, Used:0.00B /dev/vdg 4.00MiB System,RAID1: Size:8.00MiB, Used:16.00KiB /dev/vdg 8.00MiB /dev/vdh 8.00MiB Unallocated: /dev/vdg 1.06GiB /dev/vdh 1.07GiB And "data chunks changed from raid1 to single" in second "btrfs filesystem usage": # btrfs filesystem usage $TEST_DIR Overall: ... Data,single: Size:256.00MiB, Used:0.00B /dev/vdh 256.00MiB Metadata,single: Size:8.00MiB, Used:0.00B /dev/vdg 8.00MiB Metadata,RAID1: Size:122.88MiB, Used:112.00KiB /dev/vdg 122.88MiB /dev/vdh 122.88MiB System,single: Size:4.00MiB, Used:0.00B /dev/vdg 4.00MiB System,RAID1: Size:8.00MiB, Used:16.00KiB /dev/vdg 8.00MiB /dev/vdh 8.00MiB Unallocated: /dev/vdg 1.06GiB /dev/vdh 841.92MiB Reason: btrfs balance delete last data chunk in case of no data in the filesystem, then we can see "no data chunk" by "fi usage" command. And when we do write operation to fs, the only available data profile is 0x0, result is all new chunks are allocated single type. Fix: Allocate a data chunk explicitly to ensure we don't lose the raid profile for data. Test: Test by above script, and confirmed the logic by debug output. Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-11-08 20:51:32 -07:00
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
} else if (ret == 1) {
chunk_reserved = 1;
btrfs: Fix lost-data-profile caused by balance bg Reproduce: (In integration-4.3 branch) TEST_DEV=(/dev/vdg /dev/vdh) TEST_DIR=/mnt/tmp umount "$TEST_DEV" >/dev/null mkfs.btrfs -f -d raid1 "${TEST_DEV[@]}" mount -o nospace_cache "$TEST_DEV" "$TEST_DIR" btrfs balance start -dusage=0 $TEST_DIR btrfs filesystem usage $TEST_DIR dd if=/dev/zero of="$TEST_DIR"/file count=100 btrfs filesystem usage $TEST_DIR Result: We can see "no data chunk" in first "btrfs filesystem usage": # btrfs filesystem usage $TEST_DIR Overall: ... Metadata,single: Size:8.00MiB, Used:0.00B /dev/vdg 8.00MiB Metadata,RAID1: Size:122.88MiB, Used:112.00KiB /dev/vdg 122.88MiB /dev/vdh 122.88MiB System,single: Size:4.00MiB, Used:0.00B /dev/vdg 4.00MiB System,RAID1: Size:8.00MiB, Used:16.00KiB /dev/vdg 8.00MiB /dev/vdh 8.00MiB Unallocated: /dev/vdg 1.06GiB /dev/vdh 1.07GiB And "data chunks changed from raid1 to single" in second "btrfs filesystem usage": # btrfs filesystem usage $TEST_DIR Overall: ... Data,single: Size:256.00MiB, Used:0.00B /dev/vdh 256.00MiB Metadata,single: Size:8.00MiB, Used:0.00B /dev/vdg 8.00MiB Metadata,RAID1: Size:122.88MiB, Used:112.00KiB /dev/vdg 122.88MiB /dev/vdh 122.88MiB System,single: Size:4.00MiB, Used:0.00B /dev/vdg 4.00MiB System,RAID1: Size:8.00MiB, Used:16.00KiB /dev/vdg 8.00MiB /dev/vdh 8.00MiB Unallocated: /dev/vdg 1.06GiB /dev/vdh 841.92MiB Reason: btrfs balance delete last data chunk in case of no data in the filesystem, then we can see "no data chunk" by "fi usage" command. And when we do write operation to fs, the only available data profile is 0x0, result is all new chunks are allocated single type. Fix: Allocate a data chunk explicitly to ensure we don't lose the raid profile for data. Test: Test by above script, and confirmed the logic by debug output. Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-11-08 20:51:32 -07:00
}
}
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret == -ENOSPC) {
enospc_errors++;
} else if (ret == -ETXTBSY) {
btrfs_info(fs_info,
"skipping relocation of block group %llu due to active swapfile",
found_key.offset);
ret = 0;
} else if (ret) {
goto error;
} else {
spin_lock(&fs_info->balance_lock);
bctl->stat.completed++;
spin_unlock(&fs_info->balance_lock);
}
loop:
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
}
if (counting) {
btrfs_release_path(path);
counting = false;
goto again;
}
error:
btrfs_free_path(path);
if (enospc_errors) {
btrfs_info(fs_info, "%d enospc errors during balance",
enospc_errors);
if (!ret)
ret = -ENOSPC;
}
return ret;
}
/**
* alloc_profile_is_valid - see if a given profile is valid and reduced
* @flags: profile to validate
* @extended: if true @flags is treated as an extended profile
*/
static int alloc_profile_is_valid(u64 flags, int extended)
{
u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
BTRFS_BLOCK_GROUP_PROFILE_MASK);
flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
/* 1) check that all other bits are zeroed */
if (flags & ~mask)
return 0;
/* 2) see if profile is reduced */
if (flags == 0)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
/*
* Don't use is_power_of_2(unsigned long) because it won't work
* for the single profile (1ULL << 48) on 32-bit CPUs.
*/
return flags != 0 && (flags & (flags - 1)) == 0;
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
/* cancel requested || normal exit path */
return atomic_read(&fs_info->balance_cancel_req) ||
(atomic_read(&fs_info->balance_pause_req) == 0 &&
atomic_read(&fs_info->balance_cancel_req) == 0);
}
/* Non-zero return value signifies invalidity */
static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
u64 allowed)
{
return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl_arg->target, 1) ||
(bctl_arg->target & ~allowed)));
}
/*
* Fill @buf with textual description of balance filter flags @bargs, up to
* @size_buf including the terminating null. The output may be trimmed if it
* does not fit into the provided buffer.
*/
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
u32 size_buf)
{
int ret;
u32 size_bp = size_buf;
char *bp = buf;
u64 flags = bargs->flags;
char tmp_buf[128] = {'\0'};
if (!flags)
return;
#define CHECK_APPEND_NOARG(a) \
do { \
ret = snprintf(bp, size_bp, (a)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
} while (0)
#define CHECK_APPEND_1ARG(a, v1) \
do { \
ret = snprintf(bp, size_bp, (a), (v1)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
} while (0)
#define CHECK_APPEND_2ARG(a, v1, v2) \
do { \
ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
} while (0)
if (flags & BTRFS_BALANCE_ARGS_CONVERT)
CHECK_APPEND_1ARG("convert=%s,",
btrfs_bg_type_to_raid_name(bargs->target));
if (flags & BTRFS_BALANCE_ARGS_SOFT)
CHECK_APPEND_NOARG("soft,");
if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
btrfs_describe_block_groups(bargs->profiles, tmp_buf,
sizeof(tmp_buf));
CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
}
if (flags & BTRFS_BALANCE_ARGS_USAGE)
CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
CHECK_APPEND_2ARG("usage=%u..%u,",
bargs->usage_min, bargs->usage_max);
if (flags & BTRFS_BALANCE_ARGS_DEVID)
CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
if (flags & BTRFS_BALANCE_ARGS_DRANGE)
CHECK_APPEND_2ARG("drange=%llu..%llu,",
bargs->pstart, bargs->pend);
if (flags & BTRFS_BALANCE_ARGS_VRANGE)
CHECK_APPEND_2ARG("vrange=%llu..%llu,",
bargs->vstart, bargs->vend);
if (flags & BTRFS_BALANCE_ARGS_LIMIT)
CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
CHECK_APPEND_2ARG("limit=%u..%u,",
bargs->limit_min, bargs->limit_max);
if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
CHECK_APPEND_2ARG("stripes=%u..%u,",
bargs->stripes_min, bargs->stripes_max);
#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG
out_overflow:
if (size_bp < size_buf)
buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
else
buf[0] = '\0';
}
static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
u32 size_buf = 1024;
char tmp_buf[192] = {'\0'};
char *buf;
char *bp;
u32 size_bp = size_buf;
int ret;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
buf = kzalloc(size_buf, GFP_KERNEL);
if (!buf)
return;
bp = buf;
#define CHECK_APPEND_1ARG(a, v1) \
do { \
ret = snprintf(bp, size_bp, (a), (v1)); \
if (ret < 0 || ret >= size_bp) \
goto out_overflow; \
size_bp -= ret; \
bp += ret; \
} while (0)
if (bctl->flags & BTRFS_BALANCE_FORCE)
CHECK_APPEND_1ARG("%s", "-f ");
if (bctl->flags & BTRFS_BALANCE_DATA) {
describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
CHECK_APPEND_1ARG("-d%s ", tmp_buf);
}
if (bctl->flags & BTRFS_BALANCE_METADATA) {
describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
CHECK_APPEND_1ARG("-m%s ", tmp_buf);
}
if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
CHECK_APPEND_1ARG("-s%s ", tmp_buf);
}
#undef CHECK_APPEND_1ARG
out_overflow:
if (size_bp < size_buf)
buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
btrfs_info(fs_info, "balance: %s %s",
(bctl->flags & BTRFS_BALANCE_RESUME) ?
"resume" : "start", buf);
kfree(buf);
}
/*
btrfs: kill btrfs_fs_info::volume_mutex Mutual exclusion of device add/rm and balance was done by the volume mutex up to version 3.7. The commit 5ac00addc7ac091109 ("Btrfs: disallow mutually exclusive admin operations from user mode") added a bit that essentially tracked the same information. The status bit has an advantage over a mutex that it can be set without restrictions of function context, so it started to be used in the mount-time resuming of balance or device replace. But we don't really need to track the same information in two ways. 1) After the previous cleanups, the main ioctl handlers for add/del/resize copy the EXCL_OP bit next to the volume mutex, here it's clearly safe. 2) Resuming balance during mount or after rw remount will set only the EXCL_OP bit and the volume_mutex is held in the kernel thread that calls btrfs_balance. 3) Resuming device replace during mount or after rw remount is done after balance and is excluded by the EXCL_OP bit. It does not take the volume_mutex at all and completely relies on the EXCL_OP bit. 4) The resuming of balance and dev-replace cannot hapen at the same time as the ioctls cannot be started in parallel. Nevertheless, a crafted image could trigger that and a warning is printed. 5) Balance is normally excluded by EXCL_OP and also uses own mutex to protect against concurrent access to its status data. There's some trickery to maintain the right lock nesting in case we need to reexamine the status in btrfs_ioctl_balance. The volume_mutex is removed and the unlock/lock sequence is left in place as we might expect other waiters to proceed. 6) Similar to 5, the unlock/lock sequence is kept in btrfs_cancel_balance to allow waiters to continue. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-03-20 17:20:05 -06:00
* Should be called with balance mutexe held
*/
int btrfs_balance(struct btrfs_fs_info *fs_info,
struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
{
u64 meta_target, data_target;
u64 allowed;
int mixed = 0;
int ret;
u64 num_devices;
unsigned seq;
Btrfs: fix access to available allocation bits when starting balance The available allocation bits members from struct btrfs_fs_info are protected by a sequence lock, and when starting balance we access them incorrectly in two different ways: 1) In the read sequence lock loop at btrfs_balance() we use the values we read from fs_info->avail_*_alloc_bits and we can immediately do actions that have side effects and can not be undone (printing a message and jumping to a label). This is wrong because a retry might be needed, so our actions must not have side effects and must be repeatable as long as read_seqretry() returns a non-zero value. In other words, we were essentially ignoring the sequence lock; 2) Right below the read sequence lock loop, we were reading the values from avail_metadata_alloc_bits and avail_data_alloc_bits without any protection from concurrent writers, that is, reading them outside of the read sequence lock critical section. So fix this by making sure we only read the available allocation bits while in a read sequence lock critical section and that what we do in the critical section is repeatable (has nothing that can not be undone) so that any eventual retry that is needed is handled properly. Fixes: de98ced9e743 ("Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits") Fixes: 14506127979a ("btrfs: fix a bogus warning when converting only data or metadata") Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-19 02:48:12 -07:00
bool reducing_integrity;
int i;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
atomic_read(&fs_info->balance_cancel_req)) {
ret = -EINVAL;
goto out;
}
allowed = btrfs_super_incompat_flags(fs_info->super_copy);
if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
mixed = 1;
/*
* In case of mixed groups both data and meta should be picked,
* and identical options should be given for both of them.
*/
allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
if (mixed && (bctl->flags & allowed)) {
if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
!(bctl->flags & BTRFS_BALANCE_METADATA) ||
memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
btrfs_err(fs_info,
"balance: mixed groups data and metadata options must be the same");
ret = -EINVAL;
goto out;
}
}
btrfs: check rw_devices, not num_devices for balance commit b35cf1f0bf1f2b0b193093338414b9bd63b29015 upstream. The fstest btrfs/154 reports [ 8675.381709] BTRFS: Transaction aborted (error -28) [ 8675.383302] WARNING: CPU: 1 PID: 31900 at fs/btrfs/block-group.c:2038 btrfs_create_pending_block_groups+0x1e0/0x1f0 [btrfs] [ 8675.390925] CPU: 1 PID: 31900 Comm: btrfs Not tainted 5.5.0-rc6-default+ #935 [ 8675.392780] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 [ 8675.395452] RIP: 0010:btrfs_create_pending_block_groups+0x1e0/0x1f0 [btrfs] [ 8675.402672] RSP: 0018:ffffb2090888fb00 EFLAGS: 00010286 [ 8675.404413] RAX: 0000000000000000 RBX: ffff92026dfa91c8 RCX: 0000000000000001 [ 8675.406609] RDX: 0000000000000000 RSI: ffffffff8e100899 RDI: ffffffff8e100971 [ 8675.408775] RBP: ffff920247c61660 R08: 0000000000000000 R09: 0000000000000000 [ 8675.410978] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000ffffffe4 [ 8675.412647] R13: ffff92026db74000 R14: ffff920247c616b8 R15: ffff92026dfbc000 [ 8675.413994] FS: 00007fd5e57248c0(0000) GS:ffff92027d800000(0000) knlGS:0000000000000000 [ 8675.416146] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8675.417833] CR2: 0000564aa51682d8 CR3: 000000006dcbc004 CR4: 0000000000160ee0 [ 8675.419801] Call Trace: [ 8675.420742] btrfs_start_dirty_block_groups+0x355/0x480 [btrfs] [ 8675.422600] btrfs_commit_transaction+0xc8/0xaf0 [btrfs] [ 8675.424335] reset_balance_state+0x14a/0x190 [btrfs] [ 8675.425824] btrfs_balance.cold+0xe7/0x154 [btrfs] [ 8675.427313] ? kmem_cache_alloc_trace+0x235/0x2c0 [ 8675.428663] btrfs_ioctl_balance+0x298/0x350 [btrfs] [ 8675.430285] btrfs_ioctl+0x466/0x2550 [btrfs] [ 8675.431788] ? mem_cgroup_charge_statistics+0x51/0xf0 [ 8675.433487] ? mem_cgroup_commit_charge+0x56/0x400 [ 8675.435122] ? do_raw_spin_unlock+0x4b/0xc0 [ 8675.436618] ? _raw_spin_unlock+0x1f/0x30 [ 8675.438093] ? __handle_mm_fault+0x499/0x740 [ 8675.439619] ? do_vfs_ioctl+0x56e/0x770 [ 8675.441034] do_vfs_ioctl+0x56e/0x770 [ 8675.442411] ksys_ioctl+0x3a/0x70 [ 8675.443718] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 8675.445333] __x64_sys_ioctl+0x16/0x20 [ 8675.446705] do_syscall_64+0x50/0x210 [ 8675.448059] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 8675.479187] BTRFS: error (device vdb) in btrfs_create_pending_block_groups:2038: errno=-28 No space left We now use btrfs_can_overcommit() to see if we can flip a block group read only. Before this would fail because we weren't taking into account the usable un-allocated space for allocating chunks. With my patches we were allowed to do the balance, which is technically correct. The test is trying to start balance on degraded mount. So now we're trying to allocate a chunk and cannot because we want to allocate a RAID1 chunk, but there's only 1 device that's available for usage. This results in an ENOSPC. But we shouldn't even be making it this far, we don't have enough devices to restripe. The problem is we're using btrfs_num_devices(), that also includes missing devices. That's not actually what we want, we need to use rw_devices. The chunk_mutex is not needed here, rw_devices changes only in device add, remove or replace, all are excluded by EXCL_OP mechanism. Fixes: e4d8ec0f65b9 ("Btrfs: implement online profile changing") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> [ add stacktrace, update changelog, drop chunk_mutex ] Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-01-10 09:11:24 -07:00
/*
* rw_devices will not change at the moment, device add/delete/replace
* are excluded by EXCL_OP
*/
num_devices = fs_info->fs_devices->rw_devices;
/*
* SINGLE profile on-disk has no profile bit, but in-memory we have a
* special bit for it, to make it easier to distinguish. Thus we need
* to set it manually, or balance would refuse the profile.
*/
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
if (num_devices >= btrfs_raid_array[i].devs_min)
allowed |= btrfs_raid_array[i].bg_flag;
if (validate_convert_profile(&bctl->data, allowed)) {
btrfs_err(fs_info,
"balance: invalid convert data profile %s",
btrfs_bg_type_to_raid_name(bctl->data.target));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->meta, allowed)) {
btrfs_err(fs_info,
"balance: invalid convert metadata profile %s",
btrfs_bg_type_to_raid_name(bctl->meta.target));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->sys, allowed)) {
btrfs_err(fs_info,
"balance: invalid convert system profile %s",
btrfs_bg_type_to_raid_name(bctl->sys.target));
ret = -EINVAL;
goto out;
}
/*
* Allow to reduce metadata or system integrity only if force set for
* profiles with redundancy (copies, parity)
*/
allowed = 0;
for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
if (btrfs_raid_array[i].ncopies >= 2 ||
btrfs_raid_array[i].tolerated_failures >= 1)
allowed |= btrfs_raid_array[i].bg_flag;
}
do {
seq = read_seqbegin(&fs_info->profiles_lock);
if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_system_alloc_bits & allowed) &&
!(bctl->sys.target & allowed)) ||
((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_metadata_alloc_bits & allowed) &&
Btrfs: fix access to available allocation bits when starting balance The available allocation bits members from struct btrfs_fs_info are protected by a sequence lock, and when starting balance we access them incorrectly in two different ways: 1) In the read sequence lock loop at btrfs_balance() we use the values we read from fs_info->avail_*_alloc_bits and we can immediately do actions that have side effects and can not be undone (printing a message and jumping to a label). This is wrong because a retry might be needed, so our actions must not have side effects and must be repeatable as long as read_seqretry() returns a non-zero value. In other words, we were essentially ignoring the sequence lock; 2) Right below the read sequence lock loop, we were reading the values from avail_metadata_alloc_bits and avail_data_alloc_bits without any protection from concurrent writers, that is, reading them outside of the read sequence lock critical section. So fix this by making sure we only read the available allocation bits while in a read sequence lock critical section and that what we do in the critical section is repeatable (has nothing that can not be undone) so that any eventual retry that is needed is handled properly. Fixes: de98ced9e743 ("Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits") Fixes: 14506127979a ("btrfs: fix a bogus warning when converting only data or metadata") Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-19 02:48:12 -07:00
!(bctl->meta.target & allowed)))
reducing_integrity = true;
else
reducing_integrity = false;
/* if we're not converting, the target field is uninitialized */
meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
bctl->meta.target : fs_info->avail_metadata_alloc_bits;
data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
bctl->data.target : fs_info->avail_data_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
Btrfs: fix access to available allocation bits when starting balance The available allocation bits members from struct btrfs_fs_info are protected by a sequence lock, and when starting balance we access them incorrectly in two different ways: 1) In the read sequence lock loop at btrfs_balance() we use the values we read from fs_info->avail_*_alloc_bits and we can immediately do actions that have side effects and can not be undone (printing a message and jumping to a label). This is wrong because a retry might be needed, so our actions must not have side effects and must be repeatable as long as read_seqretry() returns a non-zero value. In other words, we were essentially ignoring the sequence lock; 2) Right below the read sequence lock loop, we were reading the values from avail_metadata_alloc_bits and avail_data_alloc_bits without any protection from concurrent writers, that is, reading them outside of the read sequence lock critical section. So fix this by making sure we only read the available allocation bits while in a read sequence lock critical section and that what we do in the critical section is repeatable (has nothing that can not be undone) so that any eventual retry that is needed is handled properly. Fixes: de98ced9e743 ("Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits") Fixes: 14506127979a ("btrfs: fix a bogus warning when converting only data or metadata") Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-19 02:48:12 -07:00
if (reducing_integrity) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
btrfs_info(fs_info,
"balance: force reducing metadata integrity");
} else {
btrfs_err(fs_info,
"balance: reduces metadata integrity, use --force if you want this");
ret = -EINVAL;
goto out;
}
}
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
btrfs_warn(fs_info,
"balance: metadata profile %s has lower redundancy than data profile %s",
btrfs_bg_type_to_raid_name(meta_target),
btrfs_bg_type_to_raid_name(data_target));
}
Btrfs: prevent send failures and crashes due to concurrent relocation Send always operates on read-only trees and always expected that while it is in progress, nothing changes in those trees. Due to that expectation and the fact that send is a read-only operation, it operates on commit roots and does not hold transaction handles. However relocation can COW nodes and leafs from read-only trees, which can cause unexpected failures and crashes (hitting BUG_ONs). while send using a node/leaf, it gets COWed, the transaction used to COW it is committed, a new transaction starts, the extent previously used for that node/leaf gets allocated, possibly for another tree, and the respective extent buffer' content changes while send is still using it. When this happens send normally fails with EIO being returned to user space and messages like the following are found in dmesg/syslog: [ 3408.699121] BTRFS error (device sdc): parent transid verify failed on 58703872 wanted 250 found 253 [ 3441.523123] BTRFS error (device sdc): did not find backref in send_root. inode=63211, offset=0, disk_byte=5222825984 found extent=5222825984 Other times, less often, we hit a BUG_ON() because an extent buffer that send is using used to be a node, and while send is still using it, it got COWed and got reused as a leaf while send is still using, producing the following trace: [ 3478.466280] ------------[ cut here ]------------ [ 3478.466282] kernel BUG at fs/btrfs/ctree.c:1806! [ 3478.466965] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI [ 3478.467635] CPU: 0 PID: 2165 Comm: btrfs Not tainted 5.0.0-btrfs-next-46 #1 [ 3478.468311] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 3478.469681] RIP: 0010:read_node_slot+0x122/0x130 [btrfs] (...) [ 3478.471758] RSP: 0018:ffffa437826bfaa0 EFLAGS: 00010246 [ 3478.472457] RAX: ffff961416ed7000 RBX: 000000000000003d RCX: 0000000000000002 [ 3478.473151] RDX: 000000000000003d RSI: ffff96141e387408 RDI: ffff961599b30000 [ 3478.473837] RBP: ffffa437826bfb8e R08: 0000000000000001 R09: ffffa437826bfb8e [ 3478.474515] R10: ffffa437826bfa70 R11: 0000000000000000 R12: ffff9614385c8708 [ 3478.475186] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 3478.475840] FS: 00007f8e0e9cc8c0(0000) GS:ffff9615b6a00000(0000) knlGS:0000000000000000 [ 3478.476489] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3478.477127] CR2: 00007f98b67a056e CR3: 0000000005df6005 CR4: 00000000003606f0 [ 3478.477762] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3478.478385] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3478.479003] Call Trace: [ 3478.479600] ? do_raw_spin_unlock+0x49/0xc0 [ 3478.480202] tree_advance+0x173/0x1d0 [btrfs] [ 3478.480810] btrfs_compare_trees+0x30c/0x690 [btrfs] [ 3478.481388] ? process_extent+0x1280/0x1280 [btrfs] [ 3478.481954] btrfs_ioctl_send+0x1037/0x1270 [btrfs] [ 3478.482510] _btrfs_ioctl_send+0x80/0x110 [btrfs] [ 3478.483062] btrfs_ioctl+0x13fe/0x3120 [btrfs] [ 3478.483581] ? rq_clock_task+0x2e/0x60 [ 3478.484086] ? wake_up_new_task+0x1f3/0x370 [ 3478.484582] ? do_vfs_ioctl+0xa2/0x6f0 [ 3478.485075] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 3478.485552] do_vfs_ioctl+0xa2/0x6f0 [ 3478.486016] ? __fget+0x113/0x200 [ 3478.486467] ksys_ioctl+0x70/0x80 [ 3478.486911] __x64_sys_ioctl+0x16/0x20 [ 3478.487337] do_syscall_64+0x60/0x1b0 [ 3478.487751] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 3478.488159] RIP: 0033:0x7f8e0d7d4dd7 (...) [ 3478.489349] RSP: 002b:00007ffcf6fb4908 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 3478.489742] RAX: ffffffffffffffda RBX: 0000000000000105 RCX: 00007f8e0d7d4dd7 [ 3478.490142] RDX: 00007ffcf6fb4990 RSI: 0000000040489426 RDI: 0000000000000005 [ 3478.490548] RBP: 0000000000000005 R08: 00007f8e0d6f3700 R09: 00007f8e0d6f3700 [ 3478.490953] R10: 00007f8e0d6f39d0 R11: 0000000000000202 R12: 0000000000000005 [ 3478.491343] R13: 00005624e0780020 R14: 0000000000000000 R15: 0000000000000001 (...) [ 3478.493352] ---[ end trace d5f537302be4f8c8 ]--- Another possibility, much less likely to happen, is that send will not fail but the contents of the stream it produces may not be correct. To avoid this, do not allow send and relocation (balance) to run in parallel. In the long term the goal is to allow for both to be able to run concurrently without any problems, but that will take a significant effort in development and testing. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-22 09:44:09 -06:00
if (fs_info->send_in_progress) {
btrfs_warn_rl(fs_info,
"cannot run balance while send operations are in progress (%d in progress)",
fs_info->send_in_progress);
ret = -EAGAIN;
goto out;
}
ret = insert_balance_item(fs_info, bctl);
if (ret && ret != -EEXIST)
goto out;
if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
BUG_ON(ret == -EEXIST);
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = bctl;
spin_unlock(&fs_info->balance_lock);
} else {
BUG_ON(ret != -EEXIST);
spin_lock(&fs_info->balance_lock);
update_balance_args(bctl);
spin_unlock(&fs_info->balance_lock);
}
ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
describe_balance_start_or_resume(fs_info);
mutex_unlock(&fs_info->balance_mutex);
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
btrfs_info(fs_info, "balance: paused");
btrfs: relocation: review the call sites which can be interrupted by signal commit 44d354abf33e92a5e73b965c84caf5a5d5e58a0b upstream. Since most metadata reservation calls can return -EINTR when get interrupted by fatal signal, we need to review the all the metadata reservation call sites. In relocation code, the metadata reservation happens in the following sites: - btrfs_block_rsv_refill() in merge_reloc_root() merge_reloc_root() is a pretty critical section, we don't want to be interrupted by signal, so change the flush status to BTRFS_RESERVE_FLUSH_LIMIT, so it won't get interrupted by signal. Since such change can be ENPSPC-prone, also shrink the amount of metadata to reserve least amount avoid deadly ENOSPC there. - btrfs_block_rsv_refill() in reserve_metadata_space() It calls with BTRFS_RESERVE_FLUSH_LIMIT, which won't get interrupted by signal. - btrfs_block_rsv_refill() in prepare_to_relocate() - btrfs_block_rsv_add() in prepare_to_relocate() - btrfs_block_rsv_refill() in relocate_block_group() - btrfs_delalloc_reserve_metadata() in relocate_file_extent_cluster() - btrfs_start_transaction() in relocate_block_group() - btrfs_start_transaction() in create_reloc_inode() Can be interrupted by fatal signal and we can handle it easily. For these call sites, just catch the -EINTR value in btrfs_balance() and count them as canceled. CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Qu Wenruo <wqu@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-07-12 19:03:21 -06:00
/*
* Balance can be canceled by:
*
* - Regular cancel request
* Then ret == -ECANCELED and balance_cancel_req > 0
*
* - Fatal signal to "btrfs" process
* Either the signal caught by wait_reserve_ticket() and callers
* got -EINTR, or caught by btrfs_should_cancel_balance() and
* got -ECANCELED.
* Either way, in this case balance_cancel_req = 0, and
* ret == -EINTR or ret == -ECANCELED.
*
* So here we only check the return value to catch canceled balance.
*/
else if (ret == -ECANCELED || ret == -EINTR)
btrfs_info(fs_info, "balance: canceled");
else
btrfs_info(fs_info, "balance: ended with status: %d", ret);
clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
if (bargs) {
memset(bargs, 0, sizeof(*bargs));
btrfs_update_ioctl_balance_args(fs_info, bargs);
}
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
reset_balance_state(fs_info);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
wake_up(&fs_info->balance_wait_q);
return ret;
out:
if (bctl->flags & BTRFS_BALANCE_RESUME)
reset_balance_state(fs_info);
else
kfree(bctl);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return ret;
}
static int balance_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
int ret = 0;
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
return ret;
}
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
struct task_struct *tsk;
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
return 0;
}
mutex_unlock(&fs_info->balance_mutex);
if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
btrfs_info(fs_info, "balance: resume skipped");
return 0;
}
/*
* A ro->rw remount sequence should continue with the paused balance
* regardless of who pauses it, system or the user as of now, so set
* the resume flag.
*/
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
spin_unlock(&fs_info->balance_lock);
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
return PTR_ERR_OR_ZERO(tsk);
}
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_TEMPORARY_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) { /* ret = -ENOENT; */
ret = 0;
goto out;
}
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
if (!bctl) {
ret = -ENOMEM;
goto out;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
bctl->flags = btrfs_balance_flags(leaf, item);
bctl->flags |= BTRFS_BALANCE_RESUME;
btrfs_balance_data(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
btrfs_balance_meta(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
/*
* This should never happen, as the paused balance state is recovered
* during mount without any chance of other exclusive ops to collide.
*
* This gives the exclusive op status to balance and keeps in paused
* state until user intervention (cancel or umount). If the ownership
* cannot be assigned, show a message but do not fail. The balance
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
mutex_lock(&fs_info->balance_mutex);
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = bctl;
spin_unlock(&fs_info->balance_lock);
mutex_unlock(&fs_info->balance_mutex);
out:
btrfs_free_path(path);
return ret;
}
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
int ret = 0;
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
return -ENOTCONN;
}
if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
atomic_inc(&fs_info->balance_pause_req);
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
mutex_lock(&fs_info->balance_mutex);
/* we are good with balance_ctl ripped off from under us */
BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_pause_req);
} else {
ret = -ENOTCONN;
}
mutex_unlock(&fs_info->balance_mutex);
return ret;
}
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
return -ENOTCONN;
}
/*
* A paused balance with the item stored on disk can be resumed at
* mount time if the mount is read-write. Otherwise it's still paused
* and we must not allow cancelling as it deletes the item.
*/
if (sb_rdonly(fs_info->sb)) {
mutex_unlock(&fs_info->balance_mutex);
return -EROFS;
}
atomic_inc(&fs_info->balance_cancel_req);
/*
* if we are running just wait and return, balance item is
* deleted in btrfs_balance in this case
*/
if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
mutex_lock(&fs_info->balance_mutex);
} else {
mutex_unlock(&fs_info->balance_mutex);
btrfs: kill btrfs_fs_info::volume_mutex Mutual exclusion of device add/rm and balance was done by the volume mutex up to version 3.7. The commit 5ac00addc7ac091109 ("Btrfs: disallow mutually exclusive admin operations from user mode") added a bit that essentially tracked the same information. The status bit has an advantage over a mutex that it can be set without restrictions of function context, so it started to be used in the mount-time resuming of balance or device replace. But we don't really need to track the same information in two ways. 1) After the previous cleanups, the main ioctl handlers for add/del/resize copy the EXCL_OP bit next to the volume mutex, here it's clearly safe. 2) Resuming balance during mount or after rw remount will set only the EXCL_OP bit and the volume_mutex is held in the kernel thread that calls btrfs_balance. 3) Resuming device replace during mount or after rw remount is done after balance and is excluded by the EXCL_OP bit. It does not take the volume_mutex at all and completely relies on the EXCL_OP bit. 4) The resuming of balance and dev-replace cannot hapen at the same time as the ioctls cannot be started in parallel. Nevertheless, a crafted image could trigger that and a warning is printed. 5) Balance is normally excluded by EXCL_OP and also uses own mutex to protect against concurrent access to its status data. There's some trickery to maintain the right lock nesting in case we need to reexamine the status in btrfs_ioctl_balance. The volume_mutex is removed and the unlock/lock sequence is left in place as we might expect other waiters to proceed. 6) Similar to 5, the unlock/lock sequence is kept in btrfs_cancel_balance to allow waiters to continue. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-03-20 17:20:05 -06:00
/*
* Lock released to allow other waiters to continue, we'll
* reexamine the status again.
*/
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
btrfs_info(fs_info, "balance: canceled");
}
}
BUG_ON(fs_info->balance_ctl ||
test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_cancel_req);
mutex_unlock(&fs_info->balance_mutex);
return 0;
}
static int btrfs_uuid_scan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_key key;
struct btrfs_path *path = NULL;
int ret = 0;
struct extent_buffer *eb;
int slot;
struct btrfs_root_item root_item;
u32 item_size;
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 03:28:34 -06:00
struct btrfs_trans_handle *trans = NULL;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
key.objectid = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = 0;
while (1) {
ret = btrfs_search_forward(root, &key, path,
BTRFS_OLDEST_GENERATION);
if (ret) {
if (ret > 0)
ret = 0;
break;
}
if (key.type != BTRFS_ROOT_ITEM_KEY ||
(key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.objectid != BTRFS_FS_TREE_OBJECTID) ||
key.objectid > BTRFS_LAST_FREE_OBJECTID)
goto skip;
eb = path->nodes[0];
slot = path->slots[0];
item_size = btrfs_item_size_nr(eb, slot);
if (item_size < sizeof(root_item))
goto skip;
read_extent_buffer(eb, &root_item,
btrfs_item_ptr_offset(eb, slot),
(int)sizeof(root_item));
if (btrfs_root_refs(&root_item) == 0)
goto skip;
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 03:28:34 -06:00
if (!btrfs_is_empty_uuid(root_item.uuid) ||
!btrfs_is_empty_uuid(root_item.received_uuid)) {
if (trans)
goto update_tree;
btrfs_release_path(path);
/*
* 1 - subvol uuid item
* 1 - received_subvol uuid item
*/
trans = btrfs_start_transaction(fs_info->uuid_root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 03:28:34 -06:00
continue;
} else {
goto skip;
}
update_tree:
btrfs: drop path before adding new uuid tree entry commit 9771a5cf937129307d9f58922d60484d58ababe7 upstream. With the conversion of the tree locks to rwsem I got the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc7-00167-g0d7ba0c5b375-dirty #925 Not tainted ------------------------------------------------------ btrfs-uuid/7955 is trying to acquire lock: ffff88bfbafec0f8 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 but task is already holding lock: ffff88bfbafef2a8 (btrfs-uuid-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (btrfs-uuid-00){++++}-{3:3}: down_read_nested+0x3e/0x140 __btrfs_tree_read_lock+0x39/0x180 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x4bd/0x990 btrfs_uuid_tree_add+0x89/0x2d0 btrfs_uuid_scan_kthread+0x330/0x390 kthread+0x133/0x150 ret_from_fork+0x1f/0x30 -> #0 (btrfs-root-00){++++}-{3:3}: __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 down_read_nested+0x3e/0x140 __btrfs_tree_read_lock+0x39/0x180 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x4bd/0x990 btrfs_find_root+0x45/0x1b0 btrfs_read_tree_root+0x61/0x100 btrfs_get_root_ref.part.50+0x143/0x630 btrfs_uuid_tree_iterate+0x207/0x314 btrfs_uuid_rescan_kthread+0x12/0x50 kthread+0x133/0x150 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-uuid-00); lock(btrfs-root-00); lock(btrfs-uuid-00); lock(btrfs-root-00); *** DEADLOCK *** 1 lock held by btrfs-uuid/7955: #0: ffff88bfbafef2a8 (btrfs-uuid-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 stack backtrace: CPU: 73 PID: 7955 Comm: btrfs-uuid Kdump: loaded Not tainted 5.8.0-rc7-00167-g0d7ba0c5b375-dirty #925 Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018 Call Trace: dump_stack+0x78/0xa0 check_noncircular+0x165/0x180 __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 ? __btrfs_tree_read_lock+0x39/0x180 ? btrfs_root_node+0x1c/0x1d0 down_read_nested+0x3e/0x140 ? __btrfs_tree_read_lock+0x39/0x180 __btrfs_tree_read_lock+0x39/0x180 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x4bd/0x990 btrfs_find_root+0x45/0x1b0 btrfs_read_tree_root+0x61/0x100 btrfs_get_root_ref.part.50+0x143/0x630 btrfs_uuid_tree_iterate+0x207/0x314 ? btree_readpage+0x20/0x20 btrfs_uuid_rescan_kthread+0x12/0x50 kthread+0x133/0x150 ? kthread_create_on_node+0x60/0x60 ret_from_fork+0x1f/0x30 This problem exists because we have two different rescan threads, btrfs_uuid_scan_kthread which creates the uuid tree, and btrfs_uuid_tree_iterate that goes through and updates or deletes any out of date roots. The problem is they both do things in different order. btrfs_uuid_scan_kthread() reads the tree_root, and then inserts entries into the uuid_root. btrfs_uuid_tree_iterate() scans the uuid_root, but then does a btrfs_get_fs_root() which can read from the tree_root. It's actually easy enough to not be holding the path in btrfs_uuid_scan_kthread() when we add a uuid entry, as we already drop it further down and re-start the search when we loop. So simply move the path release before we add our entry to the uuid tree. This also fixes a problem where we're holding a path open after we do btrfs_end_transaction(), which has it's own problems. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-08-10 09:42:26 -06:00
btrfs_release_path(path);
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 03:28:34 -06:00
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
key.objectid);
if (ret < 0) {
btrfs_warn(fs_info, "uuid_tree_add failed %d",
ret);
break;
}
}
if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
ret = btrfs_uuid_tree_add(trans,
root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
key.objectid);
if (ret < 0) {
btrfs_warn(fs_info, "uuid_tree_add failed %d",
ret);
break;
}
}
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 03:28:34 -06:00
skip:
btrfs: drop path before adding new uuid tree entry commit 9771a5cf937129307d9f58922d60484d58ababe7 upstream. With the conversion of the tree locks to rwsem I got the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc7-00167-g0d7ba0c5b375-dirty #925 Not tainted ------------------------------------------------------ btrfs-uuid/7955 is trying to acquire lock: ffff88bfbafec0f8 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 but task is already holding lock: ffff88bfbafef2a8 (btrfs-uuid-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (btrfs-uuid-00){++++}-{3:3}: down_read_nested+0x3e/0x140 __btrfs_tree_read_lock+0x39/0x180 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x4bd/0x990 btrfs_uuid_tree_add+0x89/0x2d0 btrfs_uuid_scan_kthread+0x330/0x390 kthread+0x133/0x150 ret_from_fork+0x1f/0x30 -> #0 (btrfs-root-00){++++}-{3:3}: __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 down_read_nested+0x3e/0x140 __btrfs_tree_read_lock+0x39/0x180 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x4bd/0x990 btrfs_find_root+0x45/0x1b0 btrfs_read_tree_root+0x61/0x100 btrfs_get_root_ref.part.50+0x143/0x630 btrfs_uuid_tree_iterate+0x207/0x314 btrfs_uuid_rescan_kthread+0x12/0x50 kthread+0x133/0x150 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-uuid-00); lock(btrfs-root-00); lock(btrfs-uuid-00); lock(btrfs-root-00); *** DEADLOCK *** 1 lock held by btrfs-uuid/7955: #0: ffff88bfbafef2a8 (btrfs-uuid-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 stack backtrace: CPU: 73 PID: 7955 Comm: btrfs-uuid Kdump: loaded Not tainted 5.8.0-rc7-00167-g0d7ba0c5b375-dirty #925 Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018 Call Trace: dump_stack+0x78/0xa0 check_noncircular+0x165/0x180 __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 ? __btrfs_tree_read_lock+0x39/0x180 ? btrfs_root_node+0x1c/0x1d0 down_read_nested+0x3e/0x140 ? __btrfs_tree_read_lock+0x39/0x180 __btrfs_tree_read_lock+0x39/0x180 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x4bd/0x990 btrfs_find_root+0x45/0x1b0 btrfs_read_tree_root+0x61/0x100 btrfs_get_root_ref.part.50+0x143/0x630 btrfs_uuid_tree_iterate+0x207/0x314 ? btree_readpage+0x20/0x20 btrfs_uuid_rescan_kthread+0x12/0x50 kthread+0x133/0x150 ? kthread_create_on_node+0x60/0x60 ret_from_fork+0x1f/0x30 This problem exists because we have two different rescan threads, btrfs_uuid_scan_kthread which creates the uuid tree, and btrfs_uuid_tree_iterate that goes through and updates or deletes any out of date roots. The problem is they both do things in different order. btrfs_uuid_scan_kthread() reads the tree_root, and then inserts entries into the uuid_root. btrfs_uuid_tree_iterate() scans the uuid_root, but then does a btrfs_get_fs_root() which can read from the tree_root. It's actually easy enough to not be holding the path in btrfs_uuid_scan_kthread() when we add a uuid entry, as we already drop it further down and re-start the search when we loop. So simply move the path release before we add our entry to the uuid tree. This also fixes a problem where we're holding a path open after we do btrfs_end_transaction(), which has it's own problems. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-08-10 09:42:26 -06:00
btrfs_release_path(path);
if (trans) {
ret = btrfs_end_transaction(trans);
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 03:28:34 -06:00
trans = NULL;
if (ret)
break;
}
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
} else if (key.objectid < (u64)-1) {
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
key.objectid++;
} else {
break;
}
cond_resched();
}
out:
btrfs_free_path(path);
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 03:28:34 -06:00
if (trans && !IS_ERR(trans))
btrfs_end_transaction(trans);
if (ret)
btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
else
set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
up(&fs_info->uuid_tree_rescan_sem);
return 0;
}
/*
* Callback for btrfs_uuid_tree_iterate().
* returns:
* 0 check succeeded, the entry is not outdated.
* < 0 if an error occurred.
* > 0 if the check failed, which means the caller shall remove the entry.
*/
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
u8 *uuid, u8 type, u64 subid)
{
struct btrfs_key key;
int ret = 0;
struct btrfs_root *subvol_root;
if (type != BTRFS_UUID_KEY_SUBVOL &&
type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
goto out;
key.objectid = subid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(subvol_root)) {
ret = PTR_ERR(subvol_root);
if (ret == -ENOENT)
ret = 1;
goto out;
}
switch (type) {
case BTRFS_UUID_KEY_SUBVOL:
if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
ret = 1;
break;
case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
if (memcmp(uuid, subvol_root->root_item.received_uuid,
BTRFS_UUID_SIZE))
ret = 1;
break;
}
out:
return ret;
}
static int btrfs_uuid_rescan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
int ret;
/*
* 1st step is to iterate through the existing UUID tree and
* to delete all entries that contain outdated data.
* 2nd step is to add all missing entries to the UUID tree.
*/
ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
if (ret < 0) {
btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
up(&fs_info->uuid_tree_rescan_sem);
return ret;
}
return btrfs_uuid_scan_kthread(data);
}
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *uuid_root;
struct task_struct *task;
int ret;
/*
* 1 - root node
* 1 - root item
*/
trans = btrfs_start_transaction(tree_root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
ret = PTR_ERR(uuid_root);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
fs_info->uuid_root = uuid_root;
ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
down(&fs_info->uuid_tree_rescan_sem);
task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
if (IS_ERR(task)) {
/* fs_info->update_uuid_tree_gen remains 0 in all error case */
btrfs_warn(fs_info, "failed to start uuid_scan task");
up(&fs_info->uuid_tree_rescan_sem);
return PTR_ERR(task);
}
return 0;
}
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
struct task_struct *task;
down(&fs_info->uuid_tree_rescan_sem);
task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
if (IS_ERR(task)) {
/* fs_info->update_uuid_tree_gen remains 0 in all error case */
btrfs_warn(fs_info, "failed to start uuid_rescan task");
up(&fs_info->uuid_tree_rescan_sem);
return PTR_ERR(task);
}
return 0;
}
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
* The chunk relocation code actually frees the device extent
*/
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
u64 length;
u64 chunk_offset;
int ret;
int slot;
int failed = 0;
bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff;
u64 start;
new_size = round_down(new_size, fs_info->sectorsize);
start = new_size;
diff = round_down(old_size - new_size, fs_info->sectorsize);
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
return -EINVAL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_BACK;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_total_bytes(device, new_size);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
device->fs_devices->total_rw_bytes -= diff;
atomic64_sub(diff, &fs_info->free_chunk_space);
}
/*
* Once the device's size has been set to the new size, ensure all
* in-memory chunks are synced to disk so that the loop below sees them
* and relocates them accordingly.
*/
if (contains_pending_extent(device, &start, diff)) {
mutex_unlock(&fs_info->chunk_mutex);
ret = btrfs_commit_transaction(trans);
if (ret)
goto done;
} else {
mutex_unlock(&fs_info->chunk_mutex);
btrfs_end_transaction(trans);
}
again:
key.objectid = device->devid;
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
do {
mutex_lock(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto done;
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
}
ret = btrfs_previous_item(root, path, 0, key.type);
Btrfs: fix race between balance and unused block group deletion We have a race between deleting an unused block group and balancing the same block group that leads to an assertion failure/BUG(), producing the following trace: [181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622 [181631.220591] ------------[ cut here ]------------ [181631.222959] kernel BUG at fs/btrfs/ctree.h:4062! [181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$ [181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000 [181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs] [181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246 [181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce [181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff [181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000 [181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200 [181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000 [181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000 [181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0 [181631.224566] Stack: [181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e [181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001 [181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000 [181631.224566] Call Trace: [181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs] [181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs] [181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs] [181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs] [181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs] [181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs] [181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf [181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs] [181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15 [181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2 [181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424 [181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479 (...) The sequence of steps leading to this are: CPU 0 CPU 1 btrfs_balance() btrfs_relocate_chunk() btrfs_relocate_block_group(bg X) btrfs_lookup_block_group(bg X) cleaner_kthread locks fs_info->cleaner_mutex btrfs_delete_unused_bgs() finds bg X, which became unused in the previous transaction checks bg X ->ro == 0, so it proceeds sets bg X ->ro to 1 (btrfs_set_block_group_ro(bg X)) blocks on fs_info->cleaner_mutex btrfs_remove_chunk(bg X) unlocks fs_info->cleaner_mutex acquires fs_info->cleaner_mutex relocate_block_group() --> does nothing, no extents found in the extent tree from bg X unlocks fs_info->cleaner_mutex btrfs_relocate_block_group(bg X) returns btrfs_remove_chunk(bg X) extent map not found --> ASSERT(0) Fix this by using a new mutex to make sure these 2 operations, block group relocation and removal, are serialized. This issue is reproducible by running fstests generic/038 (which stresses chunk allocation and automatic removal of unused block groups) together with the following balance loop: while true; do btrfs balance start -dusage=0 <mountpoint> ; done Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-10 17:58:53 -06:00
if (ret)
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto done;
if (ret) {
ret = 0;
btrfs_release_path(path);
break;
}
l = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
/*
* We may be relocating the only data chunk we have,
* which could potentially end up with losing data's
* raid profile, so lets allocate an empty one in
* advance.
*/
ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto done;
}
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret == -ENOSPC) {
failed++;
} else if (ret) {
if (ret == -ETXTBSY) {
btrfs_warn(fs_info,
"could not shrink block group %llu due to active swapfile",
chunk_offset);
}
goto done;
}
} while (key.offset-- > 0);
if (failed && !retried) {
failed = 0;
retried = true;
goto again;
} else if (failed && retried) {
ret = -ENOSPC;
goto done;
}
/* Shrinking succeeded, else we would be at "done". */
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto done;
}
mutex_lock(&fs_info->chunk_mutex);
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
list_add_tail(&device->post_commit_list,
&trans->transaction->dev_update_list);
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy,
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
} else {
ret = btrfs_commit_transaction(trans);
}
done:
btrfs_free_path(path);
Btrfs: check pending chunks when shrinking fs to avoid corruption When we shrink the usable size of a device (its total_bytes), we go over all the device extent items in the device tree and attempt to relocate the chunk of any device extent that goes beyond the new usable size for the device. We do that after setting the new usable size (total_bytes) in the device object, so that all new allocations (and reallocations) don't use areas of the device that go beyond the new (shorter) size. However we were not considering that before setting the new size in the device, pending chunks might have been created that use device extents that go beyond the new size, and those device extents are not yet in the device tree after we search the device tree - they are still attached to the list of new block group for some ongoing transaction handle, and they are only added to the device tree when the transaction handle is ended (via btrfs_create_pending_block_groups()). So check for pending chunks with device extents that go beyond the new size and if any exists, commit the current transaction and repeat the search in the device tree. Not doing this it would mean we would return success to user space while still having extents that go beyond the new size, and later user space could override those locations on the device while the fs still references them, causing all sorts of corruption and unexpected events. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-02 07:43:21 -06:00
if (ret) {
mutex_lock(&fs_info->chunk_mutex);
Btrfs: check pending chunks when shrinking fs to avoid corruption When we shrink the usable size of a device (its total_bytes), we go over all the device extent items in the device tree and attempt to relocate the chunk of any device extent that goes beyond the new usable size for the device. We do that after setting the new usable size (total_bytes) in the device object, so that all new allocations (and reallocations) don't use areas of the device that go beyond the new (shorter) size. However we were not considering that before setting the new size in the device, pending chunks might have been created that use device extents that go beyond the new size, and those device extents are not yet in the device tree after we search the device tree - they are still attached to the list of new block group for some ongoing transaction handle, and they are only added to the device tree when the transaction handle is ended (via btrfs_create_pending_block_groups()). So check for pending chunks with device extents that go beyond the new size and if any exists, commit the current transaction and repeat the search in the device tree. Not doing this it would mean we would return success to user space while still having extents that go beyond the new size, and later user space could override those locations on the device while the fs still references them, causing all sorts of corruption and unexpected events. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-02 07:43:21 -06:00
btrfs_device_set_total_bytes(device, old_size);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
Btrfs: check pending chunks when shrinking fs to avoid corruption When we shrink the usable size of a device (its total_bytes), we go over all the device extent items in the device tree and attempt to relocate the chunk of any device extent that goes beyond the new usable size for the device. We do that after setting the new usable size (total_bytes) in the device object, so that all new allocations (and reallocations) don't use areas of the device that go beyond the new (shorter) size. However we were not considering that before setting the new size in the device, pending chunks might have been created that use device extents that go beyond the new size, and those device extents are not yet in the device tree after we search the device tree - they are still attached to the list of new block group for some ongoing transaction handle, and they are only added to the device tree when the transaction handle is ended (via btrfs_create_pending_block_groups()). So check for pending chunks with device extents that go beyond the new size and if any exists, commit the current transaction and repeat the search in the device tree. Not doing this it would mean we would return success to user space while still having extents that go beyond the new size, and later user space could override those locations on the device while the fs still references them, causing all sorts of corruption and unexpected events. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-02 07:43:21 -06:00
device->fs_devices->total_rw_bytes += diff;
atomic64_add(diff, &fs_info->free_chunk_space);
mutex_unlock(&fs_info->chunk_mutex);
Btrfs: check pending chunks when shrinking fs to avoid corruption When we shrink the usable size of a device (its total_bytes), we go over all the device extent items in the device tree and attempt to relocate the chunk of any device extent that goes beyond the new usable size for the device. We do that after setting the new usable size (total_bytes) in the device object, so that all new allocations (and reallocations) don't use areas of the device that go beyond the new (shorter) size. However we were not considering that before setting the new size in the device, pending chunks might have been created that use device extents that go beyond the new size, and those device extents are not yet in the device tree after we search the device tree - they are still attached to the list of new block group for some ongoing transaction handle, and they are only added to the device tree when the transaction handle is ended (via btrfs_create_pending_block_groups()). So check for pending chunks with device extents that go beyond the new size and if any exists, commit the current transaction and repeat the search in the device tree. Not doing this it would mean we would return success to user space while still having extents that go beyond the new size, and later user space could override those locations on the device while the fs still references them, causing all sorts of corruption and unexpected events. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-02 07:43:21 -06:00
}
return ret;
}
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
mutex_lock(&fs_info->chunk_mutex);
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
mutex_unlock(&fs_info->chunk_mutex);
return -EFBIG;
}
ptr = super_copy->sys_chunk_array + array_size;
btrfs_cpu_key_to_disk(&disk_key, key);
memcpy(ptr, &disk_key, sizeof(disk_key));
ptr += sizeof(disk_key);
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
mutex_unlock(&fs_info->chunk_mutex);
return 0;
}
/*
* sort the devices in descending order by max_avail, total_avail
*/
static int btrfs_cmp_device_info(const void *a, const void *b)
{
const struct btrfs_device_info *di_a = a;
const struct btrfs_device_info *di_b = b;
if (di_a->max_avail > di_b->max_avail)
return -1;
if (di_a->max_avail < di_b->max_avail)
return 1;
if (di_a->total_avail > di_b->total_avail)
return -1;
if (di_a->total_avail < di_b->total_avail)
return 1;
return 0;
}
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
return;
btrfs_set_fs_incompat(info, RAID56);
}
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 start, u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
struct btrfs_device *device;
struct map_lookup *map = NULL;
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_device_info *devices_info = NULL;
u64 total_avail;
int num_stripes; /* total number of stripes to allocate */
int data_stripes; /* number of stripes that count for
block group size */
int sub_stripes; /* sub_stripes info for map */
int dev_stripes; /* stripes per dev */
int devs_max; /* max devs to use */
int devs_min; /* min devs needed */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
int nparity; /* number of stripes worth of bytes to
store parity information */
int ret;
u64 max_stripe_size;
u64 max_chunk_size;
u64 stripe_size;
u64 chunk_size;
int ndevs;
int i;
int j;
int index;
BUG_ON(!alloc_profile_is_valid(type, 0));
if (list_empty(&fs_devices->alloc_list)) {
if (btrfs_test_opt(info, ENOSPC_DEBUG))
btrfs_debug(info, "%s: no writable device", __func__);
return -ENOSPC;
}
index = btrfs_bg_flags_to_raid_index(type);
sub_stripes = btrfs_raid_array[index].sub_stripes;
dev_stripes = btrfs_raid_array[index].dev_stripes;
devs_max = btrfs_raid_array[index].devs_max;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info);
devs_min = btrfs_raid_array[index].devs_min;
devs_increment = btrfs_raid_array[index].devs_increment;
ncopies = btrfs_raid_array[index].ncopies;
nparity = btrfs_raid_array[index].nparity;
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = SZ_1G;
max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
max_stripe_size = SZ_1G;
else
max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
btrfs_err(info, "invalid chunk type 0x%llx requested",
type);
btrfs: use BUG() instead of BUG_ON(1) BUG_ON(1) leads to bogus warnings from clang when CONFIG_PROFILE_ANNOTATED_BRANCHES is set: fs/btrfs/volumes.c:5041:3: error: variable 'max_chunk_size' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized] BUG_ON(1); ^~~~~~~~~ include/asm-generic/bug.h:61:36: note: expanded from macro 'BUG_ON' #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) ^~~~~~~~~~~~~~~~~~~ include/linux/compiler.h:48:23: note: expanded from macro 'unlikely' # define unlikely(x) (__branch_check__(x, 0, __builtin_constant_p(x))) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fs/btrfs/volumes.c:5046:9: note: uninitialized use occurs here max_chunk_size); ^~~~~~~~~~~~~~ include/linux/kernel.h:860:36: note: expanded from macro 'min' #define min(x, y) __careful_cmp(x, y, <) ^ include/linux/kernel.h:853:17: note: expanded from macro '__careful_cmp' __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op)) ^ include/linux/kernel.h:847:25: note: expanded from macro '__cmp_once' typeof(y) unique_y = (y); \ ^ fs/btrfs/volumes.c:5041:3: note: remove the 'if' if its condition is always true BUG_ON(1); ^ include/asm-generic/bug.h:61:32: note: expanded from macro 'BUG_ON' #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) ^ fs/btrfs/volumes.c:4993:20: note: initialize the variable 'max_chunk_size' to silence this warning u64 max_chunk_size; ^ = 0 Change it to BUG() so clang can see that this code path can never continue. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: David Sterba <dsterba@suse.com>
2019-03-25 07:02:25 -06:00
BUG();
}
/* We don't want a chunk larger than 10% of writable space */
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
GFP_NOFS);
if (!devices_info)
return -ENOMEM;
/*
* in the first pass through the devices list, we gather information
* about the available holes on each device.
*/
ndevs = 0;
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 max_avail;
u64 dev_offset;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
WARN(1, KERN_ERR
"BTRFS: read-only device in alloc_list\n");
continue;
}
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&device->dev_state) ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
continue;
if (device->total_bytes > device->bytes_used)
total_avail = device->total_bytes - device->bytes_used;
else
total_avail = 0;
/* If there is no space on this device, skip it. */
if (total_avail == 0)
continue;
ret = find_free_dev_extent(device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
goto error;
if (ret == 0)
max_avail = max_stripe_size * dev_stripes;
if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
if (btrfs_test_opt(info, ENOSPC_DEBUG))
btrfs_debug(info,
"%s: devid %llu has no free space, have=%llu want=%u",
__func__, device->devid, max_avail,
BTRFS_STRIPE_LEN * dev_stripes);
continue;
}
if (ndevs == fs_devices->rw_devices) {
WARN(1, "%s: found more than %llu devices\n",
__func__, fs_devices->rw_devices);
break;
}
devices_info[ndevs].dev_offset = dev_offset;
devices_info[ndevs].max_avail = max_avail;
devices_info[ndevs].total_avail = total_avail;
devices_info[ndevs].dev = device;
++ndevs;
}
/*
* now sort the devices by hole size / available space
*/
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
btrfs_cmp_device_info, NULL);
/* round down to number of usable stripes */
ndevs = round_down(ndevs, devs_increment);
if (ndevs < devs_min) {
ret = -ENOSPC;
if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
btrfs_debug(info,
"%s: not enough devices with free space: have=%d minimum required=%d",
__func__, ndevs, devs_min);
}
goto error;
}
ndevs = min(ndevs, devs_max);
/*
btrfs: alloc_chunk: fix DUP stripe size handling In case of using DUP, we search for enough unallocated disk space on a device to hold two stripes. The devices_info[ndevs-1].max_avail that holds the amount of unallocated space found is directly assigned to stripe_size, while it's actually twice the stripe size. Later on in the code, an unconditional division of stripe_size by dev_stripes corrects the value, but in the meantime there's a check to see if the stripe_size does not exceed max_chunk_size. Since during this check stripe_size is twice the amount as intended, the check will reduce the stripe_size to max_chunk_size if the actual correct to be used stripe_size is more than half the amount of max_chunk_size. The unconditional division later tries to correct stripe_size, but will actually make sure we can't allocate more than half the max_chunk_size. Fix this by moving the division by dev_stripes before the max chunk size check, so it always contains the right value, instead of putting a duct tape division in further on to get it fixed again. Since in all other cases than DUP, dev_stripes is 1, this change only affects DUP. Other attempts in the past were made to fix this: * 37db63a400 "Btrfs: fix max chunk size check in chunk allocator" tried to fix the same problem, but still resulted in part of the code acting on a wrongly doubled stripe_size value. * 86db25785a "Btrfs: fix max chunk size on raid5/6" unintentionally broke this fix again. The real problem was already introduced with the rest of the code in 73c5de0051. The user visible result however will be that the max chunk size for DUP will suddenly double, while it's actually acting according to the limits in the code again like it was 5 years ago. Reported-by: Naohiro Aota <naohiro.aota@wdc.com> Link: https://www.spinics.net/lists/linux-btrfs/msg69752.html Fixes: 73c5de0051 ("btrfs: quasi-round-robin for chunk allocation") Fixes: 86db25785a ("Btrfs: fix max chunk size on raid5/6") Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Reviewed-by: David Sterba <dsterba@suse.com> [ update comment ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-02-05 09:45:11 -07:00
* The primary goal is to maximize the number of stripes, so use as
* many devices as possible, even if the stripes are not maximum sized.
*
* The DUP profile stores more than one stripe per device, the
* max_avail is the total size so we have to adjust.
*/
btrfs: alloc_chunk: fix DUP stripe size handling In case of using DUP, we search for enough unallocated disk space on a device to hold two stripes. The devices_info[ndevs-1].max_avail that holds the amount of unallocated space found is directly assigned to stripe_size, while it's actually twice the stripe size. Later on in the code, an unconditional division of stripe_size by dev_stripes corrects the value, but in the meantime there's a check to see if the stripe_size does not exceed max_chunk_size. Since during this check stripe_size is twice the amount as intended, the check will reduce the stripe_size to max_chunk_size if the actual correct to be used stripe_size is more than half the amount of max_chunk_size. The unconditional division later tries to correct stripe_size, but will actually make sure we can't allocate more than half the max_chunk_size. Fix this by moving the division by dev_stripes before the max chunk size check, so it always contains the right value, instead of putting a duct tape division in further on to get it fixed again. Since in all other cases than DUP, dev_stripes is 1, this change only affects DUP. Other attempts in the past were made to fix this: * 37db63a400 "Btrfs: fix max chunk size check in chunk allocator" tried to fix the same problem, but still resulted in part of the code acting on a wrongly doubled stripe_size value. * 86db25785a "Btrfs: fix max chunk size on raid5/6" unintentionally broke this fix again. The real problem was already introduced with the rest of the code in 73c5de0051. The user visible result however will be that the max chunk size for DUP will suddenly double, while it's actually acting according to the limits in the code again like it was 5 years ago. Reported-by: Naohiro Aota <naohiro.aota@wdc.com> Link: https://www.spinics.net/lists/linux-btrfs/msg69752.html Fixes: 73c5de0051 ("btrfs: quasi-round-robin for chunk allocation") Fixes: 86db25785a ("Btrfs: fix max chunk size on raid5/6") Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Reviewed-by: David Sterba <dsterba@suse.com> [ update comment ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-02-05 09:45:11 -07:00
stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
num_stripes = ndevs * dev_stripes;
/*
* this will have to be fixed for RAID1 and RAID10 over
* more drives
*/
data_stripes = (num_stripes - nparity) / ncopies;
/*
* Use the number of data stripes to figure out how big this chunk
* is really going to be in terms of logical address space,
btrfs: alloc_chunk: fix more DUP stripe size handling Commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling" fixed calculating the stripe_size for a new DUP chunk. However, the same calculation reappears a bit later, and that one was not changed yet. The resulting bug that is exposed is that the newly allocated device extents ('stripes') can have a few MiB overlap with the next thing stored after them, which is another device extent or the end of the disk. The scenario in which this can happen is: * The block device for the filesystem is less than 10GiB in size. * The amount of contiguous free unallocated disk space chosen to use for chunk allocation is 20% of the total device size, or a few MiB more or less. An example: - The filesystem device is 7880MiB (max_chunk_size gets set to 788MiB) - There's 1578MiB unallocated raw disk space left in one contiguous piece. In this case stripe_size is first calculated as 789MiB, (half of 1578MiB). Since 789MiB (stripe_size * data_stripes) > 788MiB (max_chunk_size), we enter the if block. Now stripe_size value is immediately overwritten while calculating an adjusted value based on max_chunk_size, which ends up as 788MiB. Next, the value is rounded up to a 16MiB boundary, 800MiB, which is actually more than the value we had before. However, the last comparison fails to detect this, because it's comparing the value with the total amount of free space, which is about twice the size of stripe_size. In the example above, this means that the resulting raw disk space being allocated is 1600MiB, while only a gap of 1578MiB has been found. The second device extent object for this DUP chunk will overlap for 22MiB with whatever comes next. The underlying problem here is that the stripe_size is reused all the time for different things. So, when entering the code in the if block, stripe_size is immediately overwritten with something else. If later we decide we want to have the previous value back, then the logic to compute it was copy pasted in again. With this change, the value in stripe_size is not unnecessarily destroyed, so the duplicated calculation is not needed any more. Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-04 15:24:40 -06:00
* and compare that answer with the max chunk size. If it's higher,
* we try to reduce stripe_size.
*/
if (stripe_size * data_stripes > max_chunk_size) {
/*
btrfs: alloc_chunk: fix more DUP stripe size handling Commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling" fixed calculating the stripe_size for a new DUP chunk. However, the same calculation reappears a bit later, and that one was not changed yet. The resulting bug that is exposed is that the newly allocated device extents ('stripes') can have a few MiB overlap with the next thing stored after them, which is another device extent or the end of the disk. The scenario in which this can happen is: * The block device for the filesystem is less than 10GiB in size. * The amount of contiguous free unallocated disk space chosen to use for chunk allocation is 20% of the total device size, or a few MiB more or less. An example: - The filesystem device is 7880MiB (max_chunk_size gets set to 788MiB) - There's 1578MiB unallocated raw disk space left in one contiguous piece. In this case stripe_size is first calculated as 789MiB, (half of 1578MiB). Since 789MiB (stripe_size * data_stripes) > 788MiB (max_chunk_size), we enter the if block. Now stripe_size value is immediately overwritten while calculating an adjusted value based on max_chunk_size, which ends up as 788MiB. Next, the value is rounded up to a 16MiB boundary, 800MiB, which is actually more than the value we had before. However, the last comparison fails to detect this, because it's comparing the value with the total amount of free space, which is about twice the size of stripe_size. In the example above, this means that the resulting raw disk space being allocated is 1600MiB, while only a gap of 1578MiB has been found. The second device extent object for this DUP chunk will overlap for 22MiB with whatever comes next. The underlying problem here is that the stripe_size is reused all the time for different things. So, when entering the code in the if block, stripe_size is immediately overwritten with something else. If later we decide we want to have the previous value back, then the logic to compute it was copy pasted in again. With this change, the value in stripe_size is not unnecessarily destroyed, so the duplicated calculation is not needed any more. Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-04 15:24:40 -06:00
* Reduce stripe_size, round it up to a 16MB boundary again and
* then use it, unless it ends up being even bigger than the
* previous value we had already.
*/
btrfs: alloc_chunk: fix more DUP stripe size handling Commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling" fixed calculating the stripe_size for a new DUP chunk. However, the same calculation reappears a bit later, and that one was not changed yet. The resulting bug that is exposed is that the newly allocated device extents ('stripes') can have a few MiB overlap with the next thing stored after them, which is another device extent or the end of the disk. The scenario in which this can happen is: * The block device for the filesystem is less than 10GiB in size. * The amount of contiguous free unallocated disk space chosen to use for chunk allocation is 20% of the total device size, or a few MiB more or less. An example: - The filesystem device is 7880MiB (max_chunk_size gets set to 788MiB) - There's 1578MiB unallocated raw disk space left in one contiguous piece. In this case stripe_size is first calculated as 789MiB, (half of 1578MiB). Since 789MiB (stripe_size * data_stripes) > 788MiB (max_chunk_size), we enter the if block. Now stripe_size value is immediately overwritten while calculating an adjusted value based on max_chunk_size, which ends up as 788MiB. Next, the value is rounded up to a 16MiB boundary, 800MiB, which is actually more than the value we had before. However, the last comparison fails to detect this, because it's comparing the value with the total amount of free space, which is about twice the size of stripe_size. In the example above, this means that the resulting raw disk space being allocated is 1600MiB, while only a gap of 1578MiB has been found. The second device extent object for this DUP chunk will overlap for 22MiB with whatever comes next. The underlying problem here is that the stripe_size is reused all the time for different things. So, when entering the code in the if block, stripe_size is immediately overwritten with something else. If later we decide we want to have the previous value back, then the logic to compute it was copy pasted in again. With this change, the value in stripe_size is not unnecessarily destroyed, so the duplicated calculation is not needed any more. Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-04 15:24:40 -06:00
stripe_size = min(round_up(div_u64(max_chunk_size,
data_stripes), SZ_16M),
stripe_size);
}
/* align to BTRFS_STRIPE_LEN */
stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
ret = -ENOMEM;
goto error;
}
map->num_stripes = num_stripes;
for (i = 0; i < ndevs; ++i) {
for (j = 0; j < dev_stripes; ++j) {
int s = i * dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
j * stripe_size;
}
}
map->stripe_len = BTRFS_STRIPE_LEN;
map->io_align = BTRFS_STRIPE_LEN;
map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
map->sub_stripes = sub_stripes;
chunk_size = stripe_size * data_stripes;
trace_btrfs_chunk_alloc(info, map, start, chunk_size);
Btrfs: add initial tracepoint support for btrfs Tracepoints can provide insight into why btrfs hits bugs and be greatly helpful for debugging, e.g dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0 dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0 btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0) btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0) btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8 flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0) flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0) flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0) Here is what I have added: 1) ordere_extent: btrfs_ordered_extent_add btrfs_ordered_extent_remove btrfs_ordered_extent_start btrfs_ordered_extent_put These provide critical information to understand how ordered_extents are updated. 2) extent_map: btrfs_get_extent extent_map is used in both read and write cases, and it is useful for tracking how btrfs specific IO is running. 3) writepage: __extent_writepage btrfs_writepage_end_io_hook Pages are cirtical resourses and produce a lot of corner cases during writeback, so it is valuable to know how page is written to disk. 4) inode: btrfs_inode_new btrfs_inode_request btrfs_inode_evict These can show where and when a inode is created, when a inode is evicted. 5) sync: btrfs_sync_file btrfs_sync_fs These show sync arguments. 6) transaction: btrfs_transaction_commit In transaction based filesystem, it will be useful to know the generation and who does commit. 7) back reference and cow: btrfs_delayed_tree_ref btrfs_delayed_data_ref btrfs_delayed_ref_head btrfs_cow_block Btrfs natively supports back references, these tracepoints are helpful on understanding btrfs's COW mechanism. 8) chunk: btrfs_chunk_alloc btrfs_chunk_free Chunk is a link between physical offset and logical offset, and stands for space infomation in btrfs, and these are helpful on tracing space things. 9) reserved_extent: btrfs_reserved_extent_alloc btrfs_reserved_extent_free These can show how btrfs uses its space. Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 05:18:59 -06:00
em = alloc_extent_map();
if (!em) {
Btrfs: fix NULL pointer crash when running balance and scrub concurrently While running balance, scrub, fsstress concurrently we hit the following kernel crash: [56561.448845] BTRFS info (device sde): relocating block group 11005853696 flags 132 [56561.524077] BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 [56561.524237] IP: [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.524297] PGD 9be28067 PUD 7f3dd067 PMD 0 [56561.524325] Oops: 0000 [#1] SMP [....] [56561.527237] Call Trace: [56561.527309] [<ffffffffa038980e>] scrub_enumerate_chunks+0x24e/0x490 [btrfs] [56561.527392] [<ffffffff810abe00>] ? abort_exclusive_wait+0x50/0xb0 [56561.527476] [<ffffffffa038add4>] btrfs_scrub_dev+0x1a4/0x530 [btrfs] [56561.527561] [<ffffffffa0368107>] btrfs_ioctl+0x13f7/0x2a90 [btrfs] [56561.527639] [<ffffffff811c82f0>] do_vfs_ioctl+0x2e0/0x4c0 [56561.527712] [<ffffffff8109c384>] ? vtime_account_user+0x54/0x60 [56561.527788] [<ffffffff810f768c>] ? __audit_syscall_entry+0x9c/0xf0 [56561.527870] [<ffffffff811c8551>] SyS_ioctl+0x81/0xa0 [56561.527941] [<ffffffff815707f7>] tracesys+0xdd/0xe2 [...] [56561.528304] RIP [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.528395] RSP <ffff88004c0f5be8> [56561.528454] CR2: 0000000000000078 This is because in btrfs_relocate_chunk(), we will free @bdev directly while scrub may still hold extent mapping, and may access freed memory. Fix this problem by wrapping freeing @bdev work into free_extent_map() which is based on reference count. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-06-18 20:42:52 -06:00
kfree(map);
ret = -ENOMEM;
goto error;
}
Btrfs: fix NULL pointer crash when running balance and scrub concurrently While running balance, scrub, fsstress concurrently we hit the following kernel crash: [56561.448845] BTRFS info (device sde): relocating block group 11005853696 flags 132 [56561.524077] BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 [56561.524237] IP: [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.524297] PGD 9be28067 PUD 7f3dd067 PMD 0 [56561.524325] Oops: 0000 [#1] SMP [....] [56561.527237] Call Trace: [56561.527309] [<ffffffffa038980e>] scrub_enumerate_chunks+0x24e/0x490 [btrfs] [56561.527392] [<ffffffff810abe00>] ? abort_exclusive_wait+0x50/0xb0 [56561.527476] [<ffffffffa038add4>] btrfs_scrub_dev+0x1a4/0x530 [btrfs] [56561.527561] [<ffffffffa0368107>] btrfs_ioctl+0x13f7/0x2a90 [btrfs] [56561.527639] [<ffffffff811c82f0>] do_vfs_ioctl+0x2e0/0x4c0 [56561.527712] [<ffffffff8109c384>] ? vtime_account_user+0x54/0x60 [56561.527788] [<ffffffff810f768c>] ? __audit_syscall_entry+0x9c/0xf0 [56561.527870] [<ffffffff811c8551>] SyS_ioctl+0x81/0xa0 [56561.527941] [<ffffffff815707f7>] tracesys+0xdd/0xe2 [...] [56561.528304] RIP [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.528395] RSP <ffff88004c0f5be8> [56561.528454] CR2: 0000000000000078 This is because in btrfs_relocate_chunk(), we will free @bdev directly while scrub may still hold extent mapping, and may access freed memory. Fix this problem by wrapping freeing @bdev work into free_extent_map() which is based on reference count. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-06-18 20:42:52 -06:00
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
em->start = start;
em->len = chunk_size;
em->block_start = 0;
em->block_len = em->len;
em->orig_block_len = stripe_size;
em_tree = &info->mapping_tree;
write_lock(&em_tree->lock);
2013-04-05 14:51:15 -06:00
ret = add_extent_mapping(em_tree, em, 0);
if (ret) {
write_unlock(&em_tree->lock);
free_extent_map(em);
goto error;
}
write_unlock(&em_tree->lock);
ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
if (ret)
goto error_del_extent;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size);
if (list_empty(&dev->post_commit_list))
list_add_tail(&dev->post_commit_list,
&trans->transaction->dev_update_list);
}
atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
free_extent_map(em);
check_raid56_incompat_flag(info, type);
kfree(devices_info);
return 0;
error_del_extent:
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* One for our allocation */
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
error:
kfree(devices_info);
return ret;
}
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 chunk_offset, u64 chunk_size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_key key;
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
struct extent_map *em;
struct map_lookup *map;
size_t item_size;
u64 dev_offset;
u64 stripe_size;
int i = 0;
int ret = 0;
em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
if (IS_ERR(em))
return PTR_ERR(em);
map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
stripe_size = em->orig_block_len;
chunk = kzalloc(item_size, GFP_NOFS);
if (!chunk) {
ret = -ENOMEM;
goto out;
}
Btrfs: fix race when finishing dev replace leading to transaction abort During the final phase of a device replace operation, I ran into a transaction abort that resulted in the following trace: [23919.655368] WARNING: CPU: 10 PID: 30175 at fs/btrfs/extent-tree.c:9843 btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs]() [23919.664742] BTRFS: Transaction aborted (error -2) [23919.665749] Modules linked in: btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc i2c_piix4 parport psmouse acpi_cpufreq processor i2c_core evdev microcode pcspkr button serio_raw ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom virtio_scsi ata_generic ata_piix virtio_pci floppy virtio_ring libata e1000 virtio scsi_mod [last unloaded: btrfs] [23919.679442] CPU: 10 PID: 30175 Comm: fsstress Not tainted 4.3.0-rc5-btrfs-next-17+ #1 [23919.682392] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [23919.689151] 0000000000000000 ffff8804020cbb50 ffffffff812566f4 ffff8804020cbb98 [23919.692604] ffff8804020cbb88 ffffffff8104d0a6 ffffffffa03eea69 ffff88041b678a48 [23919.694230] ffff88042ac38000 ffff88041b678930 00000000fffffffe ffff8804020cbbf0 [23919.696716] Call Trace: [23919.698669] [<ffffffff812566f4>] dump_stack+0x4e/0x79 [23919.700597] [<ffffffff8104d0a6>] warn_slowpath_common+0x9f/0xb8 [23919.701958] [<ffffffffa03eea69>] ? btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs] [23919.703612] [<ffffffff8104d107>] warn_slowpath_fmt+0x48/0x50 [23919.705047] [<ffffffffa03eea69>] btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs] [23919.706967] [<ffffffffa0402097>] __btrfs_end_transaction+0x84/0x2dd [btrfs] [23919.708611] [<ffffffffa0402300>] btrfs_end_transaction+0x10/0x12 [btrfs] [23919.710099] [<ffffffffa03ef0b8>] btrfs_alloc_data_chunk_ondemand+0x121/0x28b [btrfs] [23919.711970] [<ffffffffa0413025>] btrfs_fallocate+0x7d3/0xc6d [btrfs] [23919.713602] [<ffffffff8108b78f>] ? lock_acquire+0x10d/0x194 [23919.714756] [<ffffffff81086dbc>] ? percpu_down_read+0x51/0x78 [23919.716155] [<ffffffff8116ef1d>] ? __sb_start_write+0x5f/0xb0 [23919.718918] [<ffffffff8116ef1d>] ? __sb_start_write+0x5f/0xb0 [23919.724170] [<ffffffff8116b579>] vfs_fallocate+0x170/0x1ff [23919.725482] [<ffffffff8117c1d7>] ioctl_preallocate+0x89/0x9b [23919.726790] [<ffffffff8117c5ef>] do_vfs_ioctl+0x406/0x4e6 [23919.728428] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e [23919.729642] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71 [23919.730782] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79 [23919.731847] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f [23919.733330] ---[ end trace 166ef301a335832a ]--- This is due to a race between device replace and chunk allocation, which the following diagram illustrates: CPU 1 CPU 2 btrfs_dev_replace_finishing() at this point dev_replace->tgtdev->devid == BTRFS_DEV_REPLACE_DEVID (0ULL) ... btrfs_start_transaction() btrfs_commit_transaction() btrfs_fallocate() btrfs_alloc_data_chunk_ondemand() btrfs_join_transaction() --> starts a new transaction do_chunk_alloc() lock fs_info->chunk_mutex btrfs_alloc_chunk() --> creates extent map for the new chunk with em->bdev->map->stripes[i]->dev->devid == X (X > 0) --> extent map is added to fs_info->mapping_tree --> initial phase of bg A allocation completes unlock fs_info->chunk_mutex lock fs_info->chunk_mutex btrfs_dev_replace_update_device_in_mapping_tree() --> iterates fs_info->mapping_tree and replaces the device in every extent map's map->stripes[] with dev_replace->tgtdev, which still has an id of 0ULL (BTRFS_DEV_REPLACE_DEVID) btrfs_end_transaction() btrfs_create_pending_block_groups() --> starts final phase of bg A creation (update device, extent, and chunk trees, etc) btrfs_finish_chunk_alloc() btrfs_update_device() --> attempts to update a device item with ID == 0ULL (BTRFS_DEV_REPLACE_DEVID) which is the current ID of bg A's em->bdev->map->stripes[i]->dev->devid --> doesn't find such item returns -ENOENT --> the device id should have been X and not 0ULL got -ENOENT from btrfs_finish_chunk_alloc() and aborts current transaction finishes setting up the target device, namely it sets tgtdev->devid to the value of srcdev->devid, which is X (and X > 0) frees the srcdev unlock fs_info->chunk_mutex So fix this by taking the device list mutex when processing the chunk's extent map stripes to update the device items. This avoids getting the wrong device id and use-after-free problems if the task finishing a chunk allocation grabs the replaced device, which is freed while the dev replace task is holding the device list mutex. This happened while running fstest btrfs/071. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2015-11-20 03:42:47 -07:00
/*
* Take the device list mutex to prevent races with the final phase of
* a device replace operation that replaces the device object associated
* with the map's stripes, because the device object's id can change
* at any time during that final phase of the device replace operation
* (dev-replace.c:btrfs_dev_replace_finishing()).
*/
mutex_lock(&fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
ret = btrfs_update_device(trans, device);
if (ret)
Btrfs: fix race when finishing dev replace leading to transaction abort During the final phase of a device replace operation, I ran into a transaction abort that resulted in the following trace: [23919.655368] WARNING: CPU: 10 PID: 30175 at fs/btrfs/extent-tree.c:9843 btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs]() [23919.664742] BTRFS: Transaction aborted (error -2) [23919.665749] Modules linked in: btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc i2c_piix4 parport psmouse acpi_cpufreq processor i2c_core evdev microcode pcspkr button serio_raw ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom virtio_scsi ata_generic ata_piix virtio_pci floppy virtio_ring libata e1000 virtio scsi_mod [last unloaded: btrfs] [23919.679442] CPU: 10 PID: 30175 Comm: fsstress Not tainted 4.3.0-rc5-btrfs-next-17+ #1 [23919.682392] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [23919.689151] 0000000000000000 ffff8804020cbb50 ffffffff812566f4 ffff8804020cbb98 [23919.692604] ffff8804020cbb88 ffffffff8104d0a6 ffffffffa03eea69 ffff88041b678a48 [23919.694230] ffff88042ac38000 ffff88041b678930 00000000fffffffe ffff8804020cbbf0 [23919.696716] Call Trace: [23919.698669] [<ffffffff812566f4>] dump_stack+0x4e/0x79 [23919.700597] [<ffffffff8104d0a6>] warn_slowpath_common+0x9f/0xb8 [23919.701958] [<ffffffffa03eea69>] ? btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs] [23919.703612] [<ffffffff8104d107>] warn_slowpath_fmt+0x48/0x50 [23919.705047] [<ffffffffa03eea69>] btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs] [23919.706967] [<ffffffffa0402097>] __btrfs_end_transaction+0x84/0x2dd [btrfs] [23919.708611] [<ffffffffa0402300>] btrfs_end_transaction+0x10/0x12 [btrfs] [23919.710099] [<ffffffffa03ef0b8>] btrfs_alloc_data_chunk_ondemand+0x121/0x28b [btrfs] [23919.711970] [<ffffffffa0413025>] btrfs_fallocate+0x7d3/0xc6d [btrfs] [23919.713602] [<ffffffff8108b78f>] ? lock_acquire+0x10d/0x194 [23919.714756] [<ffffffff81086dbc>] ? percpu_down_read+0x51/0x78 [23919.716155] [<ffffffff8116ef1d>] ? __sb_start_write+0x5f/0xb0 [23919.718918] [<ffffffff8116ef1d>] ? __sb_start_write+0x5f/0xb0 [23919.724170] [<ffffffff8116b579>] vfs_fallocate+0x170/0x1ff [23919.725482] [<ffffffff8117c1d7>] ioctl_preallocate+0x89/0x9b [23919.726790] [<ffffffff8117c5ef>] do_vfs_ioctl+0x406/0x4e6 [23919.728428] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e [23919.729642] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71 [23919.730782] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79 [23919.731847] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f [23919.733330] ---[ end trace 166ef301a335832a ]--- This is due to a race between device replace and chunk allocation, which the following diagram illustrates: CPU 1 CPU 2 btrfs_dev_replace_finishing() at this point dev_replace->tgtdev->devid == BTRFS_DEV_REPLACE_DEVID (0ULL) ... btrfs_start_transaction() btrfs_commit_transaction() btrfs_fallocate() btrfs_alloc_data_chunk_ondemand() btrfs_join_transaction() --> starts a new transaction do_chunk_alloc() lock fs_info->chunk_mutex btrfs_alloc_chunk() --> creates extent map for the new chunk with em->bdev->map->stripes[i]->dev->devid == X (X > 0) --> extent map is added to fs_info->mapping_tree --> initial phase of bg A allocation completes unlock fs_info->chunk_mutex lock fs_info->chunk_mutex btrfs_dev_replace_update_device_in_mapping_tree() --> iterates fs_info->mapping_tree and replaces the device in every extent map's map->stripes[] with dev_replace->tgtdev, which still has an id of 0ULL (BTRFS_DEV_REPLACE_DEVID) btrfs_end_transaction() btrfs_create_pending_block_groups() --> starts final phase of bg A creation (update device, extent, and chunk trees, etc) btrfs_finish_chunk_alloc() btrfs_update_device() --> attempts to update a device item with ID == 0ULL (BTRFS_DEV_REPLACE_DEVID) which is the current ID of bg A's em->bdev->map->stripes[i]->dev->devid --> doesn't find such item returns -ENOENT --> the device id should have been X and not 0ULL got -ENOENT from btrfs_finish_chunk_alloc() and aborts current transaction finishes setting up the target device, namely it sets tgtdev->devid to the value of srcdev->devid, which is X (and X > 0) frees the srcdev unlock fs_info->chunk_mutex So fix this by taking the device list mutex when processing the chunk's extent map stripes to update the device items. This avoids getting the wrong device id and use-after-free problems if the task finishing a chunk allocation grabs the replaced device, which is freed while the dev replace task is holding the device list mutex. This happened while running fstest btrfs/071. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2015-11-20 03:42:47 -07:00
break;
ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
dev_offset, stripe_size);
if (ret)
Btrfs: fix race when finishing dev replace leading to transaction abort During the final phase of a device replace operation, I ran into a transaction abort that resulted in the following trace: [23919.655368] WARNING: CPU: 10 PID: 30175 at fs/btrfs/extent-tree.c:9843 btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs]() [23919.664742] BTRFS: Transaction aborted (error -2) [23919.665749] Modules linked in: btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc i2c_piix4 parport psmouse acpi_cpufreq processor i2c_core evdev microcode pcspkr button serio_raw ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom virtio_scsi ata_generic ata_piix virtio_pci floppy virtio_ring libata e1000 virtio scsi_mod [last unloaded: btrfs] [23919.679442] CPU: 10 PID: 30175 Comm: fsstress Not tainted 4.3.0-rc5-btrfs-next-17+ #1 [23919.682392] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [23919.689151] 0000000000000000 ffff8804020cbb50 ffffffff812566f4 ffff8804020cbb98 [23919.692604] ffff8804020cbb88 ffffffff8104d0a6 ffffffffa03eea69 ffff88041b678a48 [23919.694230] ffff88042ac38000 ffff88041b678930 00000000fffffffe ffff8804020cbbf0 [23919.696716] Call Trace: [23919.698669] [<ffffffff812566f4>] dump_stack+0x4e/0x79 [23919.700597] [<ffffffff8104d0a6>] warn_slowpath_common+0x9f/0xb8 [23919.701958] [<ffffffffa03eea69>] ? btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs] [23919.703612] [<ffffffff8104d107>] warn_slowpath_fmt+0x48/0x50 [23919.705047] [<ffffffffa03eea69>] btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs] [23919.706967] [<ffffffffa0402097>] __btrfs_end_transaction+0x84/0x2dd [btrfs] [23919.708611] [<ffffffffa0402300>] btrfs_end_transaction+0x10/0x12 [btrfs] [23919.710099] [<ffffffffa03ef0b8>] btrfs_alloc_data_chunk_ondemand+0x121/0x28b [btrfs] [23919.711970] [<ffffffffa0413025>] btrfs_fallocate+0x7d3/0xc6d [btrfs] [23919.713602] [<ffffffff8108b78f>] ? lock_acquire+0x10d/0x194 [23919.714756] [<ffffffff81086dbc>] ? percpu_down_read+0x51/0x78 [23919.716155] [<ffffffff8116ef1d>] ? __sb_start_write+0x5f/0xb0 [23919.718918] [<ffffffff8116ef1d>] ? __sb_start_write+0x5f/0xb0 [23919.724170] [<ffffffff8116b579>] vfs_fallocate+0x170/0x1ff [23919.725482] [<ffffffff8117c1d7>] ioctl_preallocate+0x89/0x9b [23919.726790] [<ffffffff8117c5ef>] do_vfs_ioctl+0x406/0x4e6 [23919.728428] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e [23919.729642] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71 [23919.730782] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79 [23919.731847] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f [23919.733330] ---[ end trace 166ef301a335832a ]--- This is due to a race between device replace and chunk allocation, which the following diagram illustrates: CPU 1 CPU 2 btrfs_dev_replace_finishing() at this point dev_replace->tgtdev->devid == BTRFS_DEV_REPLACE_DEVID (0ULL) ... btrfs_start_transaction() btrfs_commit_transaction() btrfs_fallocate() btrfs_alloc_data_chunk_ondemand() btrfs_join_transaction() --> starts a new transaction do_chunk_alloc() lock fs_info->chunk_mutex btrfs_alloc_chunk() --> creates extent map for the new chunk with em->bdev->map->stripes[i]->dev->devid == X (X > 0) --> extent map is added to fs_info->mapping_tree --> initial phase of bg A allocation completes unlock fs_info->chunk_mutex lock fs_info->chunk_mutex btrfs_dev_replace_update_device_in_mapping_tree() --> iterates fs_info->mapping_tree and replaces the device in every extent map's map->stripes[] with dev_replace->tgtdev, which still has an id of 0ULL (BTRFS_DEV_REPLACE_DEVID) btrfs_end_transaction() btrfs_create_pending_block_groups() --> starts final phase of bg A creation (update device, extent, and chunk trees, etc) btrfs_finish_chunk_alloc() btrfs_update_device() --> attempts to update a device item with ID == 0ULL (BTRFS_DEV_REPLACE_DEVID) which is the current ID of bg A's em->bdev->map->stripes[i]->dev->devid --> doesn't find such item returns -ENOENT --> the device id should have been X and not 0ULL got -ENOENT from btrfs_finish_chunk_alloc() and aborts current transaction finishes setting up the target device, namely it sets tgtdev->devid to the value of srcdev->devid, which is X (and X > 0) frees the srcdev unlock fs_info->chunk_mutex So fix this by taking the device list mutex when processing the chunk's extent map stripes to update the device items. This avoids getting the wrong device id and use-after-free problems if the task finishing a chunk allocation grabs the replaced device, which is freed while the dev replace task is holding the device list mutex. This happened while running fstest btrfs/071. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2015-11-20 03:42:47 -07:00
break;
}
if (ret) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Btrfs: fix race when finishing dev replace leading to transaction abort During the final phase of a device replace operation, I ran into a transaction abort that resulted in the following trace: [23919.655368] WARNING: CPU: 10 PID: 30175 at fs/btrfs/extent-tree.c:9843 btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs]() [23919.664742] BTRFS: Transaction aborted (error -2) [23919.665749] Modules linked in: btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc i2c_piix4 parport psmouse acpi_cpufreq processor i2c_core evdev microcode pcspkr button serio_raw ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom virtio_scsi ata_generic ata_piix virtio_pci floppy virtio_ring libata e1000 virtio scsi_mod [last unloaded: btrfs] [23919.679442] CPU: 10 PID: 30175 Comm: fsstress Not tainted 4.3.0-rc5-btrfs-next-17+ #1 [23919.682392] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [23919.689151] 0000000000000000 ffff8804020cbb50 ffffffff812566f4 ffff8804020cbb98 [23919.692604] ffff8804020cbb88 ffffffff8104d0a6 ffffffffa03eea69 ffff88041b678a48 [23919.694230] ffff88042ac38000 ffff88041b678930 00000000fffffffe ffff8804020cbbf0 [23919.696716] Call Trace: [23919.698669] [<ffffffff812566f4>] dump_stack+0x4e/0x79 [23919.700597] [<ffffffff8104d0a6>] warn_slowpath_common+0x9f/0xb8 [23919.701958] [<ffffffffa03eea69>] ? btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs] [23919.703612] [<ffffffff8104d107>] warn_slowpath_fmt+0x48/0x50 [23919.705047] [<ffffffffa03eea69>] btrfs_create_pending_block_groups+0x15e/0x1ab [btrfs] [23919.706967] [<ffffffffa0402097>] __btrfs_end_transaction+0x84/0x2dd [btrfs] [23919.708611] [<ffffffffa0402300>] btrfs_end_transaction+0x10/0x12 [btrfs] [23919.710099] [<ffffffffa03ef0b8>] btrfs_alloc_data_chunk_ondemand+0x121/0x28b [btrfs] [23919.711970] [<ffffffffa0413025>] btrfs_fallocate+0x7d3/0xc6d [btrfs] [23919.713602] [<ffffffff8108b78f>] ? lock_acquire+0x10d/0x194 [23919.714756] [<ffffffff81086dbc>] ? percpu_down_read+0x51/0x78 [23919.716155] [<ffffffff8116ef1d>] ? __sb_start_write+0x5f/0xb0 [23919.718918] [<ffffffff8116ef1d>] ? __sb_start_write+0x5f/0xb0 [23919.724170] [<ffffffff8116b579>] vfs_fallocate+0x170/0x1ff [23919.725482] [<ffffffff8117c1d7>] ioctl_preallocate+0x89/0x9b [23919.726790] [<ffffffff8117c5ef>] do_vfs_ioctl+0x406/0x4e6 [23919.728428] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e [23919.729642] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71 [23919.730782] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79 [23919.731847] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f [23919.733330] ---[ end trace 166ef301a335832a ]--- This is due to a race between device replace and chunk allocation, which the following diagram illustrates: CPU 1 CPU 2 btrfs_dev_replace_finishing() at this point dev_replace->tgtdev->devid == BTRFS_DEV_REPLACE_DEVID (0ULL) ... btrfs_start_transaction() btrfs_commit_transaction() btrfs_fallocate() btrfs_alloc_data_chunk_ondemand() btrfs_join_transaction() --> starts a new transaction do_chunk_alloc() lock fs_info->chunk_mutex btrfs_alloc_chunk() --> creates extent map for the new chunk with em->bdev->map->stripes[i]->dev->devid == X (X > 0) --> extent map is added to fs_info->mapping_tree --> initial phase of bg A allocation completes unlock fs_info->chunk_mutex lock fs_info->chunk_mutex btrfs_dev_replace_update_device_in_mapping_tree() --> iterates fs_info->mapping_tree and replaces the device in every extent map's map->stripes[] with dev_replace->tgtdev, which still has an id of 0ULL (BTRFS_DEV_REPLACE_DEVID) btrfs_end_transaction() btrfs_create_pending_block_groups() --> starts final phase of bg A creation (update device, extent, and chunk trees, etc) btrfs_finish_chunk_alloc() btrfs_update_device() --> attempts to update a device item with ID == 0ULL (BTRFS_DEV_REPLACE_DEVID) which is the current ID of bg A's em->bdev->map->stripes[i]->dev->devid --> doesn't find such item returns -ENOENT --> the device id should have been X and not 0ULL got -ENOENT from btrfs_finish_chunk_alloc() and aborts current transaction finishes setting up the target device, namely it sets tgtdev->devid to the value of srcdev->devid, which is X (and X > 0) frees the srcdev unlock fs_info->chunk_mutex So fix this by taking the device list mutex when processing the chunk's extent map stripes to update the device items. This avoids getting the wrong device id and use-after-free problems if the task finishing a chunk allocation grabs the replaced device, which is freed while the dev replace task is holding the device list mutex. This happened while running fstest btrfs/071. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2015-11-20 03:42:47 -07:00
goto out;
}
stripe = &chunk->stripe;
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_set_stack_chunk_length(chunk, chunk_size);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
btrfs_set_stack_chunk_type(chunk, map->type);
btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.type = BTRFS_CHUNK_ITEM_KEY;
key.offset = chunk_offset;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
/*
* TODO: Cleanup of inserted chunk root in case of
* failure.
*/
ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
}
Btrfs: add initial tracepoint support for btrfs Tracepoints can provide insight into why btrfs hits bugs and be greatly helpful for debugging, e.g dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0 dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0 btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0) btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0) btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8 flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0) flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0) flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0) Here is what I have added: 1) ordere_extent: btrfs_ordered_extent_add btrfs_ordered_extent_remove btrfs_ordered_extent_start btrfs_ordered_extent_put These provide critical information to understand how ordered_extents are updated. 2) extent_map: btrfs_get_extent extent_map is used in both read and write cases, and it is useful for tracking how btrfs specific IO is running. 3) writepage: __extent_writepage btrfs_writepage_end_io_hook Pages are cirtical resourses and produce a lot of corner cases during writeback, so it is valuable to know how page is written to disk. 4) inode: btrfs_inode_new btrfs_inode_request btrfs_inode_evict These can show where and when a inode is created, when a inode is evicted. 5) sync: btrfs_sync_file btrfs_sync_fs These show sync arguments. 6) transaction: btrfs_transaction_commit In transaction based filesystem, it will be useful to know the generation and who does commit. 7) back reference and cow: btrfs_delayed_tree_ref btrfs_delayed_data_ref btrfs_delayed_ref_head btrfs_cow_block Btrfs natively supports back references, these tracepoints are helpful on understanding btrfs's COW mechanism. 8) chunk: btrfs_chunk_alloc btrfs_chunk_free Chunk is a link between physical offset and logical offset, and stands for space infomation in btrfs, and these are helpful on tracing space things. 9) reserved_extent: btrfs_reserved_extent_alloc btrfs_reserved_extent_free These can show how btrfs uses its space. Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 05:18:59 -06:00
out:
kfree(chunk);
free_extent_map(em);
return ret;
}
/*
* Chunk allocation falls into two parts. The first part does work
* that makes the new allocated chunk usable, but does not do any operation
* that modifies the chunk tree. The second part does the work that
* requires modifying the chunk tree. This division is important for the
* bootstrap process of adding storage to a seed btrfs.
*/
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
{
u64 chunk_offset;
lockdep_assert_held(&trans->fs_info->chunk_mutex);
chunk_offset = find_next_chunk(trans->fs_info);
return __btrfs_alloc_chunk(trans, chunk_offset, type);
}
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
u64 chunk_offset;
u64 sys_chunk_offset;
u64 alloc_profile;
int ret;
chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
if (ret)
return ret;
sys_chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_system_alloc_profile(fs_info);
ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
return ret;
}
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
const int index = btrfs_bg_flags_to_raid_index(map->type);
return btrfs_raid_array[index].tolerated_failures;
}
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
int readonly = 0;
int miss_ndevs = 0;
int i;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
return 1;
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (test_bit(BTRFS_DEV_STATE_MISSING,
&map->stripes[i].dev->dev_state)) {
miss_ndevs++;
continue;
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
&map->stripes[i].dev->dev_state)) {
readonly = 1;
goto end;
}
}
/*
* If the number of missing devices is larger than max errors,
* we can not write the data into that chunk successfully, so
* set it readonly.
*/
if (miss_ndevs > btrfs_chunk_max_errors(map))
readonly = 1;
end:
free_extent_map(em);
return readonly;
}
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
{
struct extent_map *em;
while (1) {
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, 0, (u64)-1);
if (em)
remove_extent_mapping(tree, em);
write_unlock(&tree->lock);
if (!em)
break;
/* once for us */
free_extent_map(em);
/* once for the tree */
free_extent_map(em);
}
}
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct extent_map *em;
struct map_lookup *map;
int ret;
em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
/*
* We could return errors for these cases, but that could get
* ugly and we'd probably do the same thing which is just not do
* anything else and exit, so return 1 so the callers don't try
* to use other copies.
*/
return 1;
map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
/*
* There could be two corrupted data stripes, we need
* to loop retry in order to rebuild the correct data.
*
* Fail a stripe at a time on every retry except the
* stripe under reconstruction.
*/
ret = map->num_stripes;
else
ret = 1;
free_extent_map(em);
down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
fs_info->dev_replace.tgtdev)
ret++;
up_read(&fs_info->dev_replace.rwsem);
return ret;
}
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical)
{
struct extent_map *em;
struct map_lookup *map;
unsigned long len = fs_info->sectorsize;
em = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
free_extent_map(em);
}
return len;
}
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct extent_map *em;
struct map_lookup *map;
int ret = 0;
em = btrfs_get_chunk_map(fs_info, logical, len);
if(!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
}
return ret;
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first,
int dev_replace_is_ongoing)
{
int i;
int num_stripes;
int preferred_mirror;
int tolerance;
struct btrfs_device *srcdev;
ASSERT((map->type &
(BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = map->sub_stripes;
else
num_stripes = map->num_stripes;
preferred_mirror = first + current->pid % num_stripes;
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
srcdev = fs_info->dev_replace.srcdev;
else
srcdev = NULL;
/*
* try to avoid the drive that is the source drive for a
* dev-replace procedure, only choose it if no other non-missing
* mirror is available
*/
for (tolerance = 0; tolerance < 2; tolerance++) {
if (map->stripes[preferred_mirror].dev->bdev &&
(tolerance || map->stripes[preferred_mirror].dev != srcdev))
return preferred_mirror;
for (i = first; i < first + num_stripes; i++) {
if (map->stripes[i].dev->bdev &&
(tolerance || map->stripes[i].dev != srcdev))
return i;
}
}
/* we couldn't find one that doesn't fail. Just return something
* and the io error handling code will clean up eventually
*/
return preferred_mirror;
}
static inline int parity_smaller(u64 a, u64 b)
{
return a > b;
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
struct btrfs_bio_stripe s;
int i;
u64 l;
int again = 1;
while (again) {
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
if (parity_smaller(bbio->raid_map[i],
bbio->raid_map[i+1])) {
s = bbio->stripes[i];
l = bbio->raid_map[i];
bbio->stripes[i] = bbio->stripes[i+1];
bbio->raid_map[i] = bbio->raid_map[i+1];
bbio->stripes[i+1] = s;
bbio->raid_map[i+1] = l;
again = 1;
}
}
}
}
static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
struct btrfs_bio *bbio = kzalloc(
/* the size of the btrfs_bio */
sizeof(struct btrfs_bio) +
/* plus the variable array for the stripes */
sizeof(struct btrfs_bio_stripe) * (total_stripes) +
/* plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
/*
* plus the raid_map, which includes both the tgt dev
* and the stripes
*/
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
atomic_set(&bbio->error, 0);
refcount_set(&bbio->refs, 1);
return bbio;
}
void btrfs_get_bbio(struct btrfs_bio *bbio)
{
WARN_ON(!refcount_read(&bbio->refs));
refcount_inc(&bbio->refs);
}
void btrfs_put_bbio(struct btrfs_bio *bbio)
{
if (!bbio)
return;
if (refcount_dec_and_test(&bbio->refs))
kfree(bbio);
}
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
* Please note that, discard won't be sent to target device of device
* replace.
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
btrfs: Ensure we trim ranges across block group boundary commit 6b7faadd985c990324b5b5bd18cc4ba5c395eb65 upstream. [BUG] When deleting large files (which cross block group boundary) with discard mount option, we find some btrfs_discard_extent() calls only trimmed part of its space, not the whole range: btrfs_discard_extent: type=0x1 start=19626196992 len=2144530432 trimmed=1073741824 ratio=50% type: bbio->map_type, in above case, it's SINGLE DATA. start: Logical address of this trim len: Logical length of this trim trimmed: Physically trimmed bytes ratio: trimmed / len Thus leaving some unused space not discarded. [CAUSE] When discard mount option is specified, after a transaction is fully committed (super block written to disk), we begin to cleanup pinned extents in the following call chain: btrfs_commit_transaction() |- btrfs_finish_extent_commit() |- find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); |- btrfs_discard_extent() However, pinned extents are recorded in an extent_io_tree, which can merge adjacent extent states. When a large file gets deleted and it has adjacent file extents across block group boundary, we will get a large merged range like this: |<--- BG1 --->|<--- BG2 --->| |//////|<-- Range to discard --->|/////| To discard that range, we have the following calls: btrfs_discard_extent() |- btrfs_map_block() | Returned bbio will end at BG1's end. As btrfs_map_block() | never returns result across block group boundary. |- btrfs_issuse_discard() Issue discard for each stripe. So we will only discard the range in BG1, not the remaining part in BG2. Furthermore, this bug is not that reliably observed, for above case, if there is no other extent in BG2, BG2 will be empty and btrfs will trim all space of BG2, covering up the bug. [FIX] - Allow __btrfs_map_block_for_discard() to modify @length parameter btrfs_map_block() uses its @length paramter to notify the caller how many bytes are mapped in current call. With __btrfs_map_block_for_discard() also modifing the @length, btrfs_discard_extent() now understands when to do extra trim. - Call btrfs_map_block() in a loop until we hit the range end Since we now know how many bytes are mapped each time, we can iterate through each block group boundary and issue correct trim for each range. Reviewed-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Nikolay Borisov <nborisov@suse.com> Tested-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-10-23 07:57:27 -06:00
u64 logical, u64 *length_ret,
struct btrfs_bio **bbio_ret)
{
struct extent_map *em;
struct map_lookup *map;
struct btrfs_bio *bbio;
btrfs: Ensure we trim ranges across block group boundary commit 6b7faadd985c990324b5b5bd18cc4ba5c395eb65 upstream. [BUG] When deleting large files (which cross block group boundary) with discard mount option, we find some btrfs_discard_extent() calls only trimmed part of its space, not the whole range: btrfs_discard_extent: type=0x1 start=19626196992 len=2144530432 trimmed=1073741824 ratio=50% type: bbio->map_type, in above case, it's SINGLE DATA. start: Logical address of this trim len: Logical length of this trim trimmed: Physically trimmed bytes ratio: trimmed / len Thus leaving some unused space not discarded. [CAUSE] When discard mount option is specified, after a transaction is fully committed (super block written to disk), we begin to cleanup pinned extents in the following call chain: btrfs_commit_transaction() |- btrfs_finish_extent_commit() |- find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); |- btrfs_discard_extent() However, pinned extents are recorded in an extent_io_tree, which can merge adjacent extent states. When a large file gets deleted and it has adjacent file extents across block group boundary, we will get a large merged range like this: |<--- BG1 --->|<--- BG2 --->| |//////|<-- Range to discard --->|/////| To discard that range, we have the following calls: btrfs_discard_extent() |- btrfs_map_block() | Returned bbio will end at BG1's end. As btrfs_map_block() | never returns result across block group boundary. |- btrfs_issuse_discard() Issue discard for each stripe. So we will only discard the range in BG1, not the remaining part in BG2. Furthermore, this bug is not that reliably observed, for above case, if there is no other extent in BG2, BG2 will be empty and btrfs will trim all space of BG2, covering up the bug. [FIX] - Allow __btrfs_map_block_for_discard() to modify @length parameter btrfs_map_block() uses its @length paramter to notify the caller how many bytes are mapped in current call. With __btrfs_map_block_for_discard() also modifing the @length, btrfs_discard_extent() now understands when to do extra trim. - Call btrfs_map_block() in a loop until we hit the range end Since we now know how many bytes are mapped each time, we can iterate through each block group boundary and issue correct trim for each range. Reviewed-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Nikolay Borisov <nborisov@suse.com> Tested-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-10-23 07:57:27 -06:00
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
u64 stripe_nr_end;
u64 stripe_end_offset;
u64 stripe_cnt;
u64 stripe_len;
u64 stripe_offset;
u64 num_stripes;
u32 stripe_index;
u32 factor = 0;
u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
int ret = 0;
int i;
/* discard always return a bbio */
ASSERT(bbio_ret);
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
return PTR_ERR(em);
map = em->map_lookup;
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
offset = logical - em->start;
length = min_t(u64, em->start + em->len - logical, length);
btrfs: Ensure we trim ranges across block group boundary commit 6b7faadd985c990324b5b5bd18cc4ba5c395eb65 upstream. [BUG] When deleting large files (which cross block group boundary) with discard mount option, we find some btrfs_discard_extent() calls only trimmed part of its space, not the whole range: btrfs_discard_extent: type=0x1 start=19626196992 len=2144530432 trimmed=1073741824 ratio=50% type: bbio->map_type, in above case, it's SINGLE DATA. start: Logical address of this trim len: Logical length of this trim trimmed: Physically trimmed bytes ratio: trimmed / len Thus leaving some unused space not discarded. [CAUSE] When discard mount option is specified, after a transaction is fully committed (super block written to disk), we begin to cleanup pinned extents in the following call chain: btrfs_commit_transaction() |- btrfs_finish_extent_commit() |- find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); |- btrfs_discard_extent() However, pinned extents are recorded in an extent_io_tree, which can merge adjacent extent states. When a large file gets deleted and it has adjacent file extents across block group boundary, we will get a large merged range like this: |<--- BG1 --->|<--- BG2 --->| |//////|<-- Range to discard --->|/////| To discard that range, we have the following calls: btrfs_discard_extent() |- btrfs_map_block() | Returned bbio will end at BG1's end. As btrfs_map_block() | never returns result across block group boundary. |- btrfs_issuse_discard() Issue discard for each stripe. So we will only discard the range in BG1, not the remaining part in BG2. Furthermore, this bug is not that reliably observed, for above case, if there is no other extent in BG2, BG2 will be empty and btrfs will trim all space of BG2, covering up the bug. [FIX] - Allow __btrfs_map_block_for_discard() to modify @length parameter btrfs_map_block() uses its @length paramter to notify the caller how many bytes are mapped in current call. With __btrfs_map_block_for_discard() also modifing the @length, btrfs_discard_extent() now understands when to do extra trim. - Call btrfs_map_block() in a loop until we hit the range end Since we now know how many bytes are mapped each time, we can iterate through each block group boundary and issue correct trim for each range. Reviewed-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Nikolay Borisov <nborisov@suse.com> Tested-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-10-23 07:57:27 -06:00
*length_ret = length;
stripe_len = map->stripe_len;
/*
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
stripe_nr = div64_u64(offset, stripe_len);
/* stripe_offset is the offset of this block in its stripe */
stripe_offset = offset - stripe_nr * stripe_len;
stripe_nr_end = round_up(offset + length, map->stripe_len);
stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
stripe_cnt = stripe_nr_end - stripe_nr;
stripe_end_offset = stripe_nr_end * map->stripe_len -
(offset + length);
/*
* after this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
num_stripes = 1;
stripe_index = 0;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
if (map->type & BTRFS_BLOCK_GROUP_RAID0)
sub_stripes = 1;
else
sub_stripes = map->sub_stripes;
factor = map->num_stripes / sub_stripes;
num_stripes = min_t(u64, map->num_stripes,
sub_stripes * stripe_cnt);
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= sub_stripes;
stripes_per_dev = div_u64_rem(stripe_cnt, factor,
&remaining_stripes);
div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
last_stripe *= sub_stripes;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
num_stripes = map->num_stripes;
} else {
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
}
bbio = alloc_btrfs_bio(num_stripes, 0);
if (!bbio) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
bbio->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
bbio->stripes[i].length = stripes_per_dev *
map->stripe_len;
if (i / sub_stripes < remaining_stripes)
bbio->stripes[i].length +=
map->stripe_len;
/*
* Special for the first stripe and
* the last stripe:
*
* |-------|...|-------|
* |----------|
* off end_off
*/
if (i < sub_stripes)
bbio->stripes[i].length -=
stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
bbio->stripes[i].length -=
stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
bbio->stripes[i].length = length;
}
stripe_index++;
if (stripe_index == map->num_stripes) {
stripe_index = 0;
stripe_nr++;
}
}
*bbio_ret = bbio;
bbio->map_type = map->type;
bbio->num_stripes = num_stripes;
out:
free_extent_map(em);
return ret;
}
/*
* In dev-replace case, for repair case (that's the only case where the mirror
* is selected explicitly when calling btrfs_map_block), blocks left of the
* left cursor can also be read from the target drive.
*
* For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
* array of stripes.
* For READ, it also needs to be supported using the same mirror number.
*
* If the requested block is not left of the left cursor, EIO is returned. This
* can happen because btrfs_num_copies() returns one more in the dev-replace
* case.
*/
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
u64 logical, u64 length,
u64 srcdev_devid, int *mirror_num,
u64 *physical)
{
struct btrfs_bio *bbio = NULL;
int num_stripes;
int index_srcdev = 0;
int found = 0;
u64 physical_of_found = 0;
int i;
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
logical, &length, &bbio, 0, 0);
if (ret) {
ASSERT(bbio == NULL);
return ret;
}
num_stripes = bbio->num_stripes;
if (*mirror_num > num_stripes) {
/*
* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
* that means that the requested area is not left of the left
* cursor
*/
btrfs_put_bbio(bbio);
return -EIO;
}
/*
* process the rest of the function using the mirror_num of the source
* drive. Therefore look it up first. At the end, patch the device
* pointer to the one of the target drive.
*/
for (i = 0; i < num_stripes; i++) {
if (bbio->stripes[i].dev->devid != srcdev_devid)
continue;
/*
* In case of DUP, in order to keep it simple, only add the
* mirror with the lowest physical address
*/
if (found &&
physical_of_found <= bbio->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
physical_of_found = bbio->stripes[i].physical;
}
btrfs_put_bbio(bbio);
ASSERT(found);
if (!found)
return -EIO;
*mirror_num = index_srcdev + 1;
*physical = physical_of_found;
return ret;
}
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
struct btrfs_bio **bbio_ret,
struct btrfs_dev_replace *dev_replace,
int *num_stripes_ret, int *max_errors_ret)
{
struct btrfs_bio *bbio = *bbio_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
int tgtdev_indexes = 0;
int num_stripes = *num_stripes_ret;
int max_errors = *max_errors_ret;
int i;
if (op == BTRFS_MAP_WRITE) {
int index_where_to_add;
/*
* duplicate the write operations while the dev replace
* procedure is running. Since the copying of the old disk to
* the new disk takes place at run time while the filesystem is
* mounted writable, the regular write operations to the old
* disk have to be duplicated to go to the new disk as well.
*
* Note that device->missing is handled by the caller, and that
* the write to the old disk is already set up in the stripes
* array.
*/
index_where_to_add = num_stripes;
for (i = 0; i < num_stripes; i++) {
if (bbio->stripes[i].dev->devid == srcdev_devid) {
/* write to new disk, too */
struct btrfs_bio_stripe *new =
bbio->stripes + index_where_to_add;
struct btrfs_bio_stripe *old =
bbio->stripes + i;
new->physical = old->physical;
new->length = old->length;
new->dev = dev_replace->tgtdev;
bbio->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
tgtdev_indexes++;
}
}
num_stripes = index_where_to_add;
} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
int index_srcdev = 0;
int found = 0;
u64 physical_of_found = 0;
/*
* During the dev-replace procedure, the target drive can also
* be used to read data in case it is needed to repair a corrupt
* block elsewhere. This is possible if the requested area is
* left of the left cursor. In this area, the target drive is a
* full copy of the source drive.
*/
for (i = 0; i < num_stripes; i++) {
if (bbio->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it simple,
* only add the mirror with the lowest physical
* address
*/
if (found &&
physical_of_found <=
bbio->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
physical_of_found = bbio->stripes[i].physical;
}
}
if (found) {
struct btrfs_bio_stripe *tgtdev_stripe =
bbio->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
tgtdev_stripe->length =
bbio->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
bbio->tgtdev_map[index_srcdev] = num_stripes;
tgtdev_indexes++;
num_stripes++;
}
}
*num_stripes_ret = num_stripes;
*max_errors_ret = max_errors;
bbio->num_tgtdevs = tgtdev_indexes;
*bbio_ret = bbio;
}
static bool need_full_stripe(enum btrfs_map_op op)
{
return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}
/*
* btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
* tuple. This information is used to calculate how big a
* particular bio can get before it straddles a stripe.
*
* @fs_info - the filesystem
* @logical - address that we want to figure out the geometry of
* @len - the length of IO we are going to perform, starting at @logical
* @op - type of operation - write or read
* @io_geom - pointer used to return values
*
* Returns < 0 in case a chunk for the given logical address cannot be found,
* usually shouldn't happen unless @logical is corrupted, 0 otherwise.
*/
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
{
struct extent_map *em;
struct map_lookup *map;
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
u64 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
int ret = 0;
ASSERT(op != BTRFS_MAP_DISCARD);
em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
return PTR_ERR(em);
map = em->map_lookup;
/* Offset of this logical address in the chunk */
offset = logical - em->start;
/* Len of a stripe in a chunk */
stripe_len = map->stripe_len;
/* Stripe wher this block falls in */
stripe_nr = div64_u64(offset, stripe_len);
/* Offset of stripe in the chunk */
stripe_offset = stripe_nr * stripe_len;
if (offset < stripe_offset) {
btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
stripe_offset, offset, em->start, logical, stripe_len);
ret = -EINVAL;
goto out;
}
/* stripe_offset is the offset of this block in its stripe */
stripe_offset = offset - stripe_offset;
data_stripes = nr_data_stripes(map);
if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
u64 max_len = stripe_len - stripe_offset;
/*
* In case of raid56, we need to know the stripe aligned start
*/
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len = stripe_len * data_stripes;
raid56_full_stripe_start = offset;
/*
* Allow a write of a full stripe, but make sure we
* don't allow straddling of stripes
*/
raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
full_stripe_len);
raid56_full_stripe_start *= full_stripe_len;
/*
* For writes to RAID[56], allow a full stripeset across
* all disks. For other RAID types and for RAID[56]
* reads, just allow a single stripe (on a single disk).
*/
if (op == BTRFS_MAP_WRITE) {
max_len = stripe_len * data_stripes -
(offset - raid56_full_stripe_start);
}
}
len = min_t(u64, em->len - offset, max_len);
} else {
len = em->len - offset;
}
io_geom->len = len;
io_geom->offset = offset;
io_geom->stripe_len = stripe_len;
io_geom->stripe_nr = stripe_nr;
io_geom->stripe_offset = stripe_offset;
io_geom->raid56_stripe_offset = raid56_full_stripe_start;
out:
/* once for us */
free_extent_map(em);
return ret;
}
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
u64 stripe_offset;
u64 stripe_nr;
u64 stripe_len;
u32 stripe_index;
int data_stripes;
int i;
int ret = 0;
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
struct btrfs_bio *bbio = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
int num_alloc_stripes;
int patch_the_first_stripe_for_dev_replace = 0;
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
struct btrfs_io_geometry geom;
ASSERT(bbio_ret);
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
btrfs: Ensure we trim ranges across block group boundary commit 6b7faadd985c990324b5b5bd18cc4ba5c395eb65 upstream. [BUG] When deleting large files (which cross block group boundary) with discard mount option, we find some btrfs_discard_extent() calls only trimmed part of its space, not the whole range: btrfs_discard_extent: type=0x1 start=19626196992 len=2144530432 trimmed=1073741824 ratio=50% type: bbio->map_type, in above case, it's SINGLE DATA. start: Logical address of this trim len: Logical length of this trim trimmed: Physically trimmed bytes ratio: trimmed / len Thus leaving some unused space not discarded. [CAUSE] When discard mount option is specified, after a transaction is fully committed (super block written to disk), we begin to cleanup pinned extents in the following call chain: btrfs_commit_transaction() |- btrfs_finish_extent_commit() |- find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); |- btrfs_discard_extent() However, pinned extents are recorded in an extent_io_tree, which can merge adjacent extent states. When a large file gets deleted and it has adjacent file extents across block group boundary, we will get a large merged range like this: |<--- BG1 --->|<--- BG2 --->| |//////|<-- Range to discard --->|/////| To discard that range, we have the following calls: btrfs_discard_extent() |- btrfs_map_block() | Returned bbio will end at BG1's end. As btrfs_map_block() | never returns result across block group boundary. |- btrfs_issuse_discard() Issue discard for each stripe. So we will only discard the range in BG1, not the remaining part in BG2. Furthermore, this bug is not that reliably observed, for above case, if there is no other extent in BG2, BG2 will be empty and btrfs will trim all space of BG2, covering up the bug. [FIX] - Allow __btrfs_map_block_for_discard() to modify @length parameter btrfs_map_block() uses its @length paramter to notify the caller how many bytes are mapped in current call. With __btrfs_map_block_for_discard() also modifing the @length, btrfs_discard_extent() now understands when to do extra trim. - Call btrfs_map_block() in a loop until we hit the range end Since we now know how many bytes are mapped each time, we can iterate through each block group boundary and issue correct trim for each range. Reviewed-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Nikolay Borisov <nborisov@suse.com> Tested-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-10-23 07:57:27 -06:00
length, bbio_ret);
ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
if (ret < 0)
return ret;
em = btrfs_get_chunk_map(fs_info, logical, *length);
ASSERT(!IS_ERR(em));
map = em->map_lookup;
*length = geom.len;
stripe_len = geom.stripe_len;
stripe_nr = geom.stripe_nr;
stripe_offset = geom.stripe_offset;
raid56_full_stripe_start = geom.raid56_stripe_offset;
data_stripes = nr_data_stripes(map);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
/*
* Hold the semaphore for read during the whole operation, write is
* requested at commit time but must wait.
*/
if (!dev_replace_is_ongoing)
up_read(&dev_replace->rwsem);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
!need_full_stripe(op) && dev_replace->tgtdev != NULL) {
ret = get_extra_mirror_from_replace(fs_info, logical, *length,
dev_replace->srcdev->devid,
&mirror_num,
&physical_to_patch_in_first_stripe);
if (ret)
goto out;
else
patch_the_first_stripe_for_dev_replace = 1;
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
num_stripes = 1;
stripe_index = 0;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
if (!need_full_stripe(op))
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
if (need_full_stripe(op))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
else {
stripe_index = find_live_mirror(fs_info, map, 0,
dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
if (need_full_stripe(op)) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
} else {
mirror_num = 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
u32 factor = map->num_stripes / map->sub_stripes;
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
if (need_full_stripe(op))
num_stripes = map->sub_stripes;
else if (mirror_num)
stripe_index += mirror_num - 1;
else {
int old_stripe_index = stripe_index;
stripe_index = find_live_mirror(fs_info, map,
stripe_index,
dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
stripe_len * data_stripes);
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
max_errors = nr_parity_stripes(map);
*length = map->stripe_len;
stripe_index = 0;
stripe_offset = 0;
} else {
/*
* Mirror #0 or #1 means the original data block.
* Mirror #2 is RAID5 parity block.
* Mirror #3 is RAID6 Q block.
*/
stripe_nr = div_u64_rem(stripe_nr,
data_stripes, &stripe_index);
if (mirror_num > 1)
stripe_index = data_stripes + mirror_num - 2;
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
if (!need_full_stripe(op) && mirror_num <= 1)
mirror_num = 1;
}
} else {
/*
* after this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
mirror_num = stripe_index + 1;
}
if (stripe_index >= map->num_stripes) {
btrfs_crit(fs_info,
"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
stripe_index, map->num_stripes);
ret = -EINVAL;
goto out;
}
num_alloc_stripes = num_stripes;
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
if (op == BTRFS_MAP_WRITE)
num_alloc_stripes <<= 1;
if (op == BTRFS_MAP_GET_READ_MIRRORS)
num_alloc_stripes++;
tgtdev_indexes = num_stripes;
}
bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
if (!bbio) {
ret = -ENOMEM;
goto out;
}
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
/* build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
(need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
unsigned rot;
bbio->raid_map = (u64 *)((void *)bbio->stripes +
sizeof(struct btrfs_bio_stripe) *
num_alloc_stripes +
sizeof(int) * tgtdev_indexes);
/* Work out the disk rotation on this stripe-set */
div_u64_rem(stripe_nr, num_stripes, &rot);
/* Fill in the logical address of each stripe */
tmp = stripe_nr * data_stripes;
for (i = 0; i < data_stripes; i++)
bbio->raid_map[(i+rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
bbio->raid_map[(i+rot+1) % num_stripes] =
RAID6_Q_STRIPE;
}
for (i = 0; i < num_stripes; i++) {
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset +
stripe_nr * map->stripe_len;
bbio->stripes[i].dev =
map->stripes[stripe_index].dev;
stripe_index++;
}
if (need_full_stripe(op))
max_errors = btrfs_chunk_max_errors(map);
if (bbio->raid_map)
sort_parity_stripes(bbio, num_stripes);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
&max_errors);
}
*bbio_ret = bbio;
bbio->map_type = map->type;
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
* mirror_num == num_stripes + 1 && dev_replace target drive is
* available as a mirror
*/
if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
WARN_ON(num_stripes > 1);
bbio->stripes[0].dev = dev_replace->tgtdev;
bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
bbio->mirror_num = map->num_stripes + 1;
}
out:
Btrfs: fix lockdep deadlock warning due to dev_replace Xfstests btrfs/011 complains about a deadlock warning, [ 1226.649039] ========================================================= [ 1226.649039] [ INFO: possible irq lock inversion dependency detected ] [ 1226.649039] 4.1.0+ #270 Not tainted [ 1226.649039] --------------------------------------------------------- [ 1226.652955] kswapd0/46 just changed the state of lock: [ 1226.652955] (&delayed_node->mutex){+.+.-.}, at: [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0 [ 1226.652955] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1226.652955] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1226.652955] other info that might help us debug this: [ 1226.652955] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1226.652955] Possible interrupt unsafe locking scenario: [ 1226.652955] CPU0 CPU1 [ 1226.652955] ---- ---- [ 1226.652955] lock(&fs_info->dev_replace.lock); [ 1226.652955] local_irq_disable(); [ 1226.652955] lock(&delayed_node->mutex); [ 1226.652955] lock(&found->groups_sem); [ 1226.652955] <Interrupt> [ 1226.652955] lock(&delayed_node->mutex); [ 1226.652955] *** DEADLOCK *** Commit 084b6e7c7607 ("btrfs: Fix a lockdep warning when running xfstest.") tried to fix a similar one that has the exactly same warning, but with that, we still run to this. The above lock chain comes from btrfs_commit_transaction ->btrfs_run_delayed_items ... ->__btrfs_update_delayed_inode ... ->__btrfs_cow_block ... ->find_free_extent ->cache_block_group ->load_free_space_cache ->btrfs_readpages ->submit_one_bio ... ->__btrfs_map_block ->btrfs_dev_replace_lock However, with high memory pressure, tasks which hold dev_replace.lock can be interrupted by kswapd and then kswapd is intended to release memory occupied by superblock, inodes and dentries, where we may call evict_inode, and it comes to [ 1226.652955] [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0 [ 1226.652955] [<ffffffff81459e74>] btrfs_remove_delayed_node+0x24/0x30 [ 1226.652955] [<ffffffff8140c5fe>] btrfs_evict_inode+0x34e/0x700 delayed_node->mutex may be acquired in __btrfs_release_delayed_node(), and it leads to a ABBA deadlock. To fix this, we can use "blocking rwlock" used in the case of extent_buffer, but things are simpler here since we only needs read's spinlock to blocking lock. With this, btrfs/011 no more produces warnings in dmesg. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: David Sterba <dsterba@suse.com>
2015-07-17 02:49:19 -06:00
if (dev_replace_is_ongoing) {
lockdep_assert_held(&dev_replace->rwsem);
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
Btrfs: fix lockdep deadlock warning due to dev_replace Xfstests btrfs/011 complains about a deadlock warning, [ 1226.649039] ========================================================= [ 1226.649039] [ INFO: possible irq lock inversion dependency detected ] [ 1226.649039] 4.1.0+ #270 Not tainted [ 1226.649039] --------------------------------------------------------- [ 1226.652955] kswapd0/46 just changed the state of lock: [ 1226.652955] (&delayed_node->mutex){+.+.-.}, at: [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0 [ 1226.652955] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1226.652955] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1226.652955] other info that might help us debug this: [ 1226.652955] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1226.652955] Possible interrupt unsafe locking scenario: [ 1226.652955] CPU0 CPU1 [ 1226.652955] ---- ---- [ 1226.652955] lock(&fs_info->dev_replace.lock); [ 1226.652955] local_irq_disable(); [ 1226.652955] lock(&delayed_node->mutex); [ 1226.652955] lock(&found->groups_sem); [ 1226.652955] <Interrupt> [ 1226.652955] lock(&delayed_node->mutex); [ 1226.652955] *** DEADLOCK *** Commit 084b6e7c7607 ("btrfs: Fix a lockdep warning when running xfstest.") tried to fix a similar one that has the exactly same warning, but with that, we still run to this. The above lock chain comes from btrfs_commit_transaction ->btrfs_run_delayed_items ... ->__btrfs_update_delayed_inode ... ->__btrfs_cow_block ... ->find_free_extent ->cache_block_group ->load_free_space_cache ->btrfs_readpages ->submit_one_bio ... ->__btrfs_map_block ->btrfs_dev_replace_lock However, with high memory pressure, tasks which hold dev_replace.lock can be interrupted by kswapd and then kswapd is intended to release memory occupied by superblock, inodes and dentries, where we may call evict_inode, and it comes to [ 1226.652955] [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0 [ 1226.652955] [<ffffffff81459e74>] btrfs_remove_delayed_node+0x24/0x30 [ 1226.652955] [<ffffffff8140c5fe>] btrfs_evict_inode+0x34e/0x700 delayed_node->mutex may be acquired in __btrfs_release_delayed_node(), and it leads to a ABBA deadlock. To fix this, we can use "blocking rwlock" used in the case of extent_buffer, but things are simpler here since we only needs read's spinlock to blocking lock. With this, btrfs/011 no more produces warnings in dmesg. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: David Sterba <dsterba@suse.com>
2015-07-17 02:49:19 -06:00
}
free_extent_map(em);
return ret;
}
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num)
{
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret)
{
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
u64 *buf;
u64 bytenr;
u64 length;
u64 stripe_nr;
u64 rmap_len;
int i, j, nr = 0;
em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
if (IS_ERR(em))
return -EIO;
map = em->map_lookup;
length = em->len;
rmap_len = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
length = div_u64(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
length = div_u64(length, map->num_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
length = div_u64(length, nr_data_stripes(map));
rmap_len = map->stripe_len * nr_data_stripes(map);
}
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
BUG_ON(!buf); /* -ENOMEM */
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].physical > physical ||
map->stripes[i].physical + length <= physical)
continue;
stripe_nr = physical - map->stripes[i].physical;
stripe_nr = div64_u64(stripe_nr, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
stripe_nr = div_u64(stripe_nr, map->sub_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
} /* else if RAID[56], multiply by nr_data_stripes().
* Alternatively, just use rmap_len below instead of
* map->stripe_len */
bytenr = chunk_start + stripe_nr * rmap_len;
WARN_ON(nr >= map->num_stripes);
for (j = 0; j < nr; j++) {
if (buf[j] == bytenr)
break;
}
if (j == nr) {
WARN_ON(nr >= map->num_stripes);
buf[nr++] = bytenr;
}
}
*logical = buf;
*naddrs = nr;
*stripe_len = rmap_len;
free_extent_map(em);
return 0;
}
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
block: remove management of bi_remaining when restoring original bi_end_io Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") regressed all existing callers that followed this pattern: 1) saving a bio's original bi_end_io 2) wiring up an intermediate bi_end_io 3) restoring the original bi_end_io from intermediate bi_end_io 4) calling bio_endio() to execute the restored original bi_end_io The regression was due to BIO_CHAIN only ever getting set if bio_inc_remaining() is called. For the above pattern it isn't set until step 3 above (step 2 would've needed to establish BIO_CHAIN). As such the first bio_endio(), in step 2 above, never decremented __bi_remaining before calling the intermediate bi_end_io -- leaving __bi_remaining with the value 1 instead of 0. When bio_inc_remaining() occurred during step 3 it brought it to a value of 2. When the second bio_endio() was called, in step 4 above, it should've called the original bi_end_io but it didn't because there was an extra reference that wasn't dropped (due to atomic operations being optimized away since BIO_CHAIN wasn't set upfront). Fix this issue by removing the __bi_remaining management complexity for all callers that use the above pattern -- bio_chain() is the only interface that _needs_ to be concerned with __bi_remaining. For the above pattern callers just expect the bi_end_io they set to get called! Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls that aren't associated with the bio_chain() interface. Also, the bio_inc_remaining() interface has been moved local to bio.c. Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Signed-off-by: Jens Axboe <axboe@fb.com>
2015-05-22 07:14:03 -06:00
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
bio_endio(bio);
block: remove management of bi_remaining when restoring original bi_end_io Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") regressed all existing callers that followed this pattern: 1) saving a bio's original bi_end_io 2) wiring up an intermediate bi_end_io 3) restoring the original bi_end_io from intermediate bi_end_io 4) calling bio_endio() to execute the restored original bi_end_io The regression was due to BIO_CHAIN only ever getting set if bio_inc_remaining() is called. For the above pattern it isn't set until step 3 above (step 2 would've needed to establish BIO_CHAIN). As such the first bio_endio(), in step 2 above, never decremented __bi_remaining before calling the intermediate bi_end_io -- leaving __bi_remaining with the value 1 instead of 0. When bio_inc_remaining() occurred during step 3 it brought it to a value of 2. When the second bio_endio() was called, in step 4 above, it should've called the original bi_end_io but it didn't because there was an extra reference that wasn't dropped (due to atomic operations being optimized away since BIO_CHAIN wasn't set upfront). Fix this issue by removing the __bi_remaining management complexity for all callers that use the above pattern -- bio_chain() is the only interface that _needs_ to be concerned with __bi_remaining. For the above pattern callers just expect the bi_end_io they set to get called! Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls that aren't associated with the bio_chain() interface. Also, the bio_inc_remaining() interface has been moved local to bio.c. Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Signed-off-by: Jens Axboe <axboe@fb.com>
2015-05-22 07:14:03 -06:00
btrfs_put_bbio(bbio);
}
static void btrfs_end_bio(struct bio *bio)
{
struct btrfs_bio *bbio = bio->bi_private;
int is_orig_bio = 0;
if (bio->bi_status) {
atomic_inc(&bbio->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
if (dev->bdev) {
if (bio_op(bio) == REQ_OP_WRITE)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else if (!(bio->bi_opf & REQ_RAHEAD))
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
}
}
}
if (bio == bbio->orig_bio)
is_orig_bio = 1;
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 01:46:55 -07:00
btrfs_bio_counter_dec(bbio->fs_info);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
bio = bbio->orig_bio;
}
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
if (atomic_read(&bbio->error) > bbio->max_errors) {
bio->bi_status = BLK_STS_IOERR;
} else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
*/
bio->bi_status = BLK_STS_OK;
}
btrfs_end_bbio(bbio, bio);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
/*
* see run_scheduled_bios for a description of why bios are collected for
* async submit.
*
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled.
*/
static noinline void btrfs_schedule_bio(struct btrfs_device *device,
struct bio *bio)
{
struct btrfs_fs_info *fs_info = device->fs_info;
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
/* don't bother with additional async steps for reads, right now */
if (bio_op(bio) == REQ_OP_READ) {
btrfsic_submit_bio(bio);
return;
}
WARN_ON(bio->bi_next);
bio->bi_next = NULL;
spin_lock(&device->io_lock);
if (op_is_sync(bio->bi_opf))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
if (pending_bios->tail)
pending_bios->tail->bi_next = bio;
pending_bios->tail = bio;
if (!pending_bios->head)
pending_bios->head = bio;
if (device->running_pending)
should_queue = 0;
spin_unlock(&device->io_lock);
if (should_queue)
btrfs_queue_work(fs_info->submit_workers, &device->work);
}
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
u64 physical, int dev_nr, int async)
{
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
struct btrfs_fs_info *fs_info = bbio->fs_info;
bio->bi_private = bbio;
btrfs_io_bio(bio)->stripe_index = dev_nr;
bio->bi_end_io = btrfs_end_bio;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 16:44:27 -06:00
bio->bi_iter.bi_sector = physical >> 9;
btrfs_debug_in_rcu(fs_info,
"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
(u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 01:46:55 -07:00
btrfs_bio_counter_inc_noblocked(fs_info);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 01:46:55 -07:00
if (async)
btrfs_schedule_bio(dev, bio);
else
btrfsic_submit_bio(bio);
}
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
/* Should be the original bio. */
WARN_ON(bio != bbio->orig_bio);
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 16:44:27 -06:00
bio->bi_iter.bi_sector = logical >> 9;
if (atomic_read(&bbio->error) > bbio->max_errors)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_status = BLK_STS_OK;
btrfs_end_bbio(bbio, bio);
}
}
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num, int async_submit)
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 16:44:27 -06:00
u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
int dev_nr;
int total_devs;
struct btrfs_bio *bbio = NULL;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 16:44:27 -06:00
length = bio->bi_iter.bi_size;
map_length = length;
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
&map_length, &bbio, mirror_num, 1);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 01:46:55 -07:00
if (ret) {
btrfs_bio_counter_dec(fs_info);
return errno_to_blk_status(ret);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 01:46:55 -07:00
}
total_devs = bbio->num_stripes;
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
bbio->fs_info = fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (bio_op(bio) == REQ_OP_WRITE) {
ret = raid56_parity_write(fs_info, bio, bbio,
map_length);
} else {
ret = raid56_parity_recover(fs_info, bio, bbio,
map_length, mirror_num, 1);
}
btrfs_bio_counter_dec(fs_info);
return errno_to_blk_status(ret);
}
if (map_length < length) {
btrfs_crit(fs_info,
"mapping failed logical %llu bio len %llu len %llu",
logical, length, map_length);
BUG();
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
&dev->dev_state) ||
(bio_op(first_bio) == REQ_OP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
continue;
}
if (dev_nr < total_devs - 1)
bio = btrfs_bio_clone(first_bio);
else
bio = first_bio;
submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
dev_nr, async_submit);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
}
/*
* Find a device specified by @devid or @uuid in the list of @fs_devices, or
* return NULL.
*
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
*
* If @seed is true, traverse through the seed devices.
*/
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *uuid, u8 *fsid,
bool seed)
{
struct btrfs_device *device;
while (fs_devices) {
if (!fsid ||
!memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
list_for_each_entry(device, &fs_devices->devices,
dev_list) {
if (device->devid == devid &&
(!uuid || memcmp(device->uuid, uuid,
BTRFS_UUID_SIZE) == 0))
return device;
}
}
if (seed)
fs_devices = fs_devices->seed;
else
return NULL;
}
return NULL;
}
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
btrfs: fix lockdep splat in add_missing_dev commit fccc0007b8dc952c6bc0805cdf842eb8ea06a639 upstream. Nikolay reported a lockdep splat in generic/476 that I could reproduce with btrfs/187. ====================================================== WARNING: possible circular locking dependency detected 5.9.0-rc2+ #1 Tainted: G W ------------------------------------------------------ kswapd0/100 is trying to acquire lock: ffff9e8ef38b6268 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 but task is already holding lock: ffffffffa9d74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x65/0x80 slab_pre_alloc_hook.constprop.0+0x20/0x200 kmem_cache_alloc_trace+0x3a/0x1a0 btrfs_alloc_device+0x43/0x210 add_missing_dev+0x20/0x90 read_one_chunk+0x301/0x430 btrfs_read_sys_array+0x17b/0x1b0 open_ctree+0xa62/0x1896 btrfs_mount_root.cold+0x12/0xea legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x10d/0x379 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 path_mount+0x434/0xc00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 btrfs_chunk_alloc+0x125/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_lookup_inode+0x2a/0x8f __btrfs_update_delayed_inode+0x80/0x240 btrfs_commit_inode_delayed_inode+0x119/0x120 btrfs_evict_inode+0x357/0x500 evict+0xcf/0x1f0 vfs_rmdir.part.0+0x149/0x160 do_rmdir+0x136/0x1a0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: __lock_acquire+0x1184/0x1fa0 lock_acquire+0xa4/0x3d0 __mutex_lock+0x7e/0x7e0 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 kthread+0x138/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> &fs_info->chunk_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&fs_info->chunk_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/100: #0: ffffffffa9d74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffffa9d65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 #2: ffff9e8e9da260e0 (&type->s_umount_key#48){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 stack backtrace: CPU: 1 PID: 100 Comm: kswapd0 Tainted: G W 5.9.0-rc2+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x92/0xc8 check_noncircular+0x12d/0x150 __lock_acquire+0x1184/0x1fa0 lock_acquire+0xa4/0x3d0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 __mutex_lock+0x7e/0x7e0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? lock_acquire+0xa4/0x3d0 ? btrfs_evict_inode+0x11e/0x500 ? find_held_lock+0x2b/0x80 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 ? _raw_spin_unlock_irqrestore+0x46/0x60 ? add_wait_queue_exclusive+0x70/0x70 ? balance_pgdat+0x670/0x670 kthread+0x138/0x160 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 This is because we are holding the chunk_mutex when we call btrfs_alloc_device, which does a GFP_KERNEL allocation. We don't want to switch that to a GFP_NOFS lock because this is the only place where it matters. So instead use memalloc_nofs_save() around the allocation in order to avoid the lockdep splat. Reported-by: Nikolay Borisov <nborisov@suse.com> CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-08-31 08:52:42 -06:00
unsigned int nofs_flag;
btrfs: fix lockdep splat in add_missing_dev commit fccc0007b8dc952c6bc0805cdf842eb8ea06a639 upstream. Nikolay reported a lockdep splat in generic/476 that I could reproduce with btrfs/187. ====================================================== WARNING: possible circular locking dependency detected 5.9.0-rc2+ #1 Tainted: G W ------------------------------------------------------ kswapd0/100 is trying to acquire lock: ffff9e8ef38b6268 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 but task is already holding lock: ffffffffa9d74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x65/0x80 slab_pre_alloc_hook.constprop.0+0x20/0x200 kmem_cache_alloc_trace+0x3a/0x1a0 btrfs_alloc_device+0x43/0x210 add_missing_dev+0x20/0x90 read_one_chunk+0x301/0x430 btrfs_read_sys_array+0x17b/0x1b0 open_ctree+0xa62/0x1896 btrfs_mount_root.cold+0x12/0xea legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x10d/0x379 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 path_mount+0x434/0xc00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 btrfs_chunk_alloc+0x125/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_lookup_inode+0x2a/0x8f __btrfs_update_delayed_inode+0x80/0x240 btrfs_commit_inode_delayed_inode+0x119/0x120 btrfs_evict_inode+0x357/0x500 evict+0xcf/0x1f0 vfs_rmdir.part.0+0x149/0x160 do_rmdir+0x136/0x1a0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: __lock_acquire+0x1184/0x1fa0 lock_acquire+0xa4/0x3d0 __mutex_lock+0x7e/0x7e0 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 kthread+0x138/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> &fs_info->chunk_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&fs_info->chunk_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/100: #0: ffffffffa9d74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffffa9d65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 #2: ffff9e8e9da260e0 (&type->s_umount_key#48){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 stack backtrace: CPU: 1 PID: 100 Comm: kswapd0 Tainted: G W 5.9.0-rc2+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x92/0xc8 check_noncircular+0x12d/0x150 __lock_acquire+0x1184/0x1fa0 lock_acquire+0xa4/0x3d0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 __mutex_lock+0x7e/0x7e0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? lock_acquire+0xa4/0x3d0 ? btrfs_evict_inode+0x11e/0x500 ? find_held_lock+0x2b/0x80 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 ? _raw_spin_unlock_irqrestore+0x46/0x60 ? add_wait_queue_exclusive+0x70/0x70 ? balance_pgdat+0x670/0x670 kthread+0x138/0x160 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 This is because we are holding the chunk_mutex when we call btrfs_alloc_device, which does a GFP_KERNEL allocation. We don't want to switch that to a GFP_NOFS lock because this is the only place where it matters. So instead use memalloc_nofs_save() around the allocation in order to avoid the lockdep splat. Reported-by: Nikolay Borisov <nborisov@suse.com> CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-08-31 08:52:42 -06:00
/*
* We call this under the chunk_mutex, so we want to use NOFS for this
* allocation, however we don't want to change btrfs_alloc_device() to
* always do NOFS because we use it in a lot of other GFP_KERNEL safe
* places.
*/
nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
btrfs: fix lockdep splat in add_missing_dev commit fccc0007b8dc952c6bc0805cdf842eb8ea06a639 upstream. Nikolay reported a lockdep splat in generic/476 that I could reproduce with btrfs/187. ====================================================== WARNING: possible circular locking dependency detected 5.9.0-rc2+ #1 Tainted: G W ------------------------------------------------------ kswapd0/100 is trying to acquire lock: ffff9e8ef38b6268 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 but task is already holding lock: ffffffffa9d74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x65/0x80 slab_pre_alloc_hook.constprop.0+0x20/0x200 kmem_cache_alloc_trace+0x3a/0x1a0 btrfs_alloc_device+0x43/0x210 add_missing_dev+0x20/0x90 read_one_chunk+0x301/0x430 btrfs_read_sys_array+0x17b/0x1b0 open_ctree+0xa62/0x1896 btrfs_mount_root.cold+0x12/0xea legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x10d/0x379 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 path_mount+0x434/0xc00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 btrfs_chunk_alloc+0x125/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_lookup_inode+0x2a/0x8f __btrfs_update_delayed_inode+0x80/0x240 btrfs_commit_inode_delayed_inode+0x119/0x120 btrfs_evict_inode+0x357/0x500 evict+0xcf/0x1f0 vfs_rmdir.part.0+0x149/0x160 do_rmdir+0x136/0x1a0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: __lock_acquire+0x1184/0x1fa0 lock_acquire+0xa4/0x3d0 __mutex_lock+0x7e/0x7e0 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 kthread+0x138/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> &fs_info->chunk_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&fs_info->chunk_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/100: #0: ffffffffa9d74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffffa9d65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 #2: ffff9e8e9da260e0 (&type->s_umount_key#48){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 stack backtrace: CPU: 1 PID: 100 Comm: kswapd0 Tainted: G W 5.9.0-rc2+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x92/0xc8 check_noncircular+0x12d/0x150 __lock_acquire+0x1184/0x1fa0 lock_acquire+0xa4/0x3d0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 __mutex_lock+0x7e/0x7e0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? lock_acquire+0xa4/0x3d0 ? btrfs_evict_inode+0x11e/0x500 ? find_held_lock+0x2b/0x80 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 ? _raw_spin_unlock_irqrestore+0x46/0x60 ? add_wait_queue_exclusive+0x70/0x70 ? balance_pgdat+0x670/0x670 kthread+0x138/0x160 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 This is because we are holding the chunk_mutex when we call btrfs_alloc_device, which does a GFP_KERNEL allocation. We don't want to switch that to a GFP_NOFS lock because this is the only place where it matters. So instead use memalloc_nofs_save() around the allocation in order to avoid the lockdep splat. Reported-by: Nikolay Borisov <nborisov@suse.com> CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-08-31 08:52:42 -06:00
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices++;
return device;
}
/**
* btrfs_alloc_device - allocate struct btrfs_device
* @fs_info: used only for generating a new devid, can be NULL if
* devid is provided (i.e. @devid != NULL).
* @devid: a pointer to devid for this device. If NULL a new devid
* is generated.
* @uuid: a pointer to UUID for this device. If NULL a new UUID
* is generated.
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
* on error. Returned struct is not linked onto any lists and must be
* destroyed with btrfs_free_device.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid)
{
struct btrfs_device *dev;
u64 tmp;
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
dev = __alloc_device();
if (IS_ERR(dev))
return dev;
if (devid)
tmp = *devid;
else {
int ret;
ret = find_next_devid(fs_info, &tmp);
if (ret) {
btrfs_free_device(dev);
return ERR_PTR(ret);
}
}
dev->devid = tmp;
if (uuid)
memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
else
generate_random_uuid(dev->uuid);
btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
return dev;
}
static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
u64 devid, u8 *uuid, bool error)
{
if (error)
btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
devid, uuid);
else
btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
devid, uuid);
}
static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
int index = btrfs_bg_flags_to_raid_index(type);
int ncopies = btrfs_raid_array[index].ncopies;
int data_stripes;
switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case BTRFS_BLOCK_GROUP_RAID5:
data_stripes = num_stripes - 1;
break;
case BTRFS_BLOCK_GROUP_RAID6:
data_stripes = num_stripes - 2;
break;
default:
data_stripes = num_stripes / ncopies;
break;
}
return div_u64(chunk_len, data_stripes);
}
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
u64 logical;
u64 length;
u64 devid;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
int ret;
int i;
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
/*
* Only need to verify chunk item if we're reading from sys chunk array,
* as chunk item in tree block is already verified by tree-checker.
*/
if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
ret = btrfs_check_chunk_valid(leaf, chunk, logical);
if (ret)
return ret;
}
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, logical, 1);
read_unlock(&map_tree->lock);
/* already mapped? */
if (em && em->start <= logical && em->start + em->len > logical) {
free_extent_map(em);
return 0;
} else if (em) {
free_extent_map(em);
}
em = alloc_extent_map();
if (!em)
return -ENOMEM;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
free_extent_map(em);
return -ENOMEM;
}
Btrfs: fix NULL pointer crash when running balance and scrub concurrently While running balance, scrub, fsstress concurrently we hit the following kernel crash: [56561.448845] BTRFS info (device sde): relocating block group 11005853696 flags 132 [56561.524077] BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 [56561.524237] IP: [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.524297] PGD 9be28067 PUD 7f3dd067 PMD 0 [56561.524325] Oops: 0000 [#1] SMP [....] [56561.527237] Call Trace: [56561.527309] [<ffffffffa038980e>] scrub_enumerate_chunks+0x24e/0x490 [btrfs] [56561.527392] [<ffffffff810abe00>] ? abort_exclusive_wait+0x50/0xb0 [56561.527476] [<ffffffffa038add4>] btrfs_scrub_dev+0x1a4/0x530 [btrfs] [56561.527561] [<ffffffffa0368107>] btrfs_ioctl+0x13f7/0x2a90 [btrfs] [56561.527639] [<ffffffff811c82f0>] do_vfs_ioctl+0x2e0/0x4c0 [56561.527712] [<ffffffff8109c384>] ? vtime_account_user+0x54/0x60 [56561.527788] [<ffffffff810f768c>] ? __audit_syscall_entry+0x9c/0xf0 [56561.527870] [<ffffffff811c8551>] SyS_ioctl+0x81/0xa0 [56561.527941] [<ffffffff815707f7>] tracesys+0xdd/0xe2 [...] [56561.528304] RIP [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.528395] RSP <ffff88004c0f5be8> [56561.528454] CR2: 0000000000000078 This is because in btrfs_relocate_chunk(), we will free @bdev directly while scrub may still hold extent mapping, and may access freed memory. Fix this problem by wrapping freeing @bdev work into free_extent_map() which is based on reference count. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-06-18 20:42:52 -06:00
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->map_lookup = map;
em->start = logical;
em->len = length;
em->orig_start = 0;
em->block_start = 0;
Btrfs: Add zlib compression support This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths. Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk. If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts later. * While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf. * Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages. * All of the in-memory extent representations (extent_map.c, ordered-data.c etc) are changed to record both an in-memory size and an on disk size, as well as a flag for compression. From a disk format point of view, the extent pointers in the file are changed to record the on disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption or the 'other' field are currently used. In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software only limit, the disk format supports u64 sized compressed extents. In order to limit the ram consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software only limit and will be subject to tuning later. Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. This way additional encodings can be layered on without having to figure out which encoding to checksum. Compression happens at delalloc time, which is basically singled threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the cpus on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time. Decompression is hooked into readpages and it does spread across CPUs nicely. Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
em->block_len = em->len;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = btrfs_chunk_type(leaf, chunk);
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
map->verified_stripes = 0;
em->orig_block_len = calc_stripe_length(map->type, em->len,
map->num_stripes);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
devid = btrfs_stripe_devid_nr(leaf, chunk, i);
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
devid, uuid, NULL, true);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
btrfs_report_missing_device(fs_info, devid, uuid, true);
return -ENOENT;
}
if (!map->stripes[i].dev) {
map->stripes[i].dev =
add_missing_dev(fs_info->fs_devices, devid,
uuid);
if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
btrfs_err(fs_info,
"failed to init missing dev %llu: %ld",
devid, PTR_ERR(map->stripes[i].dev));
return PTR_ERR(map->stripes[i].dev);
}
btrfs_report_missing_device(fs_info, devid, uuid, false);
}
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&(map->stripes[i].dev->dev_state));
}
write_lock(&map_tree->lock);
ret = add_extent_mapping(map_tree, em, 0);
write_unlock(&map_tree->lock);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
em->start, em->len, ret);
}
free_extent_map(em);
return ret;
}
static void fill_device_from_item(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item,
struct btrfs_device *device)
{
unsigned long ptr;
device->devid = btrfs_device_id(leaf, dev_item);
device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
device->total_bytes = device->disk_total_bytes;
device->commit_total_bytes = device->disk_total_bytes;
device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
device->commit_bytes_used = device->bytes_used;
device->type = btrfs_device_type(leaf, dev_item);
device->io_align = btrfs_device_io_align(leaf, dev_item);
device->io_width = btrfs_device_io_width(leaf, dev_item);
device->sector_size = btrfs_device_sector_size(leaf, dev_item);
WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
ptr = btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
int ret;
lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
fs_devices = fs_info->fs_devices->seed;
while (fs_devices) {
if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
return fs_devices;
fs_devices = fs_devices->seed;
}
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
if (!btrfs_test_opt(fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
btrfs: Introduce support for FSID change without metadata rewrite This field is going to be used when the user wants to change the UUID of the filesystem without having to rewrite all metadata blocks. This field adds another level of indirection such that when the FSID is changed what really happens is the current UUID (the one with which the fs was created) is copied to the 'metadata_uuid' field in the superblock as well as a new incompat flag is set METADATA_UUID. When the kernel detects this flag is set it knows that the superblock in fact has 2 UUIDs: 1. Is the UUID which is user-visible, currently known as FSID. 2. Metadata UUID - this is the UUID which is stamped into all on-disk datastructures belonging to this file system. When the new incompat flag is present device scanning checks whether both fsid/metadata_uuid of the scanned device match any of the registered filesystems. When the flag is not set then both UUIDs are equal and only the FSID is retained on disk, metadata_uuid is set only in-memory during mount. Additionally a new metadata_uuid field is also added to the fs_info struct. It's initialised either with the FSID in case METADATA_UUID incompat flag is not set or with the metdata_uuid of the superblock otherwise. This commit introduces the new fields as well as the new incompat flag and switches all users of the fsid to the new logic. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ minor updates in comments ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-30 08:43:23 -06:00
fs_devices = alloc_fs_devices(fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
fs_devices->seeding = 1;
fs_devices->opened = 1;
return fs_devices;
}
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
fs_devices = ERR_PTR(ret);
goto out;
}
if (!fs_devices->seeding) {
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
fs_devices = ERR_PTR(-EINVAL);
goto out;
}
fs_devices->seed = fs_info->fs_devices->seed;
fs_info->fs_devices->seed = fs_devices;
out:
return fs_devices;
}
static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 devid;
int ret;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
if (IS_ERR(fs_devices))
return PTR_ERR(fs_devices);
}
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
fs_uuid, true);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
dev_uuid, true);
return -ENOENT;
}
device = add_missing_dev(fs_devices, devid, dev_uuid);
if (IS_ERR(device)) {
btrfs_err(fs_info,
"failed to add missing dev %llu: %ld",
devid, PTR_ERR(device));
return PTR_ERR(device);
}
btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
} else {
if (!device->bdev) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info,
devid, dev_uuid, true);
return -ENOENT;
}
btrfs_report_missing_device(fs_info, devid,
dev_uuid, false);
}
if (!device->bdev &&
!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
/*
* this happens when a device that was properly setup
* in the device info lists suddenly goes bad.
* device->bdev is NULL, and so we have to set
* device->missing to one here
*/
device->fs_devices->missing_devices++;
set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
/* Move the device to its own fs_devices */
if (device->fs_devices != fs_devices) {
ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state));
list_move(&device->dev_list, &fs_devices->devices);
device->fs_devices->num_devices--;
fs_devices->num_devices++;
device->fs_devices->missing_devices--;
fs_devices->missing_devices++;
device->fs_devices = fs_devices;
}
}
if (device->fs_devices != fs_info->fs_devices) {
BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
if (device->generation !=
btrfs_device_generation(leaf, dev_item))
return -EINVAL;
}
fill_device_from_item(leaf, dev_item, device);
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
device->fs_devices->total_rw_bytes += device->total_bytes;
atomic64_add(device->total_bytes - device->bytes_used,
&fs_info->free_chunk_space);
}
ret = 0;
return ret;
}
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *array_ptr;
unsigned long sb_array_offset;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
u32 cur_offset;
u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
/*
* This will create extent buffer of nodesize, superblock size is
* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
* overallocate but we can keep it as-is, only the first page is used.
*/
sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
if (IS_ERR(sb))
return PTR_ERR(sb);
set_extent_buffer_uptodate(sb);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artificial and just used to read the system array.
* set_extent_buffer_uptodate() call does not properly mark all it's
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
* the page's extents up-to-date (the hole beyond sb),
* write_extent_buffer then triggers a WARN_ON.
*
* Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
* but sb spans only this function. Add an explicit SetPageUptodate call
* to silence the warning eg. on PowerPC 64.
*/
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 06:29:47 -06:00
if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
array_ptr = super_copy->sys_chunk_array;
sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
cur_offset = 0;
while (cur_offset < array_size) {
disk_key = (struct btrfs_disk_key *)array_ptr;
len = sizeof(*disk_key);
if (cur_offset + len > array_size)
goto out_short_read;
btrfs_disk_key_to_cpu(&key, disk_key);
array_ptr += len;
sb_array_offset += len;
cur_offset += len;
if (key.type == BTRFS_CHUNK_ITEM_KEY) {
chunk = (struct btrfs_chunk *)sb_array_offset;
/*
* At least one btrfs_chunk with one stripe must be
* present, exact stripe count check comes afterwards
*/
len = btrfs_chunk_item_size(1);
if (cur_offset + len > array_size)
goto out_short_read;
num_stripes = btrfs_chunk_num_stripes(sb, chunk);
if (!num_stripes) {
btrfs_err(fs_info,
"invalid number of stripes %u in sys_array at offset %u",
num_stripes, cur_offset);
ret = -EIO;
break;
}
type = btrfs_chunk_type(sb, chunk);
if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
btrfs_err(fs_info,
"invalid chunk type %llu in sys_array at offset %u",
type, cur_offset);
ret = -EIO;
break;
}
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
ret = read_one_chunk(&key, sb, chunk);
if (ret)
break;
} else {
btrfs_err(fs_info,
"unexpected item type %u in sys_array at offset %u",
(u32)key.type, cur_offset);
ret = -EIO;
break;
}
array_ptr += len;
sb_array_offset += len;
cur_offset += len;
}
clear_extent_buffer_uptodate(sb);
free_extent_buffer_stale(sb);
return ret;
out_short_read:
btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
len, cur_offset);
clear_extent_buffer_uptodate(sb);
free_extent_buffer_stale(sb);
return -EIO;
}
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
/*
* Check if all chunks in the fs are OK for read-write degraded mount
*
* If the @failing_dev is specified, it's accounted as missing.
*
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
* Return true if all chunks meet the minimal RW mount requirements.
* Return false if any chunk doesn't meet the minimal RW mount requirements.
*/
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
{
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
struct extent_map *em;
u64 next_start = 0;
bool ret = true;
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, 0, (u64)-1);
read_unlock(&map_tree->lock);
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
/* No chunk at all? Return false anyway */
if (!em) {
ret = false;
goto out;
}
while (em) {
struct map_lookup *map;
int missing = 0;
int max_tolerated;
int i;
map = em->map_lookup;
max_tolerated =
btrfs_get_num_tolerated_disk_barrier_failures(
map->type);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
if (!dev || !dev->bdev ||
test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
dev->last_flush_error)
missing++;
else if (failing_dev && failing_dev == dev)
missing++;
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
}
if (missing > max_tolerated) {
if (!failing_dev)
btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writable mount",
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
em->start, missing, max_tolerated);
free_extent_map(em);
ret = false;
goto out;
}
next_start = extent_map_end(em);
free_extent_map(em);
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, next_start,
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
(u64)(-1) - next_start);
read_unlock(&map_tree->lock);
btrfs: Introduce a function to check if all chunks a OK for degraded rw mount Introduce a new function, btrfs_check_rw_degradable(), to check if all chunks in btrfs is OK for degraded rw mount. It provides the new basis for accurate btrfs mount/remount and even runtime degraded mount check other than old one-size-fit-all method. Btrfs currently uses num_tolerated_disk_barrier_failures to do global check for tolerated missing device. Although the one-size-fit-all solution is quite safe, it's too strict if data and metadata has different duplication level. For example, if one use Single data and RAID1 metadata for 2 disks, it means any missing device will make the fs unable to be degraded mounted. But in fact, some times all single chunks may be in the existing device and in that case, we should allow it to be rw degraded mounted. Such case can be easily reproduced using the following script: # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc # wipefs -f /dev/sdc # mount /dev/sdb -o degraded,rw If using btrfs-debug-tree to check /dev/sdb, one should find that the data chunk is only in sdb, so in fact it should allow degraded mount. This patchset will introduce a new per-chunk degradable check for btrfs, allow above case to succeed, and it's quite small anyway. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ copied text from cover letter with more details about the problem being solved ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-08 18:34:36 -07:00
}
out:
return ret;
}
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
struct btrfs_key found_key;
int ret;
int slot;
u64 total_dev = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/*
* uuid_mutex is needed only if we are mounting a sprout FS
* otherwise we don't need it.
*/
mutex_lock(&uuid_mutex);
btrfs: fix mount failure caused by race with umount commit 48cfa61b58a1fee0bc49eef04f8ccf31493b7cdd upstream. It is possible to cause a btrfs mount to fail by racing it with a slow umount. The crux of the sequence is generic_shutdown_super not yet calling sop->put_super before btrfs_mount_root calls btrfs_open_devices. If that occurs, btrfs_open_devices will decide the opened counter is non-zero, increment it, and skip resetting fs_devices->total_rw_bytes to 0. From here, mount will call sget which will result in grab_super trying to take the super block umount semaphore. That semaphore will be held by the slow umount, so mount will block. Before up-ing the semaphore, umount will delete the super block, resulting in mount's sget reliably allocating a new one, which causes the mount path to dutifully fill it out, and increment total_rw_bytes a second time, which causes the mount to fail, as we see double the expected bytes. Here is the sequence laid out in greater detail: CPU0 CPU1 down_write sb->s_umount btrfs_kill_super kill_anon_super(sb) generic_shutdown_super(sb); shrink_dcache_for_umount(sb); sync_filesystem(sb); evict_inodes(sb); // SLOW btrfs_mount_root btrfs_scan_one_device fs_devices = device->fs_devices fs_info->fs_devices = fs_devices // fs_devices-opened makes this a no-op btrfs_open_devices(fs_devices, mode, fs_type) s = sget(fs_type, test, set, flags, fs_info); find sb in s_instances grab_super(sb); down_write(&s->s_umount); // blocks sop->put_super(sb) // sb->fs_devices->opened == 2; no-op spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); return 0; retry lookup don't find sb in s_instances (deleted by CPU0) s = alloc_super return s; btrfs_fill_super(s, fs_devices, data) open_ctree // fs_devices total_rw_bytes improperly set! btrfs_read_chunk_tree read_one_dev // increment total_rw_bytes again!! super_total_bytes < fs_devices->total_rw_bytes // ERROR!!! To fix this, we clear total_rw_bytes from within btrfs_read_chunk_tree before the calls to read_one_dev, while holding the sb umount semaphore and the uuid mutex. To reproduce, it is sufficient to dirty a decent number of inodes, then quickly umount and mount. for i in $(seq 0 500) do dd if=/dev/zero of="/mnt/foo/$i" bs=1M count=1 done umount /mnt/foo& mount /mnt/foo does the trick for me. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Boris Burkov <boris@bur.io> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-07-16 14:29:46 -06:00
/*
* It is possible for mount and umount to race in such a way that
* we execute this code path, but open_fs_devices failed to clear
* total_rw_bytes. We certainly want it cleared before reading the
* device items, so clear it here.
*/
fs_info->fs_devices->total_rw_bytes = 0;
/*
* Read all device items, and then all the chunk items. All
* device items are found before any chunk item (their object id
* is smaller than the lowest possible object id for a chunk
* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
*/
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto error;
while (1) {
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto error;
break;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
struct btrfs_dev_item);
ret = read_one_dev(leaf, dev_item);
if (ret)
goto error;
total_dev++;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
btrfs: move the chunk_mutex in btrfs_read_chunk_tree commit 01d01caf19ff7c537527d352d169c4368375c0a1 upstream. We are currently getting this lockdep splat in btrfs/161: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc5+ #20 Tainted: G E ------------------------------------------------------ mount/678048 is trying to acquire lock: ffff9b769f15b6e0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: clone_fs_devices+0x4d/0x170 [btrfs] but task is already holding lock: ffff9b76abdb08d0 (&fs_info->chunk_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x6a/0x800 [btrfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x8b/0x8f0 btrfs_init_new_device+0x2d2/0x1240 [btrfs] btrfs_ioctl+0x1de/0x2d20 [btrfs] ksys_ioctl+0x87/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __lock_acquire+0x1240/0x2460 lock_acquire+0xab/0x360 __mutex_lock+0x8b/0x8f0 clone_fs_devices+0x4d/0x170 [btrfs] btrfs_read_chunk_tree+0x330/0x800 [btrfs] open_ctree+0xb7c/0x18ce [btrfs] btrfs_mount_root.cold+0x13/0xfa [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x7de/0xb30 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_info->chunk_mutex); lock(&fs_devs->device_list_mutex); lock(&fs_info->chunk_mutex); lock(&fs_devs->device_list_mutex); *** DEADLOCK *** 3 locks held by mount/678048: #0: ffff9b75ff5fb0e0 (&type->s_umount_key#63/1){+.+.}-{3:3}, at: alloc_super+0xb5/0x380 #1: ffffffffc0c2fbc8 (uuid_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x54/0x800 [btrfs] #2: ffff9b76abdb08d0 (&fs_info->chunk_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x6a/0x800 [btrfs] stack backtrace: CPU: 2 PID: 678048 Comm: mount Tainted: G E 5.8.0-rc5+ #20 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./890FX Deluxe5, BIOS P1.40 05/03/2011 Call Trace: dump_stack+0x96/0xd0 check_noncircular+0x162/0x180 __lock_acquire+0x1240/0x2460 ? asm_sysvec_apic_timer_interrupt+0x12/0x20 lock_acquire+0xab/0x360 ? clone_fs_devices+0x4d/0x170 [btrfs] __mutex_lock+0x8b/0x8f0 ? clone_fs_devices+0x4d/0x170 [btrfs] ? rcu_read_lock_sched_held+0x52/0x60 ? cpumask_next+0x16/0x20 ? module_assert_mutex_or_preempt+0x14/0x40 ? __module_address+0x28/0xf0 ? clone_fs_devices+0x4d/0x170 [btrfs] ? static_obj+0x4f/0x60 ? lockdep_init_map_waits+0x43/0x200 ? clone_fs_devices+0x4d/0x170 [btrfs] clone_fs_devices+0x4d/0x170 [btrfs] btrfs_read_chunk_tree+0x330/0x800 [btrfs] open_ctree+0xb7c/0x18ce [btrfs] ? super_setup_bdi_name+0x79/0xd0 btrfs_mount_root.cold+0x13/0xfa [btrfs] ? vfs_parse_fs_string+0x84/0xb0 ? rcu_read_lock_sched_held+0x52/0x60 ? kfree+0x2b5/0x310 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] ? cred_has_capability+0x7c/0x120 ? rcu_read_lock_sched_held+0x52/0x60 ? legacy_get_tree+0x30/0x50 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x7de/0xb30 ? memdup_user+0x4e/0x90 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This is because btrfs_read_chunk_tree() can come upon DEV_EXTENT's and then read the device, which takes the device_list_mutex. The device_list_mutex needs to be taken before the chunk_mutex, so this is a problem. We only really need the chunk mutex around adding the chunk, so move the mutex around read_one_chunk. An argument could be made that we don't even need the chunk_mutex here as it's during mount, and we are protected by various other locks. However we already have special rules for ->device_list_mutex, and I'd rather not have another special case for ->chunk_mutex. CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-07-17 13:12:28 -06:00
mutex_lock(&fs_info->chunk_mutex);
ret = read_one_chunk(&found_key, leaf, chunk);
btrfs: move the chunk_mutex in btrfs_read_chunk_tree commit 01d01caf19ff7c537527d352d169c4368375c0a1 upstream. We are currently getting this lockdep splat in btrfs/161: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc5+ #20 Tainted: G E ------------------------------------------------------ mount/678048 is trying to acquire lock: ffff9b769f15b6e0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: clone_fs_devices+0x4d/0x170 [btrfs] but task is already holding lock: ffff9b76abdb08d0 (&fs_info->chunk_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x6a/0x800 [btrfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x8b/0x8f0 btrfs_init_new_device+0x2d2/0x1240 [btrfs] btrfs_ioctl+0x1de/0x2d20 [btrfs] ksys_ioctl+0x87/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __lock_acquire+0x1240/0x2460 lock_acquire+0xab/0x360 __mutex_lock+0x8b/0x8f0 clone_fs_devices+0x4d/0x170 [btrfs] btrfs_read_chunk_tree+0x330/0x800 [btrfs] open_ctree+0xb7c/0x18ce [btrfs] btrfs_mount_root.cold+0x13/0xfa [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x7de/0xb30 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_info->chunk_mutex); lock(&fs_devs->device_list_mutex); lock(&fs_info->chunk_mutex); lock(&fs_devs->device_list_mutex); *** DEADLOCK *** 3 locks held by mount/678048: #0: ffff9b75ff5fb0e0 (&type->s_umount_key#63/1){+.+.}-{3:3}, at: alloc_super+0xb5/0x380 #1: ffffffffc0c2fbc8 (uuid_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x54/0x800 [btrfs] #2: ffff9b76abdb08d0 (&fs_info->chunk_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x6a/0x800 [btrfs] stack backtrace: CPU: 2 PID: 678048 Comm: mount Tainted: G E 5.8.0-rc5+ #20 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./890FX Deluxe5, BIOS P1.40 05/03/2011 Call Trace: dump_stack+0x96/0xd0 check_noncircular+0x162/0x180 __lock_acquire+0x1240/0x2460 ? asm_sysvec_apic_timer_interrupt+0x12/0x20 lock_acquire+0xab/0x360 ? clone_fs_devices+0x4d/0x170 [btrfs] __mutex_lock+0x8b/0x8f0 ? clone_fs_devices+0x4d/0x170 [btrfs] ? rcu_read_lock_sched_held+0x52/0x60 ? cpumask_next+0x16/0x20 ? module_assert_mutex_or_preempt+0x14/0x40 ? __module_address+0x28/0xf0 ? clone_fs_devices+0x4d/0x170 [btrfs] ? static_obj+0x4f/0x60 ? lockdep_init_map_waits+0x43/0x200 ? clone_fs_devices+0x4d/0x170 [btrfs] clone_fs_devices+0x4d/0x170 [btrfs] btrfs_read_chunk_tree+0x330/0x800 [btrfs] open_ctree+0xb7c/0x18ce [btrfs] ? super_setup_bdi_name+0x79/0xd0 btrfs_mount_root.cold+0x13/0xfa [btrfs] ? vfs_parse_fs_string+0x84/0xb0 ? rcu_read_lock_sched_held+0x52/0x60 ? kfree+0x2b5/0x310 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] ? cred_has_capability+0x7c/0x120 ? rcu_read_lock_sched_held+0x52/0x60 ? legacy_get_tree+0x30/0x50 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x7de/0xb30 ? memdup_user+0x4e/0x90 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This is because btrfs_read_chunk_tree() can come upon DEV_EXTENT's and then read the device, which takes the device_list_mutex. The device_list_mutex needs to be taken before the chunk_mutex, so this is a problem. We only really need the chunk mutex around adding the chunk, so move the mutex around read_one_chunk. An argument could be made that we don't even need the chunk_mutex here as it's during mount, and we are protected by various other locks. However we already have special rules for ->device_list_mutex, and I'd rather not have another special case for ->chunk_mutex. CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-07-17 13:12:28 -06:00
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
goto error;
}
path->slots[0]++;
}
/*
* After loading chunk tree, we've got all device information,
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
btrfs_err(fs_info,
"super_num_devices %llu mismatch with num_devices %llu found here",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
ret = -EINVAL;
goto error;
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
btrfs_err(fs_info,
"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
btrfs_super_total_bytes(fs_info->super_copy),
fs_info->fs_devices->total_rw_bytes);
ret = -EINVAL;
goto error;
}
ret = 0;
error:
mutex_unlock(&uuid_mutex);
btrfs_free_path(path);
return ret;
}
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
while (fs_devices) {
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list)
device->fs_info = fs_info;
mutex_unlock(&fs_devices->device_list_mutex);
fs_devices = fs_devices->seed;
}
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
const struct btrfs_dev_stats_item *ptr,
int index)
{
u64 val;
read_extent_buffer(eb, &val,
offsetof(struct btrfs_dev_stats_item, values) +
((unsigned long)ptr) + (index * sizeof(u64)),
sizeof(val));
return val;
}
static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
struct btrfs_dev_stats_item *ptr,
int index, u64 val)
{
write_extent_buffer(eb, &val,
offsetof(struct btrfs_dev_stats_item, values) +
((unsigned long)ptr) + (index * sizeof(u64)),
sizeof(val));
}
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
struct btrfs_key key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct extent_buffer *eb;
int slot;
int ret = 0;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
int i;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
int item_size;
struct btrfs_dev_stats_item *ptr;
key.objectid = BTRFS_DEV_STATS_OBJECTID;
key.type = BTRFS_PERSISTENT_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_dev_stat_set(device, i, 0);
device->dev_stats_valid = 1;
btrfs_release_path(path);
continue;
}
slot = path->slots[0];
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, slot);
ptr = btrfs_item_ptr(eb, slot,
struct btrfs_dev_stats_item);
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
if (item_size >= (1 + i) * sizeof(__le64))
btrfs_dev_stat_set(device, i,
btrfs_dev_stats_value(eb, ptr, i));
else
btrfs_dev_stat_set(device, i, 0);
}
device->dev_stats_valid = 1;
btrfs_dev_stat_print_on_load(device);
btrfs_release_path(path);
}
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_stats_item *ptr;
int ret;
int i;
key.objectid = BTRFS_DEV_STATS_OBJECTID;
key.type = BTRFS_PERSISTENT_ITEM_KEY;
key.offset = device->devid;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn_in_rcu(fs_info,
"error %d while searching for dev_stats item for device %s",
ret, rcu_str_deref(device->name));
goto out;
}
if (ret == 0 &&
btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
btrfs_warn_in_rcu(fs_info,
"delete too small dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
}
ret = 1;
}
if (ret == 1) {
/* need to insert a new item */
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
btrfs_warn_in_rcu(fs_info,
"insert dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
}
}
eb = path->nodes[0];
ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
btrfs_mark_buffer_dirty(eb);
out:
btrfs_free_path(path);
return ret;
}
/*
* called from commit_transaction. Writes all changed device stats to disk.
*/
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
int stats_cnt;
int ret = 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
stats_cnt = atomic_read(&device->dev_stats_ccnt);
if (!device->dev_stats_valid || stats_cnt == 0)
continue;
/*
* There is a LOAD-LOAD control dependency between the value of
* dev_stats_ccnt and updating the on-disk values which requires
* reading the in-memory counters. Such control dependencies
* require explicit read memory barriers.
*
* This memory barriers pairs with smp_mb__before_atomic in
* btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
* barrier implied by atomic_xchg in
* btrfs_dev_stats_read_and_reset
*/
smp_rmb();
ret = update_dev_stat_item(trans, device);
if (!ret)
atomic_sub(stats_cnt, &device->dev_stats_ccnt);
}
mutex_unlock(&fs_devices->device_list_mutex);
return ret;
}
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
btrfs_dev_stat_inc(dev, index);
btrfs_dev_stat_print_on_error(dev);
}
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
btrfs_err_rl_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
int i;
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
if (btrfs_dev_stat_read(dev, i) != 0)
break;
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
btrfs_info_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats)
{
struct btrfs_device *dev;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
true);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
btrfs_warn(fs_info, "get dev_stats failed, device not found");
return -ENODEV;
} else if (!dev->dev_stats_valid) {
btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
return -ENODEV;
} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
if (stats->nr_items > i)
stats->values[i] =
btrfs_dev_stat_read_and_reset(dev, i);
else
btrfs_dev_stat_set(dev, i, 0);
}
btrfs_info(fs_info, "device stats zeroed by %s (%d)",
current->comm, task_pid_nr(current));
} else {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
if (stats->nr_items > i)
stats->values[i] = btrfs_dev_stat_read(dev, i);
}
if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
return 0;
}
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
{
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
int copy_num;
if (!bdev)
return;
for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
copy_num++) {
if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
continue;
disk_super = (struct btrfs_super_block *)bh->b_data;
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
brelse(bh);
}
/* Notify udev that device has changed */
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
update_dev_time(device_path);
}
/*
* Update the size and bytes used for each device where it changed. This is
* delayed since we would otherwise get errors while writing out the
* superblocks.
*
* Must be invoked during transaction commit.
*/
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
struct btrfs_device *curr, *next;
ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
if (list_empty(&trans->dev_update_list))
return;
/*
* We don't need the device_list_mutex here. This list is owned by the
* transaction and the transaction must complete before the device is
* released.
*/
mutex_lock(&trans->fs_info->chunk_mutex);
list_for_each_entry_safe(curr, next, &trans->dev_update_list,
post_commit_list) {
list_del_init(&curr->post_commit_list);
curr->commit_total_bytes = curr->disk_total_bytes;
curr->commit_bytes_used = curr->bytes_used;
}
mutex_unlock(&trans->fs_info->chunk_mutex);
}
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
while (fs_devices) {
fs_devices->fs_info = fs_info;
fs_devices = fs_devices->seed;
}
}
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
while (fs_devices) {
fs_devices->fs_info = NULL;
fs_devices = fs_devices->seed;
}
}
/*
* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
*/
int btrfs_bg_type_to_factor(u64 flags)
{
const int index = btrfs_bg_flags_to_raid_index(flags);
return btrfs_raid_array[index].ncopies;
}
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
int i;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
if (!em) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
ret = -EUCLEAN;
goto out;
}
map = em->map_lookup;
stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
physical_offset, devid, em->start, physical_len,
stripe_len);
ret = -EUCLEAN;
goto out;
}
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->devid == devid &&
map->stripes[i].physical == physical_offset) {
found = true;
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
"too many dev extents for chunk %llu found",
em->start);
ret = -EUCLEAN;
goto out;
}
map->verified_stripes++;
break;
}
}
if (!found) {
btrfs_err(fs_info,
"dev extent physical offset %llu devid %llu has no corresponding chunk",
physical_offset, devid);
ret = -EUCLEAN;
}
/* Make sure no dev extent is beyond device bondary */
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);
ret = -EUCLEAN;
goto out;
}
}
if (physical_offset + physical_len > dev->disk_total_bytes) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
devid, physical_offset, physical_len,
dev->disk_total_bytes);
ret = -EUCLEAN;
goto out;
}
out:
free_extent_map(em);
return ret;
}
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct rb_node *node;
int ret = 0;
read_lock(&em_tree->lock);
for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
em = rb_entry(node, struct extent_map, rb_node);
if (em->map_lookup->num_stripes !=
em->map_lookup->verified_stripes) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
em->start, em->map_lookup->verified_stripes,
em->map_lookup->num_stripes);
ret = -EUCLEAN;
goto out;
}
}
out:
read_unlock(&em_tree->lock);
return ret;
}
/*
* Ensure that all dev extents are mapped to correct chunk, otherwise
* later chunk allocation/free would cause unexpected behavior.
*
* NOTE: This will iterate through the whole device tree, which should be of
* the same size level as the chunk tree. This slightly increases mount time.
*/
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
struct btrfs_path *path;
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_key key;
btrfs: volumes: Make sure there is no overlap of dev extents at mount time Enhance btrfs_verify_dev_extents() to remember previous checked dev extents, so it can verify no dev extents can overlap. Analysis from Hans: "Imagine allocating a DATA|DUP chunk. In the chunk allocator, we first set... max_stripe_size = SZ_1G; max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE ... which is 10GiB. Then... /* we don't want a chunk larger than 10% of writeable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); Imagine we only have one 7880MiB block device in this filesystem. Now max_chunk_size is down to 788MiB. The next step in the code is to search for max_stripe_size * dev_stripes amount of free space on the device, which is in our example 1GiB * 2 = 2GiB. Imagine the device has exactly 1578MiB free in one contiguous piece. This amount of bytes will be put in devices_info[ndevs - 1].max_avail Next we recalculate the stripe_size (which is actually the device extent length), based on the actual maximum amount of available raw disk space: stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); stripe_size is now 789MiB Next we do... data_stripes = num_stripes / ncopies ...where data_stripes ends up as 1, because num_stripes is 2 (the amount of device extents we're going to have), and DUP has ncopies 2. Next there's a check... if (stripe_size * data_stripes > max_chunk_size) ...which matches because 789MiB * 1 > 788MiB. We go into the if code, and next is... stripe_size = div_u64(max_chunk_size, data_stripes); ...which resets stripe_size to max_chunk_size: 788MiB Next is a fun one... /* bump the answer up to a 16MB boundary */ stripe_size = round_up(stripe_size, SZ_16M); ...which changes stripe_size from 788MiB to 800MiB. We're not done changing stripe_size yet... /* But don't go higher than the limits we found while searching * for free extents */ stripe_size = min(devices_info[ndevs - 1].max_avail, stripe_size); This is bad. max_avail is twice the stripe_size (we need to fit 2 device extents on the same device for DUP). The result here is that 800MiB < 1578MiB, so it's unchanged. However, the resulting DUP chunk will need 1600MiB disk space, which isn't there, and the second dev_extent might extend into the next thing (next dev_extent? end of device?) for 22MiB. The last shown line of code relies on a situation where there's twice the value of stripe_size present as value for the variable stripe_size when it's DUP. This was actually the case before commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling", from which I quote: "[...] in the meantime there's a check to see if the stripe_size does not exceed max_chunk_size. Since during this check stripe_size is twice the amount as intended, the check will reduce the stripe_size to max_chunk_size if the actual correct to be used stripe_size is more than half the amount of max_chunk_size." In the previous version of the code, the 16MiB alignment (why is this done, by the way?) would result in a 50% chance that it would actually do an 8MiB alignment for the individual dev_extents, since it was operating on double the size. Does this matter? Does it matter that stripe_size can be set to anything which is not 16MiB aligned because of the amount of remaining available disk space which is just taken? What is the main purpose of this round_up? The most straightforward thing to do seems something like... stripe_size = min( div_u64(devices_info[ndevs - 1].max_avail, dev_stripes), stripe_size ) ..just putting half of the max_avail into stripe_size." Link: https://lore.kernel.org/linux-btrfs/b3461a38-e5f8-f41d-c67c-2efac8129054@mendix.com/ Reported-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Signed-off-by: Qu Wenruo <wqu@suse.com> [ add analysis from report ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-05 03:45:54 -06:00
u64 prev_devid = 0;
u64 prev_dev_ext_end = 0;
int ret = 0;
key.objectid = 1;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = READA_FORWARD;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_item(root, path);
if (ret < 0)
goto out;
/* No dev extents at all? Not good */
if (ret > 0) {
ret = -EUCLEAN;
goto out;
}
}
while (1) {
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_dev_extent *dext;
int slot = path->slots[0];
u64 chunk_offset;
u64 physical_offset;
u64 physical_len;
u64 devid;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.type != BTRFS_DEV_EXTENT_KEY)
break;
devid = key.objectid;
physical_offset = key.offset;
dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
physical_len = btrfs_dev_extent_length(leaf, dext);
btrfs: volumes: Make sure there is no overlap of dev extents at mount time Enhance btrfs_verify_dev_extents() to remember previous checked dev extents, so it can verify no dev extents can overlap. Analysis from Hans: "Imagine allocating a DATA|DUP chunk. In the chunk allocator, we first set... max_stripe_size = SZ_1G; max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE ... which is 10GiB. Then... /* we don't want a chunk larger than 10% of writeable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); Imagine we only have one 7880MiB block device in this filesystem. Now max_chunk_size is down to 788MiB. The next step in the code is to search for max_stripe_size * dev_stripes amount of free space on the device, which is in our example 1GiB * 2 = 2GiB. Imagine the device has exactly 1578MiB free in one contiguous piece. This amount of bytes will be put in devices_info[ndevs - 1].max_avail Next we recalculate the stripe_size (which is actually the device extent length), based on the actual maximum amount of available raw disk space: stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); stripe_size is now 789MiB Next we do... data_stripes = num_stripes / ncopies ...where data_stripes ends up as 1, because num_stripes is 2 (the amount of device extents we're going to have), and DUP has ncopies 2. Next there's a check... if (stripe_size * data_stripes > max_chunk_size) ...which matches because 789MiB * 1 > 788MiB. We go into the if code, and next is... stripe_size = div_u64(max_chunk_size, data_stripes); ...which resets stripe_size to max_chunk_size: 788MiB Next is a fun one... /* bump the answer up to a 16MB boundary */ stripe_size = round_up(stripe_size, SZ_16M); ...which changes stripe_size from 788MiB to 800MiB. We're not done changing stripe_size yet... /* But don't go higher than the limits we found while searching * for free extents */ stripe_size = min(devices_info[ndevs - 1].max_avail, stripe_size); This is bad. max_avail is twice the stripe_size (we need to fit 2 device extents on the same device for DUP). The result here is that 800MiB < 1578MiB, so it's unchanged. However, the resulting DUP chunk will need 1600MiB disk space, which isn't there, and the second dev_extent might extend into the next thing (next dev_extent? end of device?) for 22MiB. The last shown line of code relies on a situation where there's twice the value of stripe_size present as value for the variable stripe_size when it's DUP. This was actually the case before commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling", from which I quote: "[...] in the meantime there's a check to see if the stripe_size does not exceed max_chunk_size. Since during this check stripe_size is twice the amount as intended, the check will reduce the stripe_size to max_chunk_size if the actual correct to be used stripe_size is more than half the amount of max_chunk_size." In the previous version of the code, the 16MiB alignment (why is this done, by the way?) would result in a 50% chance that it would actually do an 8MiB alignment for the individual dev_extents, since it was operating on double the size. Does this matter? Does it matter that stripe_size can be set to anything which is not 16MiB aligned because of the amount of remaining available disk space which is just taken? What is the main purpose of this round_up? The most straightforward thing to do seems something like... stripe_size = min( div_u64(devices_info[ndevs - 1].max_avail, dev_stripes), stripe_size ) ..just putting half of the max_avail into stripe_size." Link: https://lore.kernel.org/linux-btrfs/b3461a38-e5f8-f41d-c67c-2efac8129054@mendix.com/ Reported-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Signed-off-by: Qu Wenruo <wqu@suse.com> [ add analysis from report ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-05 03:45:54 -06:00
/* Check if this dev extent overlaps with the previous one */
if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
devid, physical_offset, prev_dev_ext_end);
ret = -EUCLEAN;
goto out;
}
ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
physical_offset, physical_len);
if (ret < 0)
goto out;
btrfs: volumes: Make sure there is no overlap of dev extents at mount time Enhance btrfs_verify_dev_extents() to remember previous checked dev extents, so it can verify no dev extents can overlap. Analysis from Hans: "Imagine allocating a DATA|DUP chunk. In the chunk allocator, we first set... max_stripe_size = SZ_1G; max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE ... which is 10GiB. Then... /* we don't want a chunk larger than 10% of writeable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); Imagine we only have one 7880MiB block device in this filesystem. Now max_chunk_size is down to 788MiB. The next step in the code is to search for max_stripe_size * dev_stripes amount of free space on the device, which is in our example 1GiB * 2 = 2GiB. Imagine the device has exactly 1578MiB free in one contiguous piece. This amount of bytes will be put in devices_info[ndevs - 1].max_avail Next we recalculate the stripe_size (which is actually the device extent length), based on the actual maximum amount of available raw disk space: stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); stripe_size is now 789MiB Next we do... data_stripes = num_stripes / ncopies ...where data_stripes ends up as 1, because num_stripes is 2 (the amount of device extents we're going to have), and DUP has ncopies 2. Next there's a check... if (stripe_size * data_stripes > max_chunk_size) ...which matches because 789MiB * 1 > 788MiB. We go into the if code, and next is... stripe_size = div_u64(max_chunk_size, data_stripes); ...which resets stripe_size to max_chunk_size: 788MiB Next is a fun one... /* bump the answer up to a 16MB boundary */ stripe_size = round_up(stripe_size, SZ_16M); ...which changes stripe_size from 788MiB to 800MiB. We're not done changing stripe_size yet... /* But don't go higher than the limits we found while searching * for free extents */ stripe_size = min(devices_info[ndevs - 1].max_avail, stripe_size); This is bad. max_avail is twice the stripe_size (we need to fit 2 device extents on the same device for DUP). The result here is that 800MiB < 1578MiB, so it's unchanged. However, the resulting DUP chunk will need 1600MiB disk space, which isn't there, and the second dev_extent might extend into the next thing (next dev_extent? end of device?) for 22MiB. The last shown line of code relies on a situation where there's twice the value of stripe_size present as value for the variable stripe_size when it's DUP. This was actually the case before commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling", from which I quote: "[...] in the meantime there's a check to see if the stripe_size does not exceed max_chunk_size. Since during this check stripe_size is twice the amount as intended, the check will reduce the stripe_size to max_chunk_size if the actual correct to be used stripe_size is more than half the amount of max_chunk_size." In the previous version of the code, the 16MiB alignment (why is this done, by the way?) would result in a 50% chance that it would actually do an 8MiB alignment for the individual dev_extents, since it was operating on double the size. Does this matter? Does it matter that stripe_size can be set to anything which is not 16MiB aligned because of the amount of remaining available disk space which is just taken? What is the main purpose of this round_up? The most straightforward thing to do seems something like... stripe_size = min( div_u64(devices_info[ndevs - 1].max_avail, dev_stripes), stripe_size ) ..just putting half of the max_avail into stripe_size." Link: https://lore.kernel.org/linux-btrfs/b3461a38-e5f8-f41d-c67c-2efac8129054@mendix.com/ Reported-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Signed-off-by: Qu Wenruo <wqu@suse.com> [ add analysis from report ] Signed-off-by: David Sterba <dsterba@suse.com>
2018-10-05 03:45:54 -06:00
prev_devid = devid;
prev_dev_ext_end = physical_offset + physical_len;
ret = btrfs_next_item(root, path);
if (ret < 0)
goto out;
if (ret > 0) {
ret = 0;
break;
}
}
/* Ensure all chunks have corresponding dev extents */
ret = verify_chunk_dev_extent_mapping(fs_info);
out:
btrfs_free_path(path);
return ret;
}
/*
* Check whether the given block group or device is pinned by any inode being
* used as a swapfile.
*/
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
struct btrfs_swapfile_pin *sp;
struct rb_node *node;
spin_lock(&fs_info->swapfile_pins_lock);
node = fs_info->swapfile_pins.rb_node;
while (node) {
sp = rb_entry(node, struct btrfs_swapfile_pin, node);
if (ptr < sp->ptr)
node = node->rb_left;
else if (ptr > sp->ptr)
node = node->rb_right;
else
break;
}
spin_unlock(&fs_info->swapfile_pins_lock);
return node != NULL;
}