From 437490fed3b0c9ae21af8f70e0f338d34560842b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 28 Jul 2020 09:42:49 +0800 Subject: [PATCH 001/161] btrfs: tracepoints: output proper root owner for trace_find_free_extent() The current trace event always outputs results like this: find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA) find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA) find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA) find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA) find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA) find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA) It's saying we're allocating a data extent for the EXTENT tree, which is not even possible. It's because we always use the EXTENT tree as the owner for trace_find_free_extent() without using the @root from btrfs_reserve_extent(). This patch changes the parameter to use the proper @root for trace_find_free_extent(). Now it looks much better: find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA) find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=1(DATA) find_free_extent: root=5(FS_TREE) len=4096 empty_size=0 flags=1(DATA) find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA) find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) find_free_extent: root=7(CSUM_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) find_free_extent: root=1(ROOT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) Reported-by: Hans van Kranenburg CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 ++++--- include/trace/events/btrfs.h | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 780b9c9a98fe..dbff61d36cab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3918,11 +3918,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ -static noinline int find_free_extent(struct btrfs_fs_info *fs_info, +static noinline int find_free_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 empty_size, u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; @@ -3954,7 +3955,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(fs_info, num_bytes, empty_size, flags); + trace_find_free_extent(root, num_bytes, empty_size, flags); space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { @@ -4203,7 +4204,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); diff --git
a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 863335ecb7e8..b9241836d4f7 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1176,25 +1176,27 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TRACE_EVENT(find_free_extent, - TP_PROTO(const struct btrfs_fs_info *fs_info, u64 num_bytes, + TP_PROTO(const struct btrfs_root *root, u64 num_bytes, u64 empty_size, u64 data), - TP_ARGS(fs_info, num_bytes, empty_size, data), + TP_ARGS(root, num_bytes, empty_size, data), TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) __field( u64, data ) ), - TP_fast_assign_btrfs(fs_info, + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; __entry->num_bytes = num_bytes; __entry->empty_size = empty_size; __entry->data = data; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", - show_root_type(BTRFS_EXTENT_TREE_OBJECTID), + show_root_type(__entry->root_objectid), __entry->num_bytes, __entry->empty_size, __entry->data, __print_flags((unsigned long)__entry->data, "|", BTRFS_GROUP_FLAGS)) From 260db43cd2f556677f6ae818ba09f997eed81004 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Aug 2020 19:48:34 -0700 Subject: [PATCH 002/161] btrfs: delete duplicated words + other fixes in comments Delete repeated words in fs/btrfs/. {to, the, a, and old} and change "into 2 part" to "into 2 parts". Reviewed-by: Nikolay Borisov Signed-off-by: Randy Dunlap Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/ctree.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/qgroup.c | 2 +- fs/btrfs/tree-log.c | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ea8aaf36647e..fb507a389f40 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2785,7 +2785,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) * finished yet (no block group item in the extent tree * yet, etc). If this is the case, wait for all free * space endio workers to finish and retry. This is a - * a very rare case so no need for a more efficient and + * very rare case so no need for a more efficient and * complex approach. */ if (ret == -ENOENT) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cd392da69b81..e0fd8a34efcc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5115,7 +5115,7 @@ again: slot--; /* * check this node pointer against the min_trans parameters. - * If it is too old, old, skip to the next one. + * If it is too old, skip to the next one. 
*/ while (slot < nritems) { u64 gen; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9f72b092bc22..b1f6662f89c4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2928,7 +2928,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Verify the type first, if that or the the checksum value are + * Verify the type first, if that or the checksum value are * corrupted, we'll find out */ csum_type = btrfs_super_csum_type(disk_super); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a940edb1e64f..0320ddb0133e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3241,7 +3241,7 @@ static int __do_readpage(struct page *page, /* * If we have a file range that points to a compressed extent - * and it's followed by a consecutive file range that points to + * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index dc82fd0c80cb..8759f5a1d6a0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1353,7 +1353,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* * at this point the pages are under IO and we're happy, - * The caller is responsible for waiting on them and updating the + * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c0f350c3a0cf..580899bdb991 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * Update qgroup rfer/excl counters. * Rfer update is easy, codes can explain themselves. * - * Excl update is tricky, the update is split into 2 part. + * Excl update is tricky, the update is split into 2 parts. * Part 1: Possible exclusive <-> sharing detect: * | A | !A | * ------------------------------------- diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 39da9db35278..a6e69378b706 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4883,7 +4883,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of * the inode is not updated when we only log that it exists and - * and it has the full sync bit set (see btrfs_log_inode()). + * it has the full sync bit set (see btrfs_log_inode()). */ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -6380,7 +6380,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT * otherwise. * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, + * sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be * committed (without attempting to sync the log). 
*/ From 57297c1e8e1ce030050bf07ffa7182aaa588e2ac Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Aug 2020 12:43:18 +0300 Subject: [PATCH 003/161] btrfs: remove spurious BUG_ON in btrfs_get_extent That BUG_ON cannot ever trigger because as the comment there states - 'err' is always set. Simply remove it as it brings no value. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9570458aa847..df08ace5d914 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6748,7 +6748,6 @@ out: free_extent_map(em); return ERR_PTR(err); } - BUG_ON(!em); /* Error is always set */ return em; } From 8e5600818022bd329a5846dd6b5242965644f150 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 12 Aug 2020 16:18:51 +0300 Subject: [PATCH 004/161] btrfs: remove fsid argument from btrfs_sysfs_update_sprout_fsid It can be accessed from 'fs_devices' as it's identical to fs_info->fs_devices. Also add a comment about why we are calling the function. No semantic changes. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 6 +++--- fs/btrfs/sysfs.h | 3 +-- fs/btrfs/volumes.c | 7 +++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5be30066563c..d7c58c89fb9c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1328,8 +1328,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) &disk_to_dev(bdev->bd_disk)->kobj); } -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid) +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices) + { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; @@ -1337,7 +1337,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, * Sprouting changes fsid of the mounted filesystem, rename the fsid * directory */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid); + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_devices->fs_info, "sysfs: failed to create fsid for sprout"); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index cf839c46a131..c9efa15f96e0 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -20,8 +20,7 @@ int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1997a7d67f22..5d73c287314a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2645,8 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_sysfs; } - btrfs_sysfs_update_sprout_fsid(fs_devices, - fs_info->fs_devices->fsid); + /* + * fs_devices now represents the newly sprouted filesystem and + * its fsid has been changed by btrfs_prepare_sprout + */ + 
btrfs_sysfs_update_sprout_fsid(fs_devices); } ret = btrfs_commit_transaction(trans); From b49121393f583635b78abae4037fa1e772b2a042 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:12 -0400 Subject: [PATCH 005/161] btrfs: change nr to u64 in btrfs_start_delalloc_roots We have btrfs_wait_ordered_roots() which takes a u64 for nr, but btrfs_start_delalloc_roots() that takes an int for nr, which makes using them in conjunction, especially for something like (u64)-1, annoying and inconsistent. Fix btrfs_start_delalloc_roots() to take a u64 for nr and adjust start_delalloc_inodes() and its callers appropriately. This means we've adjusted start_delalloc_inodes() to take a pointer to nr since we want to preserve the ability for start_delalloc_inodes() to return an error, so simply make it do the nr adjusting as necessary. Part of adjusting the callers to this means changing btrfs_writeback_inodes_sb_nr() to take a u64 for items. This may be confusing because it seems unrelated, but the caller of btrfs_writeback_inodes_sb_nr() already passes in a u64; it's just the function variable that needs to be changed. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/inode.c | 26 ++++++++++---------------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/space-info.c | 2 +- 5 files changed, 14 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9a72896bed2e..e2ec4f067857 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2956,7 +2956,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e4a1c6afe35d..61d800a149b0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -661,7 +661,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index df08ace5d914..48b8b5f09f47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9387,7 +9387,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk.
*/ -static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) +static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9427,9 +9427,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); - ret++; - if (nr != -1 && ret >= nr) - goto out; + if (*nr != U64_MAX) { + (*nr)--; + if (*nr == 0) + goto out; + } cond_resched(); spin_lock(&root->delalloc_lock); } @@ -9454,18 +9456,15 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + u64 nr = U64_MAX; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1, true); - if (ret > 0) - ret = 0; - return ret; + return start_delalloc_inodes(root, &nr, true); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr) { struct btrfs_root *root; struct list_head splice; @@ -9488,15 +9487,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr, false); + ret = start_delalloc_inodes(root, &nr, false); btrfs_put_root(root); if (ret < 0) goto out; - - if (nr != -1) { - nr -= ret; - WARN_ON(nr < 0); - } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2d9109d9e98f..a5355a16eabb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4897,7 +4897,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 475968ccbd1d..266ff14e4c8c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -477,7 +477,7 @@ again: } static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) + unsigned long nr_pages, u64 nr_items) { struct super_block *sb = fs_info->sb; From 288be2d997360c71a1126261680d95ad9df5932a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:13 -0400 Subject: [PATCH 006/161] btrfs: remove orig from shrink_delalloc We don't use this anywhere inside of shrink_delalloc since 17024ad0a0fd ("Btrfs: fix early ENOSPC due to delalloc"), remove it. 
Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 266ff14e4c8c..9c79bdfd145b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -517,7 +517,7 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, * shrink metadata reservation for delalloc */ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) + bool wait_ordered) { struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; @@ -742,7 +742,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + shrink_delalloc(fs_info, num_bytes * 2, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: From d7f81fac97e6d06c4e9bf090085137aa2dddff99 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:14 -0400 Subject: [PATCH 007/161] btrfs: handle U64_MAX for shrink_delalloc Data allocations are going to want to pass in U64_MAX for flushing space, adjust shrink_delalloc to handle this properly. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9c79bdfd145b..2b48b37216eb 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -530,8 +530,19 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, int loops; /* Calc the number of the pages we need flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + if (to_reclaim == U64_MAX) { + items = U64_MAX; + } else { + /* + * to_reclaim is set to however much metadata we need to + * reclaim, but reclaiming that much data doesn't really track + * exactly, so increase the amount to reclaim by 2x in order to + * make sure we're flushing enough delalloc to hopefully reclaim + * some metadata reservations. + */ + items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + } trans = (struct btrfs_trans_handle *)current->journal_info; space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -742,7 +753,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, + shrink_delalloc(fs_info, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: From 920a9958c2553e4e23f9ff42dca2907ecf9a5fa0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:15 -0400 Subject: [PATCH 008/161] btrfs: make shrink_delalloc take space_info as an arg Currently shrink_delalloc just looks up the metadata space info, but this won't work if we're trying to reclaim space for data chunks. We get the right space_info we want passed into flush_space, so simply pass that along to shrink_delalloc. 
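For illustration, the call chain after this change looks roughly like the following. This is a simplified sketch based on the diff below, not the verbatim kernel code:

    static void flush_space(struct btrfs_fs_info *fs_info,
                            struct btrfs_space_info *space_info,
                            u64 num_bytes, enum btrfs_flush_state state)
    {
            switch (state) {
            case FLUSH_DELALLOC:
            case FLUSH_DELALLOC_WAIT:
                    /*
                     * Pass along the space_info the tickets live on
                     * instead of letting shrink_delalloc() re-derive the
                     * METADATA space_info, which would be wrong once we
                     * start reclaiming DATA space this way.
                     */
                    shrink_delalloc(fs_info, space_info, num_bytes,
                                    state == FLUSH_DELALLOC_WAIT);
                    break;
            /* ... other flush states elided ... */
            }
    }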
Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2b48b37216eb..3aaeff287b9c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -516,10 +516,10 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 to_reclaim, bool wait_ordered) { - struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 dio_bytes; @@ -545,7 +545,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, } trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -753,7 +752,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes, + shrink_delalloc(fs_info, space_info, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: From c6c453032ea3b9033f0a312175aab6519ff9bdf7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:16 -0400 Subject: [PATCH 009/161] btrfs: make ALLOC_CHUNK use the space info flags We have traditionally used flush_space() to flush metadata space, so we've been unconditionally using btrfs_metadata_alloc_profile() for our profile to allocate a chunk. However if we're going to use this for data we need to use btrfs_get_alloc_profile() on the space_info we pass in. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3aaeff287b9c..ed2ca20070f3 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -777,7 +777,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } ret = btrfs_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), + btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); From 3308234a7e9828f8cbec308010348ddf712fc15e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:17 -0400 Subject: [PATCH 010/161] btrfs: call btrfs_try_granting_tickets when freeing reserved bytes We were missing a call to btrfs_try_granting_tickets in btrfs_free_reserved_bytes, so add it to handle the case where we're able to satisfy an allocation because we've freed a pending reservation. 
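For context, a simplified sketch of what btrfs_try_granting_tickets() conceptually does when space comes back; the real implementation also handles overcommit and the priority ticket list, and first_ticket() below is a hypothetical helper used only for this illustration:

    /* Assumes space_info->lock is held, as at the call site added below. */
    struct reserve_ticket *ticket;

    while ((ticket = first_ticket(space_info))) {   /* hypothetical helper */
            u64 used = btrfs_space_info_used(space_info, true);

            if (used + ticket->bytes > space_info->total_bytes)
                    break;  /* served in order, later tickets keep waiting */

            btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                  ticket->bytes);
            ticket->bytes = 0;
            list_del_init(&ticket->list);
            wake_up(&ticket->wait);
    }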
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index fb507a389f40..bc95f5f963fb 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2994,6 +2994,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, if (delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); + + btrfs_try_granting_tickets(cache->fs_info, space_info); spin_unlock(&space_info->lock); } From 2732798c9bb647978043e4daf54a6f9f3025d4c5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:18 -0400 Subject: [PATCH 011/161] btrfs: call btrfs_try_granting_tickets when unpinning anything When unpinning we were only calling btrfs_try_granting_tickets() if global_rsv->space_info == space_info, which is problematic because we use ticketing for SYSTEM chunks, and want to use it for DATA as well. Fix this by moving this call outside of that if statement. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dbff61d36cab..79c4432e6a3f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2849,11 +2849,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len -= to_add; } spin_unlock(&global_rsv->lock); - /* Add to any tickets we may have */ - if (len) - btrfs_try_granting_tickets(fs_info, - space_info); } + /* Add to any tickets we may have */ + if (!readonly && return_free_space && len) + btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } From 99ffb43e5d4a0710fb094f3efbdf47f85f3b87c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:19 -0400 Subject: [PATCH 012/161] btrfs: call btrfs_try_granting_tickets when reserving space If we have compression on we could free up more space than we reserved, and thus be able to make a space reservation. Add the call for this scenario. 
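As a concrete example (the numbers are illustrative, not from the patch): a 128K buffered write reserves ram_bytes = 128K of data space up front, but if compression produces a 16K on-disk extent then num_bytes = 16K, and the 112K difference flows back into bytes_may_use where it may satisfy a waiting ticket:

    u64 ram_bytes = SZ_128K;    /* reserved for the uncompressed range */
    u64 num_bytes = SZ_16K;     /* actual compressed extent size */

    if (num_bytes < ram_bytes)  /* 112K effectively freed -> wake tickets */
            btrfs_try_granting_tickets(cache->fs_info, space_info);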
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bc95f5f963fb..c9d7465501a4 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2961,6 +2961,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake + * tickets if that happens + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); From 39753e4a3a43c7d14a17f626f32d4a53ee59d563 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:20 -0400 Subject: [PATCH 013/161] btrfs: use the btrfs_space_info_free_bytes_may_use helper for delalloc We are going to use the ticket infrastructure for data, so use the btrfs_space_info_free_bytes_may_use() helper in btrfs_free_reserved_data_space_noquota() so we get the btrfs_try_granting_tickets call when we free our reservation. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 0e354e9e57d0..0a41bdcc14d1 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -277,9 +277,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); - spin_unlock(&data_sinfo->lock); + btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); } /* From 38d715f494f2f1dddbf3d0c6e50aefff49519232 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:21 -0400 Subject: [PATCH 014/161] btrfs: use btrfs_start_delalloc_roots in shrink_delalloc The original iteration of flushing had us flushing delalloc and then checking to see if we could make our reservation, thus we were very careful about how many pages we would flush at once. But now that everything is async and we satisfy tickets as the space becomes available we don't have to keep track of any of this, simply try and flush the number of dirty inodes we may have in order to reclaim space to make our reservation. This cleans up our delalloc flushing significantly. The async_pages stuff is dropped because btrfs_start_delalloc_roots() handles the case that we generate async extents for us, so we no longer require this extra logic. 
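The flushing loop in shrink_delalloc() then reduces to roughly this shape (a sketch of the result; see the diff below for the exact code):

    loops = 0;
    while ((delalloc_bytes || dio_bytes) && loops < 3) {
            /*
             * Just ask for 'items' worth of delalloc inodes to be
             * flushed; async extent generation is handled inside
             * btrfs_start_delalloc_roots() now.
             */
            btrfs_start_delalloc_roots(fs_info, items);
            loops++;
            /* ... wait for ordered extents, recheck the tickets ... */
    }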
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 55 +------------------------------------------ 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ed2ca20070f3..4b8753b1a628 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -476,28 +476,6 @@ again: up_read(&info->groups_sem); } -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, u64 nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry the filesystem going from r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly(all dirty pages are written to - * the disk). - */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { @@ -523,10 +501,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 dio_bytes; - u64 async_pages; u64 items; long time_left; - unsigned long nr_pages; int loops; /* Calc the number of the pages we need flush for space reservation */ @@ -567,37 +543,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, loops = 0; while ((delalloc_bytes || dio_bytes) && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + btrfs_start_delalloc_roots(fs_info, items); - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); - - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e if there are more async pages than we - * require wait_event will wait until nr_pages are written. - */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; - - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { From 448b966b49be55af8f4aa79b24d7896809fd8e67 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:22 -0400 Subject: [PATCH 015/161] btrfs: check tickets after waiting on ordered extents Right now if the space is freed up after the ordered extents complete (which is likely since the reservations are held until they complete), we would do extra delalloc flushing before we'd notice that we didn't have any more tickets. Fix this by moving the tickets check after our wait_ordered_extents check. 
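After the reordering, each iteration waits on ordered extents first and only then looks at the ticket lists (a sketch of the reordered loop; see the diff below):

    while ((delalloc_bytes || dio_bytes) && loops < 3) {
            btrfs_start_delalloc_roots(fs_info, items);
            loops++;
            if (wait_ordered && !trans)
                    btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);

            /*
             * Completing ordered extents may have freed the space our
             * waiters needed, so only now check whether any tickets
             * remain before deciding to flush again.
             */
            spin_lock(&space_info->lock);
            if (list_empty(&space_info->tickets) &&
                list_empty(&space_info->priority_tickets)) {
                    spin_unlock(&space_info->lock);
                    break;
            }
            spin_unlock(&space_info->lock);

            /* ... resample delalloc_bytes and dio_bytes ... */
    }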
Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 4b8753b1a628..6a9bc5cc487b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -545,14 +545,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, while ((delalloc_bytes || dio_bytes) && loops < 3) { btrfs_start_delalloc_roots(fs_info, items); - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets) && - list_empty(&space_info->priority_tickets)) { - spin_unlock(&space_info->lock); - break; - } - spin_unlock(&space_info->lock); - loops++; if (wait_ordered && !trans) { btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); @@ -561,6 +553,15 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, if (time_left) break; } + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); From 058e6d1d267fe9c7e072533d2e1469d93aec3ec7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:23 -0400 Subject: [PATCH 016/161] btrfs: add flushing states for handling data reservations Currently the way we do data reservations is by seeing if we have enough space in our space_info. If we do not and we're a normal inode we'll 1) Attempt to force a chunk allocation until we can't anymore. 2) If that fails we'll flush delalloc, then commit the transaction, then run the delayed iputs. If we are a free space inode we're only allowed to force a chunk allocation. In order to use the normal flushing mechanism we need to encode this into a flush state array for normal inodes. Since both will start with allocating chunks until the space info is full there is no need to add this as a flush state, this will be handled specially. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/space-info.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e2ec4f067857..098f006fc5a3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2592,6 +2592,8 @@ enum btrfs_reserve_flush_enum { * * Can be interruped by fatal signal. 
*/ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, BTRFS_RESERVE_FLUSH_ALL, /* diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 6a9bc5cc487b..980b6f641e78 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1017,6 +1017,12 @@ static const enum btrfs_flush_state evict_flush_states[] = { COMMIT_TRANS, }; +static const enum btrfs_flush_state data_flush_states[] = { + FLUSH_DELALLOC_WAIT, + COMMIT_TRANS, + RUN_DELAYED_IPUTS, +}; + static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, From a1ed0a8216f7c7305ccfaa2c93b498f10b340ede Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:24 -0400 Subject: [PATCH 017/161] btrfs: add the data transaction commit logic into may_commit_transaction Data space flushing currently unconditionally commits the transaction twice in a row, and the last time it checks if there are enough pinned extents to satisfy its reservation before deciding to commit the transaction for the 3rd and final time. Encode this logic into may_commit_transaction(). In the next patch we will pass in U64_MAX for bytes_needed the first two times, and the final time we will pass in the actual bytes we need so the normal logic will apply. This patch exists solely to keep the logical changes I will make to the flushing state machine separate, to make it easier to bisect any performance-related regressions. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 980b6f641e78..abd2b8fb1833 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -579,21 +579,33 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, * will return -ENOSPC. */ static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + struct btrfs_space_info *space_info, + u64 bytes_needed) { struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *trans; - u64 bytes_needed; u64 reclaim_bytes = 0; u64 cur_free_bytes = 0; + bool do_commit = false; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; + /* + * If we are data and have passed in U64_MAX we just want to + * unconditionally commit the transaction to match the previous data + * flushing behavior. + */ + if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && + bytes_needed == U64_MAX) { + do_commit = true; + goto check_pinned; + } + spin_lock(&space_info->lock); cur_free_bytes = btrfs_space_info_used(space_info, true); if (cur_free_bytes < space_info->total_bytes) @@ -607,7 +619,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes_needed = (ticket) ?
ticket->bytes : 0; + if (ticket) + bytes_needed = ticket->bytes; if (bytes_needed > cur_free_bytes) bytes_needed -= cur_free_bytes; @@ -618,6 +631,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, if (!bytes_needed) return 0; +check_pinned: trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -627,15 +641,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, * we have block groups that are going to be freed, allowing us to * possibly do a chunk allocation the next loop through. */ - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + if (do_commit || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || __percpu_counter_compare(&space_info->total_bytes_pinned, bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) goto commit; /* - * See if there is some space in the delayed insertion reservation for - * this reservation. + * See if there is some space in the delayed insertion reserve for this + * reservation. If the space_info's don't match (like for DATA or + * SYSTEM) then just go enospc, reclaiming this space won't recover any + * space to satisfy those reservations. */ if (space_info != delayed_rsv->space_info) goto enospc; @@ -742,7 +759,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, btrfs_wait_on_delayed_iputs(fs_info); break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info); + ret = may_commit_transaction(fs_info, space_info, num_bytes); break; default: ret = -ENOSPC; From 8698fc4eb7884eeba29adc4193f9d05fa2bed16b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:25 -0400 Subject: [PATCH 018/161] btrfs: add btrfs_reserve_data_bytes and use it Create a new function btrfs_reserve_data_bytes() in order to handle data reservations. This uses the new flush types and flush states to handle making data reservations. This patch specifically does not change any functionality, and is purposefully not cleaned up in order to make bisection easier for the future patches. The new helper is identical to the old helper in how it handles data reservations. We first try to force a chunk allocation, and then we run through the flush states all at once and in the same order that they were done with the old helper. Subsequent patches will clean this up and change the behavior of the flushing, and it is important to keep those changes separate so we can easily bisect down to the patch that caused the regression, rather than the patch that made us start using the new infrastructure. 
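With the helper in place, the caller side collapses to the following (this is the shape of the delalloc-space.c hunk below):

    int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
    {
            struct btrfs_root *root = inode->root;
            struct btrfs_fs_info *fs_info = root->fs_info;
            enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;

            /* Make sure bytes are sectorsize aligned */
            bytes = ALIGN(bytes, fs_info->sectorsize);

            /* The free space inode may only force chunk allocation */
            if (btrfs_is_free_space_inode(inode))
                    flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;

            return btrfs_reserve_data_bytes(fs_info, bytes, flush);
    }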
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 119 ++------------------------------------ fs/btrfs/space-info.c | 92 +++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 2 + 3 files changed, 98 insertions(+), 115 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 0a41bdcc14d1..bacee09b7bfd 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; /* Make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, fs_info->sectorsize); - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } + if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; -again: - /* Make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * If we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. - */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* Commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. 
We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. - */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - spin_unlock(&data_sinfo->lock); - - return 0; + return btrfs_reserve_data_bytes(fs_info, bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index abd2b8fb1833..ad1dcb37593d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1329,3 +1329,95 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } return ret; } + +/** + * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation + * @fs_info - the filesystem + * @bytes - the number of bytes we need + * @flush - how we are allowed to flush + * + * This will reserve bytes from the data space info. If there is not enough + * space then we will attempt to flush space as specified by flush. + */ +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + const enum btrfs_flush_state *states = NULL; + u64 used; + int states_nr = 0; + int commit_cycles = 2; + int ret = -ENOSPC; + + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + + if (flush == BTRFS_RESERVE_FLUSH_DATA) { + states = data_flush_states; + states_nr = ARRAY_SIZE(data_flush_states); + } + + spin_lock(&data_sinfo->lock); +again: + used = btrfs_space_info_used(data_sinfo, true); + + if (used + bytes > data_sinfo->total_bytes) { + u64 prev_total_bytes = data_sinfo->total_bytes; + int flush_state = 0; + + spin_unlock(&data_sinfo->lock); + + /* + * Everybody can force chunk allocation, so try this first to + * see if we can just bail here and make our reservation. + */ + flush_space(fs_info, data_sinfo, bytes, ALLOC_CHUNK_FORCE); + spin_lock(&data_sinfo->lock); + if (prev_total_bytes < data_sinfo->total_bytes) + goto again; + spin_unlock(&data_sinfo->lock); + + /* + * Cycle through the rest of the flushing options for our flush + * type, then try again. + */ + while (flush_state < states_nr) { + u64 flush_bytes = U64_MAX; + + /* + * Previously we unconditionally committed the + * transaction twice before finally checking against + * pinned space before committing the final time. We + * also skipped flushing delalloc the final pass + * through. 
+ */ + if (!commit_cycles) { + if (states[flush_state] == FLUSH_DELALLOC_WAIT) { + flush_state++; + continue; + } + if (states[flush_state] == COMMIT_TRANS) + flush_bytes = bytes; + } + + flush_space(fs_info, data_sinfo, flush_bytes, + states[flush_state]); + flush_state++; + } + + if (!commit_cycles) + goto out; + + commit_cycles--; + spin_lock(&data_sinfo->lock); + goto again; + } + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); + ret = 0; + spin_unlock(&data_sinfo->lock); +out: + if (ret) + trace_btrfs_space_reservation(fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c3c64019950a..5646393b928c 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use( btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); #endif /* BTRFS_SPACE_INFO_H */ From 1004f6860f8c64f8d959051d6299981c09046cbe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:26 -0400 Subject: [PATCH 019/161] btrfs: use ticketing for data space reservations Now that we have all the infrastructure in place, use the ticketing infrastructure to make data allocations. This still maintains the exact same flushing behavior, but now we're using tickets to get our reservations satisfied. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 122 ++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ad1dcb37593d..9c118af363de 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1070,6 +1070,53 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } while (flush_state < states_nr); } +static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + const enum btrfs_flush_state *states, + int states_nr) +{ + int flush_state = 0; + int commit_cycles = 2; + + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } +again: + while (flush_state < states_nr) { + u64 flush_bytes = U64_MAX; + + if (!commit_cycles) { + if (states[flush_state] == FLUSH_DELALLOC_WAIT) { + flush_state++; + continue; + } + if (states[flush_state] == COMMIT_TRANS) + flush_bytes = ticket->bytes; + } + + flush_space(fs_info, space_info, flush_bytes, states[flush_state]); + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + flush_state++; + } + if (commit_cycles) { + commit_cycles--; + flush_state = 0; + goto again; + } +} + static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) @@ -1136,6 +1183,13 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; + case BTRFS_RESERVE_FLUSH_DATA: + priority_reclaim_data_space(fs_info, space_info, ticket, + data_flush_states, 
ARRAY_SIZE(data_flush_states)); + break; + case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: + priority_reclaim_data_space(fs_info, space_info, ticket, NULL, 0); + break; default: ASSERT(0); break; @@ -1343,78 +1397,30 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - const enum btrfs_flush_state *states = NULL; u64 used; - int states_nr = 0; - int commit_cycles = 2; int ret = -ENOSPC; ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - if (flush == BTRFS_RESERVE_FLUSH_DATA) { - states = data_flush_states; - states_nr = ARRAY_SIZE(data_flush_states); - } - spin_lock(&data_sinfo->lock); -again: used = btrfs_space_info_used(data_sinfo, true); if (used + bytes > data_sinfo->total_bytes) { - u64 prev_total_bytes = data_sinfo->total_bytes; - int flush_state = 0; + struct reserve_ticket ticket; + init_waitqueue_head(&ticket.wait); + ticket.bytes = bytes; + ticket.error = 0; + list_add_tail(&ticket.list, &data_sinfo->priority_tickets); + data_sinfo->reclaim_size += bytes; spin_unlock(&data_sinfo->lock); - /* - * Everybody can force chunk allocation, so try this first to - * see if we can just bail here and make our reservation. - */ - flush_space(fs_info, data_sinfo, bytes, ALLOC_CHUNK_FORCE); - spin_lock(&data_sinfo->lock); - if (prev_total_bytes < data_sinfo->total_bytes) - goto again; + ret = handle_reserve_ticket(fs_info, data_sinfo, &ticket, flush); + } else { + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); + ret = 0; spin_unlock(&data_sinfo->lock); - - /* - * Cycle through the rest of the flushing options for our flush - * type, then try again. - */ - while (flush_state < states_nr) { - u64 flush_bytes = U64_MAX; - - /* - * Previously we unconditionally committed the - * transaction twice before finally checking against - * pinned space before committing the final time. We - * also skipped flushing delalloc the final pass - * through. - */ - if (!commit_cycles) { - if (states[flush_state] == FLUSH_DELALLOC_WAIT) { - flush_state++; - continue; - } - if (states[flush_state] == COMMIT_TRANS) - flush_bytes = bytes; - } - - flush_space(fs_info, data_sinfo, flush_bytes, - states[flush_state]); - flush_state++; - } - - if (!commit_cycles) - goto out; - - commit_cycles--; - spin_lock(&data_sinfo->lock); - goto again; } - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - ret = 0; - spin_unlock(&data_sinfo->lock); -out: if (ret) trace_btrfs_space_reservation(fs_info, "space_info:enospc", From 0532a6f8b6ce64ced0ffcca0a2ab5d66a8eb9a6d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:27 -0400 Subject: [PATCH 020/161] btrfs: serialize data reservations if we are flushing Nikolay reported a problem where generic/371 would fail sometimes with a slow drive. The gist of the test is that we fallocate a file in parallel with a pwrite of a different file. These two files combined are smaller than the file system, but sometimes the pwrite would ENOSPC. A fair bit of investigation uncovered the fact that the fallocate workload was racing in and grabbing the free space that the pwrite workload was trying to free up so it could make its own reservation. After a few loops of this eventually the pwrite workload would error out with an ENOSPC. We've had the same problem with metadata as well, and we serialized all metadata allocations to satisfy this problem. 
This wasn't usually a problem with data because data reservations are more straightforward, but obviously could still happen. Fix this by not allowing reservations to occur if there are any pending tickets waiting to be satisfied on the space info. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9c118af363de..fd9eff51e5b4 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1399,13 +1399,16 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; u64 used; int ret = -ENOSPC; + bool pending_tickets; ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); spin_lock(&data_sinfo->lock); used = btrfs_space_info_used(data_sinfo, true); + pending_tickets = !list_empty(&data_sinfo->tickets) || + !list_empty(&data_sinfo->priority_tickets); - if (used + bytes > data_sinfo->total_bytes) { + if (pending_tickets || used + bytes > data_sinfo->total_bytes) { struct reserve_ticket ticket; init_waitqueue_head(&ticket.wait); From f3bda421c16fe2f67cb3ff81b65ce8de331dd8e2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:28 -0400 Subject: [PATCH 021/161] btrfs: use the same helper for data and metadata reservations Now that data reservations follow the same pattern as metadata reservations we can simply rename __reserve_metadata_bytes to __reserve_bytes and use that helper for data reservations. Things to keep in mind, btrfs_can_overcommit() returns 0 for data, because we can never overcommit. We also will never pass in FLUSH_ALL for data, so we'll simply be added to the priority list and go straight into handle_reserve_ticket. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 46 ++++++++++++------------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index fd9eff51e5b4..87f92fc67542 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1249,10 +1249,9 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. 
*/ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { struct reserve_ticket ticket; u64 used; @@ -1364,8 +1363,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush); + ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -1397,36 +1395,18 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = -ENOSPC; - bool pending_tickets; + int ret; + ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - pending_tickets = !list_empty(&data_sinfo->tickets) || - !list_empty(&data_sinfo->priority_tickets); - - if (pending_tickets || used + bytes > data_sinfo->total_bytes) { - struct reserve_ticket ticket; - - init_waitqueue_head(&ticket.wait); - ticket.bytes = bytes; - ticket.error = 0; - list_add_tail(&ticket.list, &data_sinfo->priority_tickets); - data_sinfo->reclaim_size += bytes; - spin_unlock(&data_sinfo->lock); - - ret = handle_reserve_ticket(fs_info, data_sinfo, &ticket, flush); - } else { - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - ret = 0; - spin_unlock(&data_sinfo->lock); - } - if (ret) - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", + ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", data_sinfo->flags, bytes, 1); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + } return ret; } From 028270013586681b02db49be20e05b56e662dc55 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:29 -0400 Subject: [PATCH 022/161] btrfs: drop the commit_cycles stuff for data reservations This was an old wart left over from how we previously did data reservations. Before we could have people race in and take a reservation while we were flushing space, so we needed to make sure we looped a few times before giving up. Now that we're using the ticketing infrastructure we don't have to worry about this and can drop the logic altogether. 
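As a rough illustration of why the retry loop becomes unnecessary, here is a minimal standalone userspace sketch (plain C, all names invented, not the kernel code): once reservations queue up as FIFO tickets, a request that cannot be satisfied immediately simply waits its turn and is handed reclaimed space in order, so nobody can race in ahead of it and it never has to loop and retry.

/*
 * Minimal userspace sketch (not kernel code) of ticketed reservations:
 * a request that cannot be satisfied immediately joins a FIFO and is
 * handed space in order as flushing reclaims it.
 */
#include <stdio.h>

struct ticket { unsigned long bytes; int satisfied; };

/* Flusher: hand reclaimed space to the oldest unsatisfied ticket first. */
static void satisfy_tickets(struct ticket *q, int nr, unsigned long reclaimed)
{
	for (int i = 0; i < nr && reclaimed; i++) {
		if (!q[i].satisfied && q[i].bytes <= reclaimed) {
			reclaimed -= q[i].bytes;
			q[i].satisfied = 1;
		}
	}
}

int main(void)
{
	struct ticket queue[3] = { { 8192, 0 }, { 4096, 0 }, { 16384, 0 } };

	/* Two rounds of flushing reclaim some space each time. */
	satisfy_tickets(queue, 3, 12288);
	satisfy_tickets(queue, 3, 16384);

	for (int i = 0; i < 3; i++)
		printf("ticket %d (%lu bytes): %s\n", i, queue[i].bytes,
		       queue[i].satisfied ? "satisfied" : "waiting");
	return 0;
}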
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 87f92fc67542..244cd2a61386 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1077,7 +1077,6 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, int states_nr) { int flush_state = 0; - int commit_cycles = 2; while (!space_info->full) { flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); @@ -1088,20 +1087,9 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, } spin_unlock(&space_info->lock); } -again: + while (flush_state < states_nr) { - u64 flush_bytes = U64_MAX; - - if (!commit_cycles) { - if (states[flush_state] == FLUSH_DELALLOC_WAIT) { - flush_state++; - continue; - } - if (states[flush_state] == COMMIT_TRANS) - flush_bytes = ticket->bytes; - } - - flush_space(fs_info, space_info, flush_bytes, states[flush_state]); + flush_space(fs_info, space_info, U64_MAX, states[flush_state]); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); @@ -1110,11 +1098,6 @@ again: spin_unlock(&space_info->lock); flush_state++; } - if (commit_cycles) { - commit_cycles--; - flush_state = 0; - goto again; - } } static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, From bb86bd3db82ed6d28d4ab4ed33c7ee3b27290e49 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:30 -0400 Subject: [PATCH 023/161] btrfs: don't force commit if we are data We used to unconditionally commit the transaction at least 2 times and then on the 3rd try check against pinned space to make sure committing the transaction was worth the effort. This is overkill, we know nobody is going to steal our reservation, and if we can't make our reservation with the pinned amount simply bail out. This also cleans up the passing of bytes_needed to may_commit_transaction, as that was the thing we added into place in order to accomplish this behavior. We no longer need it so remove that mess. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 244cd2a61386..c229dffa46d8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -579,8 +579,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, * will return -ENOSPC. */ static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 bytes_needed) + struct btrfs_space_info *space_info) { struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; @@ -588,24 +587,13 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *trans; u64 reclaim_bytes = 0; + u64 bytes_needed = 0; u64 cur_free_bytes = 0; - bool do_commit = false; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; - /* - * If we are data and have passed in U64_MAX we just want to - * unconditionally commit the transaction to match the previous data - * flushing behavior. 
- */ - if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && - bytes_needed == U64_MAX) { - do_commit = true; - goto check_pinned; - } - spin_lock(&space_info->lock); cur_free_bytes = btrfs_space_info_used(space_info, true); if (cur_free_bytes < space_info->total_bytes) @@ -631,7 +619,6 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, if (!bytes_needed) return 0; -check_pinned: trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -641,8 +628,7 @@ check_pinned: * we have block groups that are going to be freed, allowing us to * possibly do a chunk allocation the next loop through. */ - if (do_commit || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || __percpu_counter_compare(&space_info->total_bytes_pinned, bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) @@ -759,7 +745,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, btrfs_wait_on_delayed_iputs(fs_info); break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info, num_bytes); + ret = may_commit_transaction(fs_info, space_info); break; default: ret = -ENOSPC; From 327feeeb2e9bc68d2af1c581394ec6706f600bb9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:31 -0400 Subject: [PATCH 024/161] btrfs: run delayed iputs before committing the transaction for data Before we were waiting on iputs after we committed the transaction, but this doesn't really make much sense. We want to reclaim any space we may have in order to be more likely to commit the transaction, due to pinned space being added by running the delayed iputs. Fix this by making delayed iputs run before committing the transaction. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c229dffa46d8..6d82c444ceff 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1022,8 +1022,8 @@ static const enum btrfs_flush_state evict_flush_states[] = { static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, - COMMIT_TRANS, RUN_DELAYED_IPUTS, + COMMIT_TRANS, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, From cb3e3930459972dbafc274bcde3fdc1462429391 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:32 -0400 Subject: [PATCH 025/161] btrfs: flush delayed refs when trying to reserve data space We can end up with freed extents in the delayed refs, and thus may_commit_transaction() may not think we have enough pinned space to commit the transaction and we'll ENOSPC early. Handle this by running the delayed refs in order to make sure pinned is uptodate before we try to commit the transaction. 
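To make the accounting problem concrete, here is a tiny standalone sketch (illustrative userspace C, invented names) of how frees that are still sitting in the delayed refs are invisible to a pinned-space check until they are run:

/*
 * Userspace sketch (illustrative only): extents freed into the delayed
 * refs are not yet counted as pinned, so a commit decision made before
 * running them can wrongly conclude that committing won't reclaim
 * enough space.
 */
#include <stdio.h>

struct space { unsigned long pinned; unsigned long delayed_freed; };

static int may_commit(const struct space *s, unsigned long needed)
{
	/* Mirrors the idea of comparing pinned bytes to the request. */
	return s->pinned >= needed;
}

static void run_delayed_refs(struct space *s)
{
	/* Running delayed refs moves the pending frees into pinned. */
	s->pinned += s->delayed_freed;
	s->delayed_freed = 0;
}

int main(void)
{
	struct space s = { .pinned = 4096, .delayed_freed = 65536 };
	unsigned long needed = 32768;

	printf("before delayed refs: may_commit=%d\n", may_commit(&s, needed));
	run_delayed_refs(&s);
	printf("after delayed refs:  may_commit=%d\n", may_commit(&s, needed));
	return 0;
}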
Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 6d82c444ceff..8496518297f3 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1023,6 +1023,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, RUN_DELAYED_IPUTS, + FLUSH_DELAYED_REFS, COMMIT_TRANS, }; From 5705674081cee751659a6adb4849645a566a6cf4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:33 -0400 Subject: [PATCH 026/161] btrfs: do async reclaim for data reservations Now that we have the data ticketing stuff in place, move normal data reservations to use an async reclaim helper to satisfy tickets. Before we could have multiple tasks race in and both allocate chunks, resulting in more data chunks than we would necessarily need. Serializing these allocations and making a single thread responsible for flushing will only allocate chunks as needed, as well as cut down on transaction commits and other flush related activities. Priority reservations will still work as they have before, simply trying to allocate a chunk until they can make their reservation. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +- fs/btrfs/disk-io.c | 3 +- fs/btrfs/space-info.c | 117 ++++++++++++++++++++++++++++++------------ fs/btrfs/super.c | 1 + 4 files changed, 89 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 098f006fc5a3..f2d4c14fc273 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -494,7 +494,7 @@ enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_DONE = 2, }; -void btrfs_init_async_reclaim_work(struct work_struct *work); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ struct reloc_control; @@ -912,6 +912,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. 
*/ struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b1f6662f89c4..a496219baa39 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2753,7 +2753,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->check_integrity_print_mask = 0; #endif btrfs_init_balance(fs_info); - btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); + btrfs_init_async_reclaim_work(fs_info); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -4056,6 +4056,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 8496518297f3..bbddb8b5eff1 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -998,9 +998,79 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +static const enum btrfs_flush_state data_flush_states[] = { + FLUSH_DELALLOC_WAIT, + RUN_DELAYED_IPUTS, + FLUSH_DELAYED_REFS, + COMMIT_TRANS, +}; + +static void btrfs_async_reclaim_data_space(struct work_struct *work) { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 last_tickets_id; + int flush_state = 0; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + } + + while (flush_state < ARRAY_SIZE(data_flush_states)) { + flush_space(fs_info, space_info, U64_MAX, + data_flush_states[flush_state]); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = 0; + } + + if (flush_state >= ARRAY_SIZE(data_flush_states)) { + if (space_info->full) { + if (maybe_fail_all_tickets(fs_info, space_info)) + flush_state = 0; + else + space_info->flush = 0; + } else { + flush_state = 0; + } + } + spin_unlock(&space_info->lock); + } +} + +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) +{ + INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1020,13 +1090,6 @@ static const enum btrfs_flush_state evict_flush_states[] = { COMMIT_TRANS, }; -static const enum btrfs_flush_state data_flush_states[] = { - FLUSH_DELALLOC_WAIT, - RUN_DELAYED_IPUTS, - 
FLUSH_DELAYED_REFS, - COMMIT_TRANS, -}; - static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, @@ -1059,12 +1122,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - struct reserve_ticket *ticket, - const enum btrfs_flush_state *states, - int states_nr) + struct reserve_ticket *ticket) { - int flush_state = 0; - while (!space_info->full) { flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); spin_lock(&space_info->lock); @@ -1074,17 +1133,6 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, } spin_unlock(&space_info->lock); } - - while (flush_state < states_nr) { - flush_space(fs_info, space_info, U64_MAX, states[flush_state]); - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - flush_state++; - } } static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, @@ -1139,6 +1187,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, int ret; switch (flush) { + case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: wait_reserve_ticket(fs_info, space_info, ticket); @@ -1153,12 +1202,8 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; - case BTRFS_RESERVE_FLUSH_DATA: - priority_reclaim_data_space(fs_info, space_info, ticket, - data_flush_states, ARRAY_SIZE(data_flush_states)); - break; case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: - priority_reclaim_data_space(fs_info, space_info, ticket, NULL, 0); + priority_reclaim_data_space(fs_info, space_info, ticket); break; default: ASSERT(0); @@ -1223,6 +1268,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { + struct work_struct *async_work; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -1231,6 +1277,11 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ASSERT(orig_bytes); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + if (flush == BTRFS_RESERVE_FLUSH_DATA) + async_work = &fs_info->async_data_reclaim_work; + else + async_work = &fs_info->async_reclaim_work; + spin_lock(&space_info->lock); ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); @@ -1272,7 +1323,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); if (flush == BTRFS_RESERVE_FLUSH_ALL || - flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { space_info->flush = 1; @@ -1280,8 +1332,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + queue_work(system_unbound_wq, async_work); } } else { list_add_tail(&ticket.list, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 25967ecaaf0a..3ebe7240c63d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1871,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * the filesystem is busy. 
*/ cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); btrfs_discard_cleanup(fs_info); From 1a7a92c8ddcd1edc4a5407de8f56edc6cfdf394a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:34 -0400 Subject: [PATCH 027/161] btrfs: add a comment explaining the data flush steps The data flushing steps are not obvious to people other than myself and Chris. Write a giant comment explaining the reasoning behind each flush step for data as well as why it is in that particular order. Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bbddb8b5eff1..71aa9e0de61e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -998,6 +998,53 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } +/* + * FLUSH_DELALLOC_WAIT: + * Space is freed from flushing delalloc in one of two ways. + * + * 1) compression is on and we allocate less space than we reserved + * 2) we are overwriting existing space + * + * For #1 that extra space is reclaimed as soon as the delalloc pages are + * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent + * length to ->bytes_reserved, and subtracts the reserved space from + * ->bytes_may_use. + * + * For #2 this is trickier. Once the ordered extent runs we will drop the + * extent in the range we are overwriting, which creates a delayed ref for + * that freed extent. This however is not reclaimed until the transaction + * commits, thus the next stages. + * + * RUN_DELAYED_IPUTS + * If we are freeing inodes, we want to make sure all delayed iputs have + * completed, because they could have been on an inode with i_nlink == 0, and + * thus have been truncated and freed up space. But again this space is not + * immediately re-usable, it comes in the form of a delayed ref, which must be + * run and then the transaction must be committed. + * + * FLUSH_DELAYED_REFS + * The above two cases generate delayed refs that will affect + * ->total_bytes_pinned. However this counter can be inconsistent with + * reality if there are outstanding delayed refs. This is because we adjust + * the counter based solely on the current set of delayed refs and disregard + * any on-disk state which might include more refs. So for example, if we + * have an extent with 2 references, but we only drop 1, we'll see that there + * is a negative delayed ref count for the extent and assume that the space + * will be freed, and thus increase ->total_bytes_pinned. + * + * Running the delayed refs gives us the actual real view of what will be + * freed at the transaction commit time. This stage will not actually free + * space for us, it just makes sure that may_commit_transaction() has all of + * the information it needs to make the right decision. + * + * COMMIT_TRANS + * This is where we reclaim all of the pinned space generated by the previous + * two stages. We will not commit the transaction if we don't think we're + * likely to satisfy our request, which means if our current free space + + * total_bytes_pinned < reservation we will not commit. This is why the + * previous states are actually important, to make sure we know for sure + * whether committing the transaction will allow us to make progress. 
+ */ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, RUN_DELAYED_IPUTS, FLUSH_DELAYED_REFS, COMMIT_TRANS, From c4923027bd58cdccbe54e13298a4d914668a56da Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Aug 2020 16:56:59 -0400 Subject: [PATCH 028/161] btrfs: fix possible infinite loop in data async reclaim Dave reported an issue where generic/102 would sometimes hang. This turned out to be because we'd get into this spot where we were no longer making progress on data reservations because our exit condition was not met. The loop is basically while (!space_info->full && !list_empty(&space_info->tickets)) flush_space(space_info, flush_state); where flush_state cycles through our various flush states, which do not include ALLOC_CHUNK_FORCE. This is because we actually lead with allocating chunks, and so the assumption was that once you got to the actual flushing states you could no longer allocate chunks. This was a stupid assumption, because you could have deleted block groups that would be reclaimed by a transaction commit, thus unsetting space_info->full. This is essentially what happens with generic/102, and so sometimes you'd get stuck in the flushing loop because we weren't allocating chunks, but flushing space wasn't giving us what we needed to make progress. Fix this by adding ALLOC_CHUNK_FORCE to the end of our flushing states; that way we will eventually bail out, because we do end up with space_info->full set if we freed a chunk previously. Otherwise, as is the case for this test, we'll allocate our chunk and continue on our happy merry way. Reported-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 71aa9e0de61e..b733718f45d3 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1044,12 +1044,18 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) * total_bytes_pinned < reservation we will not commit. This is why the * previous states are actually important, to make sure we know for sure * whether committing the transaction will allow us to make progress. + * + * ALLOC_CHUNK_FORCE + * For data we start with alloc chunk force, however we could have been full + * before, and then the transaction commit could have freed new block groups, + * so if we now have space to allocate do the force chunk allocation. */ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, RUN_DELAYED_IPUTS, FLUSH_DELAYED_REFS, COMMIT_TRANS, + ALLOC_CHUNK_FORCE, }; From e21139c621ada48f2bc382865ea34dd49d45c2a8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 13 Aug 2020 14:33:52 +0800 Subject: [PATCH 029/161] btrfs: cleanup calculation of lockend in lock_and_cleanup_extent_if_need() We're just rounding up to sectorsize to calculate the lockend. There is no need for the intermediate length calculation; a direct round_up() is enough.
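The equivalence holds because start_pos (the round_down of pos to sectorsize, as in the code below) is itself sector-aligned, so adding it inside or outside the round_up makes no difference. A quick standalone check (userspace re-implementations of the helpers, assuming a power-of-two sectorsize) that the old and new formulas agree:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace stand-ins for the kernel helpers, valid for power-of-two align. */
static uint64_t round_down(uint64_t x, uint64_t align) { return x & ~(align - 1); }
static uint64_t round_up(uint64_t x, uint64_t align) { return (x + align - 1) & ~(align - 1); }

int main(void)
{
	const uint64_t sectorsize = 4096;

	for (uint64_t pos = 0; pos < 4 * sectorsize; pos += 509) {
		for (uint64_t write_bytes = 1; write_bytes < 3 * sectorsize; write_bytes += 733) {
			uint64_t start_pos = round_down(pos, sectorsize);
			/* Old formula. */
			uint64_t old_last = start_pos +
				round_up(pos + write_bytes - start_pos, sectorsize) - 1;
			/* New formula. */
			uint64_t new_last = round_up(pos + write_bytes, sectorsize) - 1;
			assert(old_last == new_last);
		}
	}
	printf("lockend formulas agree for all tested offsets\n");
	return 0;
}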
Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4507c3d09399..de14ed402390 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1477,9 +1477,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); - last_pos = start_pos - + round_up(pos + write_bytes - start_pos, - fs_info->sectorsize) - 1; + last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; From 9e6df7cedfdf3e2469aa46fb772ca70b422132ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 10:56:00 +0200 Subject: [PATCH 030/161] btrfs: remove const from btrfs_feature_set_name The function btrfs_feature_set_name returns a const char pointer, the second const is not necessary and reported as a warning: In file included from fs/btrfs/space-info.c:6: fs/btrfs/sysfs.h:16:1: warning: type qualifiers ignored on function return type [-Wignored-qualifiers] 16 | const char * const btrfs_feature_set_name(enum btrfs_feature_set set); | ^~~~~ Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 2 +- fs/btrfs/sysfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index d7c58c89fb9c..fcc8757b4e93 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -973,7 +973,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_INCOMPAT] = "incompat", }; -const char * const btrfs_feature_set_name(enum btrfs_feature_set set) +const char *btrfs_feature_set_name(enum btrfs_feature_set set) { return btrfs_feature_set_names[set]; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c9efa15f96e0..4217823e255c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -13,7 +13,7 @@ enum btrfs_feature_set { }; char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -const char * const btrfs_feature_set_name(enum btrfs_feature_set set); +const char *btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, From cb4c9198302b02c3f0eaf4267636e5ce1f4a4765 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 10:58:38 +0200 Subject: [PATCH 031/161] btrfs: compression: move declarations to header The declarations of compression algorithm callbacks are defined in the .c file as they're used from there. Compiler warns that there are no declarations for public functions when compiling lzo.c/zlib.c/zstd.c. Fix that by moving the declarations to the header as it's the common place for all of them. 
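As a generic illustration of the pattern (ordinary C, not the btrfs sources; names here are invented), a prototype visible to both the implementation and its users silences warnings such as -Wmissing-prototypes and keeps declaration and definition from drifting apart:

#include <stdio.h>

/* What would live in the shared header (shown inline to stay self-contained). */
int do_compress(const char *in, char *out, unsigned long len);

/* The definition now matches a visible prototype, so no warning when the
 * implementation file is built with -Wmissing-prototypes. */
int do_compress(const char *in, char *out, unsigned long len)
{
	for (unsigned long i = 0; i < len; i++)
		out[i] = in[i];	/* trivial stand-in body */
	return 0;
}

int main(void)
{
	char out[6];

	do_compress("hello", out, sizeof(out));
	printf("%s\n", out);
	return 0;
}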
Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/compression.c | 35 ----------------------------------- fs/btrfs/compression.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1ab56a734e70..eeface30facd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -29,41 +29,6 @@ #include "extent_io.h" #include "extent_map.h" -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *zlib_alloc_workspace(unsigned int level); -void zlib_free_workspace(struct list_head *ws); -struct list_head *zlib_get_workspace(unsigned int level); - -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); -void lzo_free_workspace(struct list_head *ws); - -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -void zstd_init_workspace_manager(void); -void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); -void zstd_put_workspace(struct list_head *ws); - static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9f3dbe372631..8001b700ea3a 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *zlib_alloc_workspace(unsigned int level); +void zlib_free_workspace(struct list_head *ws); +struct list_head *zlib_get_workspace(unsigned int level); + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); 
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *lzo_alloc_workspace(unsigned int level); +void lzo_free_workspace(struct list_head *ws); + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +void zstd_init_workspace_manager(void); +void zstd_cleanup_workspace_manager(void); +struct list_head *zstd_alloc_workspace(unsigned int level); +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(unsigned int level); +void zstd_put_workspace(struct list_head *ws); + #endif From 0af447d0507b1623cd4b94ad941460951e141783 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 12:08:37 +0200 Subject: [PATCH 032/161] btrfs: remove unnecessarily shadowed variables In btrfs_orphan_cleanup, there's another instance of fs_info, but it's the same as the one we already have. In btrfs_backref_finish_upper_links, rb_node is the same type and is used as a temporary cursor into the tree. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/backref.c | 1 - fs/btrfs/inode.c | 1 - 2 files changed, 2 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ea1c28ccb44f..b3268f4ea5f3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, while (!list_empty(&pending_edge)) { struct btrfs_backref_node *upper; struct btrfs_backref_node *lower; - struct rb_node *rb_node; edge = list_first_entry(&pending_edge, struct btrfs_backref_edge, list[UPPER]); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 48b8b5f09f47..50d9c335271a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3055,7 +3055,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret == -ENOENT && root == fs_info->tree_root) { struct btrfs_root *dead_root; - struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; /* From 8bb1cf1ba639d271983be29db68d30e2eebabb98 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 12:12:38 +0200 Subject: [PATCH 033/161] btrfs: scrub: rename ratelimit state variable to avoid shadowing There's already a _rs defined within ctree.h:btrfs_printk_ratelimited; local variables should not use a leading _, to avoid such name clashes with macro-local variables.
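The hazard is easy to reproduce outside the kernel. A minimal sketch (the macro below is only a stand-in for btrfs_printk_ratelimited, not its real definition; it just mimics the part that matters, a macro-local variable named _rs):

#include <stdio.h>

/* Stand-in macro mimicking a macro-local rate-limit state named _rs. */
#define printk_ratelimited(fmt, ...)					\
do {									\
	static int _rs = 1;	/* the macro's own state */		\
	if (_rs)							\
		printf(fmt, ##__VA_ARGS__);	/* GNU ## extension */	\
} while (0)

int main(void)
{
	static int _rs = 0;	/* caller's local with the same name */

	/*
	 * The argument below is pasted inside the macro body, where the
	 * macro's _rs (== 1) shadows ours (== 0), so this prints 1.
	 */
	printk_ratelimited("_rs seen inside the macro: %d\n", _rs);
	return 0;
}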
Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 354ab9985a34..cf63f1e27a27 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int success; bool full_stripe_locked; unsigned int nofs_flag; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); @@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("i/o error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.csum_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.verify_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) From 1b51d6fce45ec1f2279f06f57fe1c8a703437b27 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 12:16:57 +0200 Subject: [PATCH 034/161] btrfs: send: remove indirect callback parameter for changed_cb There's a custom callback passed to btrfs_compare_trees which happens to be named exactly the same as the existing function implementing it. This is confusing and the indirection is not necessary for our needs. The compiler is clever enough to call it directly, so there's effectively no change. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9813a5b075a..7c7c09fc65e8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -278,11 +278,6 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, - struct btrfs_path *right_path, - struct btrfs_key *key, - enum btrfs_compare_tree_result result, - void *ctx); __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, @@ -6692,8 +6687,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately.
*/ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) + struct btrfs_root *right_root, void *ctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6960,8 +6954,7 @@ static int send_subvol(struct send_ctx *sctx) goto out; if (sctx->parent_root) { - ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, - changed_cb, sctx); + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); if (ret < 0) goto out; ret = finish_inode_if_needed(sctx, 1); From 5522a27e59c62b3fb60448eb6c3640cf307247bb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Aug 2020 12:43:37 +0100 Subject: [PATCH 035/161] btrfs: do not take the log_mutex of the subvolume when pinning the log During a rename we pin the log to make sure no one commits a log that reflects an ongoing rename operation, as it might result in a committed log where it recorded the unlink of the old name without having recorded the new name. However we are taking the subvolume's log_mutex before incrementing the log_writers counter, which is not necessary since that counter is atomic and we only remove the old name from the log and add the new name to the log after we have incremented log_writers, ensuring that no one can commit the log after we have removed the old name from the log and before we added the new name to the log. By taking the log_mutex lock we are just adding unnecessary contention on the lock, which can become visible for workloads that mix renames with fsyncs, writes for files opened with O_SYNC and unlink operations (if the inode or its parent were fsynced before in the current transaction). So just remove the lock and unlock of the subvolume's log_mutex at btrfs_pin_log_trans(). Using dbench, which mixes different types of operations that end up taking that mutex (fsyncs, renames, unlinks and writes into files opened with O_SYNC) revealed some small gains. The following script that calls dbench was used: #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/btrfs MOUNT_OPTIONS="-o ssd -o space_cache=v2" MKFS_OPTIONS="-m single -d single" THREADS=32 echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -s -t 600 -D $MNT $THREADS umount $MNT The test was run on bare metal, no virtualization, on a box with 12 cores (Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel configuration that is the default of typical distributions (debian in this case), without debug options enabled (kasan, kmemleak, slub debug, debug of page allocations, lock debugging, etc). 
Results before this patch: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 4410848 0.017 738.640 Close 3240222 0.001 0.834 Rename 186850 7.478 1272.476 Unlink 890875 0.128 785.018 Deltree 128 2.846 12.081 Mkdir 64 0.002 0.003 Qpathinfo 3997659 0.009 11.171 Qfileinfo 701307 0.001 0.478 Qfsinfo 733494 0.002 1.103 Sfileinfo 359362 0.004 3.266 Find 1546226 0.041 4.128 WriteX 2202803 7.905 1376.989 ReadX 6917775 0.003 3.887 LockX 14392 0.002 0.043 UnlockX 14392 0.001 0.085 Flush 309225 0.128 1033.936 Throughput 231.555 MB/sec (sync open) 32 clients 32 procs max_latency=1376.993 ms Results after this patch: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 4603244 0.017 232.776 Close 3381299 0.001 1.041 Rename 194871 7.251 1073.165 Unlink 929730 0.133 119.233 Deltree 128 2.871 10.199 Mkdir 64 0.002 0.004 Qpathinfo 4171343 0.009 11.317 Qfileinfo 731227 0.001 1.635 Qfsinfo 765079 0.002 3.568 Sfileinfo 374881 0.004 1.220 Find 1612964 0.041 4.675 WriteX 2296720 7.569 1178.204 ReadX 7213633 0.003 3.075 LockX 14976 0.002 0.076 UnlockX 14976 0.001 0.061 Flush 322635 0.102 579.505 Throughput 241.4 MB/sec (sync open) 32 clients 32 procs max_latency=1178.207 ms (+4.3% throughput, -14.4% max latency) Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a6e69378b706..4c3565c282e2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -215,9 +215,7 @@ static int join_running_log_trans(struct btrfs_root *root) */ void btrfs_pin_log_trans(struct btrfs_root *root) { - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); } /* From 75b463d2b47aef96fe1dc3e0237629963034764b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Aug 2020 12:43:48 +0100 Subject: [PATCH 036/161] btrfs: do not commit logs and transactions during link and rename operations Since commit d4682ba03ef618 ("Btrfs: sync log after logging new name") we started to commit logs, and fallback to transaction commits when we failed to log the new names or commit the logs, after link and rename operations when the target inodes (or their parents) were previously logged in the current transaction. This was to avoid losing directories despite an explicit fsync on them when they are ancestors of some inode that got a new named logged, due to a link or rename operation. However that adds the cost of starting IO and waiting for it to complete, which can cause higher latencies for applications. Instead of doing that, just make sure that when we log a new name for an inode we don't mark any of its ancestors as logged, so that if any one does an fsync against any of them, without doing any other change on them, the fsync commits the log. This way we only pay the cost of a log commit (or a transaction commit if something goes wrong or a new block group was created) if the application explicitly asks to fsync any of the parent directories. Using dbench, which mixes several filesystems operations including renames, revealed some significant latency gains. 
The following script that uses dbench was used to test this: #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/btrfs MOUNT_OPTIONS="-o ssd -o space_cache=v2" MKFS_OPTIONS="-m single -d single" THREADS=16 echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -t 300 -D $MNT $THREADS umount $MNT The test was run on bare metal, no virtualization, on a box with 12 cores (Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel configuration that is the default of typical distributions (debian in this case), without debug options enabled (kasan, kmemleak, slub debug, debug of page allocations, lock debugging, etc). Results before this patch: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 10750455 0.011 155.088 Close 7896674 0.001 0.243 Rename 455222 2.158 1101.947 Unlink 2171189 0.067 121.638 Deltree 256 2.425 7.816 Mkdir 128 0.002 0.003 Qpathinfo 9744323 0.006 21.370 Qfileinfo 1707092 0.001 0.146 Qfsinfo 1786756 0.001 11.228 Sfileinfo 875612 0.003 21.263 Find 3767281 0.025 9.617 WriteX 5356924 0.011 211.390 ReadX 16852694 0.003 9.442 LockX 35008 0.002 0.119 UnlockX 35008 0.001 0.138 Flush 753458 4.252 1102.249 Throughput 1128.35 MB/sec 16 clients 16 procs max_latency=1102.255 ms Results after this patch: 16 clients, after Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 11471098 0.012 448.281 Close 8426396 0.001 0.925 Rename 485746 0.123 267.183 Unlink 2316477 0.080 63.433 Deltree 288 2.830 11.144 Mkdir 144 0.003 0.010 Qpathinfo 10397420 0.006 10.288 Qfileinfo 1822039 0.001 0.169 Qfsinfo 1906497 0.002 14.039 Sfileinfo 934433 0.004 2.438 Find 4019879 0.026 10.200 WriteX 5718932 0.011 200.985 ReadX 17981671 0.003 10.036 LockX 37352 0.002 0.076 UnlockX 37352 0.001 0.109 Flush 804018 5.015 778.033 Throughput 1201.98 MB/sec 16 clients 16 procs max_latency=778.036 ms (+6.5% throughput, -29.4% max latency, -75.8% rename latency) Test case generic/498 from fstests tests the scenario that the previously mentioned commit fixed. 
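The bookkeeping idea can be sketched in a few lines of standalone C (invented names, not the real tree-log code): an fsync skips syncing the log when the inode already looks logged in the current transaction, which is exactly why ancestors must not be flagged as logged when we merely log a new name:

#include <stdio.h>

struct inode { unsigned long logged_trans; };

static unsigned long cur_trans = 42;

/* Returns 1 if the fsync actually syncs the log. */
static int fsync_inode(struct inode *inode)
{
	if (inode->logged_trans == cur_trans)
		return 0;	/* looks logged already: skip the log sync */
	inode->logged_trans = cur_trans;
	return 1;		/* sync the log */
}

static void log_new_name(struct inode *ancestor, int mark_ancestor)
{
	/* The fix: do NOT flag ancestors as logged when logging a new name. */
	if (mark_ancestor)
		ancestor->logged_trans = cur_trans;
}

int main(void)
{
	struct inode dir1 = { 0 }, dir2 = { 0 };

	log_new_name(&dir1, 1);	/* old behaviour */
	log_new_name(&dir2, 0);	/* behaviour after this patch */

	printf("fsync(dir1) syncs log: %d\n", fsync_inode(&dir1)); /* 0: dir lost */
	printf("fsync(dir2) syncs log: %d\n", fsync_inode(&dir2)); /* 1: dir safe */
	return 0;
}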
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 115 +++++--------------------------------------- fs/btrfs/tree-log.c | 100 +++++++++++++++++--------------------- fs/btrfs/tree-log.h | 14 ++---- 3 files changed, 60 insertions(+), 169 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 50d9c335271a..e8a35db53369 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6373,7 +6373,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - int ret; err = btrfs_update_inode(trans, root, inode); if (err) @@ -6388,12 +6387,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, - true, NULL); - if (ret == BTRFS_NEED_TRANS_COMMIT) { - err = btrfs_commit_transaction(trans); - trans = NULL; - } + btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } fail: @@ -8778,27 +8772,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); - struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; - struct btrfs_log_ctx ctx_root; - struct btrfs_log_ctx ctx_dest; - bool sync_log_root = false; - bool sync_log_dest = false; - bool commit_transaction = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; - btrfs_init_log_ctx(&ctx_root, old_inode); - btrfs_init_log_ctx(&ctx_dest, new_inode); - /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -8940,30 +8926,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { - parent = new_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx_root); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_root = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { - if (!commit_transaction) { - parent = old_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), - BTRFS_I(new_dir), parent, - false, &ctx_dest); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_dest = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; - } + btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), + old_dentry->d_parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -8996,46 +8966,13 @@ out_fail: dest_log_pinned = false; } } - if (!ret && sync_log_root && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, - &ctx_root); - if (ret) - commit_transaction = true; - } - if (!ret && sync_log_dest && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, - &ctx_dest); - if (ret) - commit_transaction = true; - } - if (commit_transaction) { - /* - * We may have set commit_transaction when logging the new name - * in the 
destination root, in which case we left the source - * root context in the list of log contextes. So make sure we - * remove it to avoid invalid memory accesses, since the context - * was allocated in our stack frame. - */ - if (sync_log_root) { - mutex_lock(&root->log_mutex); - list_del_init(&ctx_root.list); - mutex_unlock(&root->log_mutex); - } - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - ASSERT(list_empty(&ctx_root.list)); - ASSERT(list_empty(&ctx_dest.list)); - return ret; } @@ -9103,11 +9040,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); u64 index = 0; int ret; + int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; - struct btrfs_log_ctx ctx; - bool sync_log = false; - bool commit_transaction = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9257,17 +9192,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { - struct dentry *parent = new_dentry->d_parent; - - btrfs_init_log_ctx(&ctx, old_inode); - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); log_pinned = false; } @@ -9304,23 +9230,8 @@ out_fail: btrfs_end_log_trans(root); log_pinned = false; } - if (!ret && sync_log) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); - if (ret) - commit_transaction = true; - } else if (sync_log) { - mutex_lock(&root->log_mutex); - list_del(&ctx.list); - mutex_unlock(&root->log_mutex); - } - if (commit_transaction) { - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4c3565c282e2..e86fe7bbba82 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -176,7 +176,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); - if (ctx) { + if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -5337,19 +5337,34 @@ log_extents: } /* - * Don't update last_log_commit if we logged that an inode exists after - * it was loaded to memory (full_sync bit set). - * This is to prevent data loss when we do a write to the inode, then - * the inode gets evicted after all delalloc was flushed, then we log - * it exists (due to a rename for example) and then fsync it. This last - * fsync would do nothing (not logging the extents previously written). 
+ * If we are logging that an ancestor inode exists as part of logging a + * new name from a link or rename operation, don't mark the inode as + * logged - otherwise if an explicit fsync is made against an ancestor, + * the fsync considers the inode in the log and doesn't sync the log, + * resulting in the ancestor missing after a power failure unless the + * log was synced as part of an fsync against any other unrelated inode. + * So keep it simple for this case and just don't flag the ancestors as + * logged. */ - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - if (inode_only != LOG_INODE_EXISTS || - !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); + if (!ctx || + !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && + &inode->vfs_inode != ctx->inode)) { + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* + * Don't update last_log_commit if we logged that an inode exists + * after it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, + * then the inode gets evicted after all delalloc was flushed, + * then we log it exists (due to a rename for example) and then + * fsync it. This last fsync would do nothing (not logging the + * extents previously written). + */ + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + } out_unlock: mutex_unlock(&inode->log_mutex); @@ -6369,26 +6384,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, /* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. - * - * @ctx can not be NULL when @sync_log is false, and should be NULL when it's - * true (because it's not used). - * - * Return value depends on whether @sync_log is true or false. - * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT - * otherwise. - * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, - * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed (without attempting to sync the log). */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx) + struct dentry *parent) { struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; + struct btrfs_log_ctx ctx; /* * this will force the logging code to walk the dentry chain @@ -6403,34 +6405,18 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return sync_log ? 
BTRFS_DONT_NEED_TRANS_COMMIT : - BTRFS_DONT_NEED_LOG_SYNC; + return; - if (sync_log) { - struct btrfs_log_ctx ctx2; - - btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx2); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_TRANS_COMMIT; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - ret = btrfs_sync_log(trans, inode->root, &ctx2); - if (ret) - return BTRFS_NEED_TRANS_COMMIT; - return BTRFS_DONT_NEED_TRANS_COMMIT; - } - - ASSERT(ctx); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, ctx); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_LOG_SYNC; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - return BTRFS_NEED_LOG_SYNC; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + ctx.logging_new_name = true; + /* + * We don't care about the return value. If we fail to log the new name + * then we know the next attempt to sync the log will fallback to a full + * transaction commit (due to a call to btrfs_set_log_full_commit()), so + * we don't need to worry about getting a log committed that has an + * inconsistent state after a rename operation. + */ + btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, &ctx); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 132e43d29034..ddfc6789d9bf 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -16,6 +16,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; bool log_new_dentries; + bool logging_new_name; struct inode *inode; struct list_head list; }; @@ -26,6 +27,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_ret = 0; ctx->log_transid = 0; ctx->log_new_dentries = false; + ctx->logging_new_name = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); } @@ -67,16 +69,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, int for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); -/* Return values for btrfs_log_new_name() */ -enum { - BTRFS_DONT_NEED_TRANS_COMMIT, - BTRFS_NEED_TRANS_COMMIT, - BTRFS_DONT_NEED_LOG_SYNC, - BTRFS_NEED_LOG_SYNC, -}; -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx); + struct dentry *parent); #endif From 487781796d302266aff993bee17d4e1ddd73445b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Aug 2020 12:43:58 +0100 Subject: [PATCH 037/161] btrfs: make fast fsyncs wait only for writeback Currently, regardless of a full or a fast fsync, we always wait for ordered extents to complete, and then start logging the inode after that. However, for fast fsyncs we can just wait for the writeback to complete; we don't need to wait for the ordered extents to complete, since we use the list of modified extent maps to figure out which extents we must log, and we can get their checksums directly from the ordered extents that are still in flight, otherwise looking them up from the checksums tree.
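As a simplified sketch of that lookup order (standalone C, with invented names and values, not the real csum code): checksums come from a still in-flight ordered extent when one covers the range, and only otherwise from the checksum tree:

#include <stdio.h>
#include <stddef.h>

struct ordered_extent {
	unsigned long long start, len;
	unsigned int csum;		/* stand-in for the real csum list */
};

/* Ordered extents still in flight: data written back, csums not yet in the tree. */
static const struct ordered_extent in_flight[] = {
	{ 0,    4096, 0xaa },
	{ 8192, 4096, 0xbb },
};

static unsigned int csum_tree_lookup(unsigned long long start)
{
	(void)start;
	return 0xcc;	/* pretend answer from the on-disk checksum tree */
}

static unsigned int csum_for_extent(unsigned long long start)
{
	/* Prefer a still in-flight ordered extent covering the range... */
	for (size_t i = 0; i < sizeof(in_flight) / sizeof(in_flight[0]); i++)
		if (start >= in_flight[i].start &&
		    start < in_flight[i].start + in_flight[i].len)
			return in_flight[i].csum;
	/* ...and only otherwise fall back to the checksum tree. */
	return csum_tree_lookup(start);
}

int main(void)
{
	printf("csum @0    = 0x%x\n", csum_for_extent(0));	/* 0xaa */
	printf("csum @4096 = 0x%x\n", csum_for_extent(4096));	/* 0xcc */
	return 0;
}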
Until commit b5e6c3e170b770 ("btrfs: always wait on ordered extents at fsync time"), for fast fsyncs, we used to start logging without even waiting for the writeback to complete first; we would wait for it to complete after logging, while holding a transaction open, which led to performance issues when using cgroups and probably for other cases too, as waiting for IO while holding a transaction handle should be avoided as much as possible. After that, for fast fsyncs, we started to wait for ordered extents to complete before starting to log, which adds some latency to fsyncs, and we even got at least one report about a performance drop that was bisected to that particular change: https://lore.kernel.org/linux-btrfs/20181109215148.GF23260@techsingularity.net/

This change makes fast fsyncs only wait for writeback to finish before starting to log the inode, instead of waiting for both the writeback to finish and for the ordered extents to complete.

This brings back part of the logic we had that extracts checksums from in-flight ordered extents, which are not yet in the checksums tree, and makes sure transaction commits wait for the completion of ordered extents previously logged (by far most of the time they have already completed by the time a transaction commit starts, resulting in no wait at all), to avoid any data loss if an ordered extent completes after the transaction used to log an inode is committed, followed by a power failure.

When there are no other tasks accessing the checksums and the subvolume btrees, ordered extent completion is pretty fast, typically taking only 100 to 200 microseconds in my observations. However, when there are other tasks accessing these btrees, ordered extent completion can take a lot more time due to lock contention on nodes and leaves of these btrees. I've seen cases over 2 milliseconds, which starts to be significant. In particular, when we have concurrent fsyncs against different files there is a lot of contention on the checksums btree, since we have many tasks writing the checksums into the btree and other tasks that already started the logging phase are doing lookups for checksums in the btree.

This change also turns all ranged fsyncs into full ranged fsyncs, which is something we already did when not using the NO_HOLES feature or when doing a full fsync. This is to guarantee we never miss checksums when writeback was triggered for only a part of an extent, in which case we would end up logging the full extent but only the checksums for the written range, resulting in missing checksums after log replay. Allowing ranged fsyncs to operate again only on the original range, when using the NO_HOLES feature and doing a fast fsync, is doable but requires some non-trivial changes to the writeback path, which can always be worked on later if needed, but I don't think they are a very common use case.

Several tests were performed using fio for different numbers of concurrent jobs, each writing and fsyncing its own file, for both sequential and random file writes. The tests were run on bare metal, no virtualization, on a box with 12 cores (Intel i7-8700), 64GiB of RAM and an NVMe device, with a kernel configuration that is the default of typical distributions (Debian in this case), without debug options enabled (kasan, kmemleak, slub debug, debug of page allocations, lock debugging, etc).
The following script that calls fio was used:

$ cat test-fsync.sh
#!/bin/bash

DEV=/dev/nvme0n1
MNT=/mnt/btrfs
MOUNT_OPTIONS="-o ssd -o space_cache=v2"
MKFS_OPTIONS="-d single -m single"

if [ $# -ne 5 ]; then
    echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ BLOCK_SIZE [write|randwrite]"
    exit 1
fi

NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3
BLOCK_SIZE=$4
WRITE_MODE=$5

if [ "$WRITE_MODE" != "write" ] && [ "$WRITE_MODE" != "randwrite" ]; then
    echo "Invalid WRITE_MODE, must be 'write' or 'randwrite'"
    exit 1
fi

cat <<EOF > /tmp/fio-job.ini
[writers]
rw=$WRITE_MODE
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=$BLOCK_SIZE
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
EOF

echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

echo
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo

umount $MNT &> /dev/null
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT

The results were the following:

*************************
*** sequential writes ***
*************************

==== 1 job, 8GiB file, fsync frequency 1, block size 64KiB ====

Before patch:
WRITE: bw=36.6MiB/s (38.4MB/s), 36.6MiB/s-36.6MiB/s (38.4MB/s-38.4MB/s), io=8192MiB (8590MB), run=223689-223689msec

After patch:
WRITE: bw=40.2MiB/s (42.1MB/s), 40.2MiB/s-40.2MiB/s (42.1MB/s-42.1MB/s), io=8192MiB (8590MB), run=203980-203980msec
(+9.8% throughput, -8.8% runtime)

==== 2 jobs, 4GiB files, fsync frequency 1, block size 64KiB ====

Before patch:
WRITE: bw=35.8MiB/s (37.5MB/s), 35.8MiB/s-35.8MiB/s (37.5MB/s-37.5MB/s), io=8192MiB (8590MB), run=228950-228950msec

After patch:
WRITE: bw=43.5MiB/s (45.6MB/s), 43.5MiB/s-43.5MiB/s (45.6MB/s-45.6MB/s), io=8192MiB (8590MB), run=188272-188272msec
(+21.5% throughput, -17.8% runtime)

==== 4 jobs, 2GiB files, fsync frequency 1, block size 64KiB ====

Before patch:
WRITE: bw=50.1MiB/s (52.6MB/s), 50.1MiB/s-50.1MiB/s (52.6MB/s-52.6MB/s), io=8192MiB (8590MB), run=163446-163446msec

After patch:
WRITE: bw=64.5MiB/s (67.6MB/s), 64.5MiB/s-64.5MiB/s (67.6MB/s-67.6MB/s), io=8192MiB (8590MB), run=126987-126987msec
(+28.7% throughput, -22.3% runtime)

==== 8 jobs, 1GiB files, fsync frequency 1, block size 64KiB ====

Before patch:
WRITE: bw=64.0MiB/s (68.1MB/s), 64.0MiB/s-64.0MiB/s (68.1MB/s-68.1MB/s), io=8192MiB (8590MB), run=126075-126075msec

After patch:
WRITE: bw=86.8MiB/s (91.0MB/s), 86.8MiB/s-86.8MiB/s (91.0MB/s-91.0MB/s), io=8192MiB (8590MB), run=94358-94358msec
(+35.6% throughput, -25.2% runtime)

==== 16 jobs, 512MiB files, fsync frequency 1, block size 64KiB ====

Before patch:
WRITE: bw=79.8MiB/s (83.6MB/s), 79.8MiB/s-79.8MiB/s (83.6MB/s-83.6MB/s), io=8192MiB (8590MB), run=102694-102694msec

After patch:
WRITE: bw=107MiB/s (112MB/s), 107MiB/s-107MiB/s (112MB/s-112MB/s), io=8192MiB (8590MB), run=76446-76446msec
(+34.1% throughput, -25.6% runtime)

==== 32 jobs, 512MiB files, fsync frequency 1, block size 64KiB ====

Before patch:
WRITE: bw=93.2MiB/s (97.7MB/s), 93.2MiB/s-93.2MiB/s (97.7MB/s-97.7MB/s), io=16.0GiB (17.2GB), run=175836-175836msec

After patch:
WRITE: bw=111MiB/s (117MB/s), 111MiB/s-111MiB/s (117MB/s-117MB/s), io=16.0GiB (17.2GB), run=147001-147001msec
(+19.1% throughput, -16.4% runtime)

==== 64 jobs, 512MiB files, fsync frequency 1, block size 64KiB ====

Before patch:
WRITE: bw=108MiB/s (114MB/s), 108MiB/s-108MiB/s (114MB/s-114MB/s), io=32.0GiB (34.4GB), run=302656-302656msec

After patch:
WRITE: bw=133MiB/s (140MB/s), 133MiB/s-133MiB/s (140MB/s-140MB/s), io=32.0GiB (34.4GB), run=246003-246003msec
(+23.1% throughput, -18.7% runtime)
************************
*** random writes ***
************************

==== 1 job, 8GiB file, fsync frequency 16, block size 4KiB ====

Before patch:
WRITE: bw=11.5MiB/s (12.0MB/s), 11.5MiB/s-11.5MiB/s (12.0MB/s-12.0MB/s), io=8192MiB (8590MB), run=714281-714281msec

After patch:
WRITE: bw=11.6MiB/s (12.2MB/s), 11.6MiB/s-11.6MiB/s (12.2MB/s-12.2MB/s), io=8192MiB (8590MB), run=705959-705959msec
(+0.9% throughput, -1.7% runtime)

==== 2 jobs, 4GiB files, fsync frequency 16, block size 4KiB ====

Before patch:
WRITE: bw=12.8MiB/s (13.5MB/s), 12.8MiB/s-12.8MiB/s (13.5MB/s-13.5MB/s), io=8192MiB (8590MB), run=638101-638101msec

After patch:
WRITE: bw=13.1MiB/s (13.7MB/s), 13.1MiB/s-13.1MiB/s (13.7MB/s-13.7MB/s), io=8192MiB (8590MB), run=625374-625374msec
(+2.3% throughput, -2.0% runtime)

==== 4 jobs, 2GiB files, fsync frequency 16, block size 4KiB ====

Before patch:
WRITE: bw=15.4MiB/s (16.2MB/s), 15.4MiB/s-15.4MiB/s (16.2MB/s-16.2MB/s), io=8192MiB (8590MB), run=531146-531146msec

After patch:
WRITE: bw=17.8MiB/s (18.7MB/s), 17.8MiB/s-17.8MiB/s (18.7MB/s-18.7MB/s), io=8192MiB (8590MB), run=460431-460431msec
(+15.6% throughput, -13.3% runtime)

==== 8 jobs, 1GiB files, fsync frequency 16, block size 4KiB ====

Before patch:
WRITE: bw=19.9MiB/s (20.8MB/s), 19.9MiB/s-19.9MiB/s (20.8MB/s-20.8MB/s), io=8192MiB (8590MB), run=412664-412664msec

After patch:
WRITE: bw=22.2MiB/s (23.3MB/s), 22.2MiB/s-22.2MiB/s (23.3MB/s-23.3MB/s), io=8192MiB (8590MB), run=368589-368589msec
(+11.6% throughput, -10.7% runtime)

==== 16 jobs, 512MiB files, fsync frequency 16, block size 4KiB ====

Before patch:
WRITE: bw=29.3MiB/s (30.7MB/s), 29.3MiB/s-29.3MiB/s (30.7MB/s-30.7MB/s), io=8192MiB (8590MB), run=279924-279924msec

After patch:
WRITE: bw=30.4MiB/s (31.9MB/s), 30.4MiB/s-30.4MiB/s (31.9MB/s-31.9MB/s), io=8192MiB (8590MB), run=269258-269258msec
(+3.8% throughput, -3.8% runtime)

==== 32 jobs, 512MiB files, fsync frequency 16, block size 4KiB ====

Before patch:
WRITE: bw=36.9MiB/s (38.7MB/s), 36.9MiB/s-36.9MiB/s (38.7MB/s-38.7MB/s), io=16.0GiB (17.2GB), run=443581-443581msec

After patch:
WRITE: bw=41.6MiB/s (43.6MB/s), 41.6MiB/s-41.6MiB/s (43.6MB/s-43.6MB/s), io=16.0GiB (17.2GB), run=394114-394114msec
(+12.7% throughput, -11.2% runtime)

==== 64 jobs, 512MiB files, fsync frequency 16, block size 4KiB ====

Before patch:
WRITE: bw=45.9MiB/s (48.1MB/s), 45.9MiB/s-45.9MiB/s (48.1MB/s-48.1MB/s), io=32.0GiB (34.4GB), run=714614-714614msec

After patch:
WRITE: bw=48.8MiB/s (51.1MB/s), 48.8MiB/s-48.8MiB/s (51.1MB/s-51.1MB/s), io=32.0GiB (34.4GB), run=672087-672087msec
(+6.3% throughput, -6.0% runtime)

Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 5 ++ fs/btrfs/file.c | 97 ++++++++++++++-------- fs/btrfs/ordered-data.c | 59 ++++++++++++++ fs/btrfs/ordered-data.h | 11 +++ fs/btrfs/transaction.c | 10 +++ fs/btrfs/transaction.h | 7 ++ fs/btrfs/tree-log.c | 176 ++++++++++++++++++++++++---------------- fs/btrfs/tree-log.h | 18 +++- 8 files changed, 277 insertions(+), 106 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c47b6c6fea9f..00f7831d0902 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -25,6 +25,11 @@ enum { BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, + /* + * Always set under the VFS' inode lock, otherwise it can cause races + * during fsync (we start as a fast fsync and then end up in a full + * fsync racing with ordered extent completion).
+ */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index de14ed402390..b859d5ae7f80 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2114,20 +2114,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; + u64 len; + bool full_sync; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); /* - * Set the range to full if the NO_HOLES feature is not enabled. - * This is to avoid missing file extent items representing holes after - * replaying the log. + * Always set the range to a full range, otherwise we can get into + * several problems, from missing file extent items to represent holes + * when not using the NO_HOLES feature, to log tree corruption due to + * races between hole detection during logging and completion of ordered + * extents outside the range, to missing checksums due to ordered extents + * for which we flushed only a subset of their pages. */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { - start = 0; - end = LLONG_MAX; - } + start = 0; + end = LLONG_MAX; + len = (u64)LLONG_MAX + 1; /* * We write the dirty pages in the range and wait until they complete @@ -2151,19 +2155,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges and races between logging and - * completion of ordered extents for adjancent ranges - both races - * could lead to file extent items in the log with overlapping ranges. - * Do this while holding the inode lock, to avoid races with other - * tasks. + * Always check for the full sync flag while holding the inode's lock, + * to avoid races with other tasks. The flag must be either set all the + * time during logging or always off all the time while logging. */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); /* * Before we acquired the inode's lock, someone may have dirtied more @@ -2194,20 +2191,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * - * Also, the range length can be represented by u64, we have to do the - * typecasts to avoid signed overflow if it's [0, LLONG_MAX]. + * For a full fsync we wait for the ordered extents to complete while + * for a fast fsync we wait just for writeback to complete, and then + * attach the ordered extents to the transaction so that a transaction + * commit waits for their completion, to avoid data loss if we fsync, + * the current transaction commits before the ordered extents complete + * and a power failure happens right after that. 
*/ - ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1); - if (ret) { - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + if (full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + } else { + /* + * Get our ordered extents as soon as possible to avoid doing + * checksum lookups in the csum tree, and use instead the + * checksums attached to the ordered extents. + */ + btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), + &ctx.ordered_extents); + ret = filemap_fdatawait_range(inode->i_mapping, start, end); } + + if (ret) + goto out_release_extents; + atomic_inc(&root->log_batch); + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. + */ smp_mb(); if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) { + (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && + (full_sync || list_empty(&ctx.ordered_extents)))) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2223,9 +2242,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * checked called fsync. */ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } /* @@ -2242,12 +2259,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } - ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); + ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -2274,6 +2290,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + } ret = btrfs_commit_transaction(trans); } else { ret = btrfs_end_transaction(trans); @@ -2284,6 +2307,12 @@ out: if (!ret) ret = err; return ret > 0 ? 
-EIO : ret; + +out_release_extents: + btrfs_release_log_ctx_extents(&ctx); + up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; } static const struct vm_operations_struct btrfs_file_vm_ops = { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ebac13389e7e..4732c5b89460 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,6 +212,7 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); @@ -445,6 +446,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); @@ -470,6 +472,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; + bool pending; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -491,8 +494,36 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (pending) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + refcount_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; @@ -774,6 +805,34 @@ out: return entry; } +/* + * Adds all ordered extents to the given list. The list ends up sorted by the + * file_offset of the ordered extents. + */ +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *n; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + struct btrfs_ordered_extent *ordered; + + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + + ASSERT(list_empty(&ordered->log_list)); + list_add_tail(&ordered->log_list, list); + refcount_inc(&ordered->refs); + } + spin_unlock_irq(&tree->lock); +} + /* * lookup and return any extent before 'file_offset'. 
NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d61ea9c880a3..644258a7dfb1 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -56,6 +56,12 @@ enum { BTRFS_ORDERED_TRUNCATED, /* Regular IO for COW */ BTRFS_ORDERED_REGULAR, + /* Used during fsync to track already logged extents */ + BTRFS_ORDERED_LOGGED, + /* We have already logged all the csums of the ordered extent */ + BTRFS_ORDERED_LOGGED_CSUM, + /* We wait for this extent to complete in the current transaction */ + BTRFS_ORDERED_PENDING, }; struct btrfs_ordered_extent { @@ -104,6 +110,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* used for fast fsyncs */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -174,6 +183,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len); +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d2fc292ac61b..f8265dbec9c3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -292,6 +292,8 @@ loop: } cur_trans->fs_info = fs_info; + atomic_set(&cur_trans->pending_ordered, 0); + init_waitqueue_head(&cur_trans->pending_wait); atomic_set(&cur_trans->num_writers, 1); extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); @@ -2165,6 +2167,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(trans); + /* + * Wait for all ordered extents started by a fast fsync that joined this + * transaction. Otherwise if this transaction commits before the ordered + * extents complete we lose logged data after a power failure. + */ + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); + btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d60b055b8695..8241c050ba71 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -85,6 +85,13 @@ struct btrfs_transaction { spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; struct btrfs_fs_info *fs_info; + + /* + * Number of ordered extents the transaction must wait for before + * committing. These are ordered extents started by a fast fsync. 
+ */ + atomic_t pending_ordered; + wait_queue_head_t pending_wait; }; #define __TRANS_FREEZABLE (1U << 0) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e86fe7bbba82..e35cfe8cf68b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -96,8 +96,6 @@ enum { static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4080,10 +4078,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *log_root, - const struct extent_map *em) + const struct extent_map *em, + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; u64 csum_offset; u64 csum_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; LIST_HEAD(ordered_sums); int ret = 0; @@ -4092,13 +4094,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, em->block_start == EXTENT_MAP_HOLE) return 0; + list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { + const u64 ordered_end = ordered->file_offset + ordered->num_bytes; + const u64 mod_end = mod_start + mod_len; + struct btrfs_ordered_sum *sums; + + if (mod_len == 0) + break; + + if (ordered_end <= mod_start) + continue; + if (mod_end <= ordered->file_offset) + break; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this ordered + * extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered_end >= mod_end) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered_end < mod_end) { + mod_len = mod_end - ordered_end; + mod_start = ordered_end; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) + continue; + + list_for_each_entry(sums, &ordered->list, list) { + ret = log_csums(trans, inode, log_root, sums); + if (ret) + return ret; + } + } + + /* We're done, found all csums in the ordered extents. */ + if (mod_len == 0) + return 0; + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = em->mod_start - em->start; - csum_len = em->mod_len; + csum_offset = mod_start - em->start; + csum_len = mod_len; } /* block start is already adjusted for the file extent offset. 
*/ @@ -4138,7 +4198,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int extent_inserted = 0; - ret = log_extent_csums(trans, inode, log, em); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4340,10 +4400,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - const u64 start, - const u64 end) + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; @@ -4357,23 +4417,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, test_gen = root->fs_info->last_trans_committed; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { - /* - * Skip extents outside our logging range. It's important to do - * it for correctness because if we don't ignore them, we may - * log them before their ordered extent completes, and therefore - * we could log them without logging their respective checksums - * (the checksum items are added to the csum tree at the very - * end of btrfs_finish_ordered_io()). Also leave such extents - * outside of our range in the list, since we may have another - * ranged fsync in the near future that needs them. If an extent - * outside our range corresponds to a hole, log it to avoid - * leaving gaps between extents (fsck will complain when we are - * not using the NO_HOLES feature). - */ - if ((em->start > end || em->start + em->len <= start) && - em->block_start != EXTENT_MAP_HOLE) - continue; - list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive @@ -4432,8 +4475,32 @@ process: btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) + return ret; - return ret; + /* + * We have logged all extents successfully, now make sure the commit of + * the current transaction waits for the ordered extents to complete + * before it commits and wipes out the log trees, otherwise we would + * lose data if an ordered extents completes after the transaction + * commits and a power failure happens after the transaction commit. + */ + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); + + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + spin_lock_irq(&inode->ordered_tree.lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&inode->ordered_tree.lock); + } + btrfs_put_ordered_extent(ordered); + } + + return 0; } static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, @@ -4839,7 +4906,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE_ALL, - 0, LLONG_MAX, ctx); + ctx); btrfs_add_delayed_iput(inode); } } @@ -4897,7 +4964,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. 
*/ ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, 0, LLONG_MAX, ctx); + LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5110,8 +5177,6 @@ next_key: static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct btrfs_path *path; @@ -5290,7 +5355,7 @@ log_extents: } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx, start, end); + ctx); if (ret) { err = ret; goto out_unlock; @@ -5299,31 +5364,8 @@ log_extents: struct extent_map *em, *n; write_lock(&em_tree->lock); - /* - * We can't just remove every em if we're called for a ranged - * fsync - that is, one that doesn't cover the whole possible - * file range (0 to LLONG_MAX). This is because we can have - * em's that fall outside the range we're logging and therefore - * their ordered operations haven't completed yet - * (btrfs_finish_ordered_io() not invoked yet). This means we - * didn't get their respective file extent item in the fs/subvol - * tree yet, and need to let the next fast fsync (one which - * consults the list of modified extent maps) find the em so - * that it logs a matching file extent item and waits for the - * respective ordered operation to complete (if it's still - * running). - * - * Removing every em outside the range we're logging would make - * the next fast fsync not log their matching file extent items, - * therefore making us lose data after a log replay. - */ - list_for_each_entry_safe(em, n, &em_tree->modified_extents, - list) { - const u64 mod_end = em->mod_start + em->mod_len - 1; - - if (em->mod_start >= start && mod_end <= end) - list_del_init(&em->list); - } + list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) + list_del_init(&em->list); write_unlock(&em_tree->lock); } @@ -5604,7 +5646,7 @@ process_leaf: if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), - log_mode, 0, LLONG_MAX, ctx); + log_mode, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; @@ -5748,7 +5790,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), - LOG_INODE_ALL, 0, LLONG_MAX, ctx); + LOG_INODE_ALL, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) ret = 1; @@ -5799,8 +5841,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > last_committed) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_INODE_EXISTS, - 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -5855,7 +5896,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation > fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); if (ret) break; } @@ -5963,8 +6004,6 @@ out: static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - const loff_t start, - const loff_t end, int inode_only, struct btrfs_log_ctx *ctx) { @@ -6017,7 +6056,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); 
+ ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); if (ret) goto end_trans; @@ -6113,15 +6152,13 @@ end_no_trans: */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, - start, end, LOG_INODE_ALL, ctx); + LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -6416,7 +6453,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * we don't need to worry about getting a log committed that has an * inconsistent state after a rename operation. */ - btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx); + btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index ddfc6789d9bf..731bd9c029f5 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,8 @@ struct btrfs_log_ctx { bool logging_new_name; struct inode *inode; struct list_head list; + /* Only used for fast fsyncs. */ + struct list_head ordered_extents; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -30,6 +32,20 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->logging_new_name = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); +} + +static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } } static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) @@ -51,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root,

From 24646481fb19a3bd86933ec30480454381ff9860 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 19 Aug 2020 17:16:28 +0300 Subject: [PATCH 038/161] btrfs: sysfs: fix unused-but-set-variable warnings

The compilation with W=1 generates the following warnings: fs/btrfs/sysfs.c:1630:6: warning: variable 'ret' set but not used [-Wunused-but-set-variable] 1630 | int ret; | ^~~ fs/btrfs/sysfs.c:1629:6: warning: variable 'features' set but not used [-Wunused-but-set-variable] 1629 | u64 features; | ^~~~~~~~

[ The unused variables are leftover from e410e34fad91 ("Revert "btrfs: synchronize incompat feature bits with sysfs files""), which needs to be properly fixed by moving feature bit manipulation from the sysfs context. Silence the warnings to save people time; we got several reports.
] Signed-off-by: Leon Romanovsky Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fcc8757b4e93..81496ac43d09 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1630,12 +1630,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 features; - int ret; + u64 __maybe_unused features; + int __maybe_unused ret; if (!fs_info) return; + /* + * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not + * safe when called from some contexts (eg. balance) + */ features = get_features(fs_info, set); ASSERT(bit & supported_feature_masks[set]);

From 4c448ce8b48fb65536c903f1ed8a80554838508e Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Mon, 17 Aug 2020 10:56:10 -0300 Subject: [PATCH 039/161] btrfs: make read_block_group_item return void

Since its inclusion in 9afc66498a0b ("btrfs: block-group: refactor how we read one block group item") this function has always returned 0, so there is no need to check the return value.

Reviewed-by: Nikolay Borisov Signed-off-by: Marcos Paulo de Souza Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c9d7465501a4..01e8ba1da1d3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1873,7 +1873,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static int read_block_group_item(struct btrfs_block_group *cache, +static void read_block_group_item(struct btrfs_block_group *cache, struct btrfs_path *path, const struct btrfs_key *key) { @@ -1887,8 +1887,6 @@ static int read_block_group_item(struct btrfs_block_group *cache, sizeof(bgi)); cache->used = btrfs_stack_block_group_used(&bgi); cache->flags = btrfs_stack_block_group_flags(&bgi); - - return 0; } static int read_one_block_group(struct btrfs_fs_info *info, @@ -1907,9 +1905,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - ret = read_block_group_item(cache, path, key); - if (ret < 0) - goto error; + read_block_group_item(cache, path, key); set_free_space_tree_thresholds(cache);

From 154f7cb86809a3a796bffbc7a5a7ce0dee585eaa Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 20 Aug 2020 15:42:46 +0800 Subject: [PATCH 040/161] btrfs: add owner and fs_info to alloc_state io_tree

Commit 1c11b63eff2a ("btrfs: replace pending/pinned chunks lists with io tree") introduced the btrfs_device::alloc_state extent io tree, but it doesn't initialize the fs_info and owner members. This means the following features are not properly supported:

- Fs owner report for insert_state() error Without fs_info initialized, although btrfs_err() won't panic, it won't output which fs is causing the error.

- Wrong owner for trace events alloc_state will get the owner as pinned extents.

Fix this by assigning proper fs_info and owner for btrfs_device::alloc_state.
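For illustration, the fix boils down to passing the device's fs_info and a dedicated owner id when initializing the tree (a sketch of the change, taken from the diff below):

	extent_io_tree_init(fs_info, &dev->alloc_state,
			    IO_TREE_DEVICE_ALLOC_STATE, NULL);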
Fixes: 1c11b63eff2a ("btrfs: replace pending/pinned chunks lists with io tree") Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 1 + fs/btrfs/volumes.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 219a09a2b734..250b8cbaaf97 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -48,6 +48,7 @@ enum { IO_TREE_INODE_FILE_EXTENT, IO_TREE_LOG_CSUM_RANGE, IO_TREE_SELFTEST, + IO_TREE_DEVICE_ALLOC_STATE, }; struct extent_io_tree { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5d73c287314a..a028a64b4add 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -406,7 +406,7 @@ void __exit btrfs_cleanup_fs_uuids(void) * Returned struct is not linked onto any lists and must be destroyed using * btrfs_free_device. */ -static struct btrfs_device *__alloc_device(void) +static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) { struct btrfs_device *dev; @@ -433,7 +433,8 @@ static struct btrfs_device *__alloc_device(void) btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(fs_info, &dev->alloc_state, + IO_TREE_DEVICE_ALLOC_STATE, NULL); return dev; } @@ -6532,7 +6533,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - dev = __alloc_device(); + dev = __alloc_device(fs_info); if (IS_ERR(dev)) return dev;

From f85781fb505ec02891734e77f7c69a9c85c99ec3 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 17 Aug 2020 11:18:21 -0500 Subject: [PATCH 041/161] btrfs: switch to iomap for direct IO

We're using a direct I/O implementation based on buffer heads. This patch switches to the new iomap infrastructure.

Switch from __blockdev_direct_IO() to iomap_dio_rw(). Rename btrfs_get_blocks_direct() to btrfs_dio_iomap_begin() and use it as iomap_begin() for iomap direct I/O functions. This function allocates and locks all the blocks required for the I/O. btrfs_submit_direct() is used as the submit_io() hook for direct I/O ops.

Since we need direct I/O reads to go through iomap_dio_rw(), we change file_operations.read_iter() to a btrfs_file_read_iter() which calls btrfs_direct_IO() for direct reads and falls back to generic_file_buffered_read() for incomplete reads and buffered reads.

We don't need address_space.direct_IO() anymore: set it to noop. Similarly, we don't need the flags used in __blockdev_direct_IO(). iomap is capable of direct I/O reads from a hole, so we don't need to return -ENOENT.

Btrfs direct I/O is now done under i_rwsem, shared in case of reads and exclusive in case of writes. This guards against simultaneous truncates.

Use iomap->iomap_end() to check for failed or incomplete direct I/O: - for writes, call __endio_write_update_ordered() - for reads, unlock extents

btrfs_dio_data is now hooked in iomap->private and not current->journal_info. It carries the reservation variable and the amount of data submitted, so we can calculate the amount of data to call __endio_write_update_ordered in case of an error.

This patch removes the last use of struct buffer_head from btrfs.
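For illustration, the new wiring boils down to registering btrfs callbacks with iomap and letting iomap_dio_rw() drive the I/O (simplified from the diff below):

	static const struct iomap_ops btrfs_dio_iomap_ops = {
		.iomap_begin = btrfs_dio_iomap_begin,
		.iomap_end = btrfs_dio_iomap_end,
	};

	static const struct iomap_dio_ops btrfs_dio_ops = {
		.submit_io = btrfs_submit_direct,
	};

	ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			   is_sync_kiocb(iocb));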
Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 1 + fs/btrfs/ctree.h | 1 + fs/btrfs/file.c | 21 ++- fs/btrfs/inode.c | 325 ++++++++++++++++++++++------------------------- 4 files changed, 176 insertions(+), 172 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 575636f6491e..68b95ad82126 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -14,6 +14,7 @@ config BTRFS_FS select LZO_DECOMPRESS select ZSTD_COMPRESS select ZSTD_DECOMPRESS + select FS_IOMAP select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f2d4c14fc273..eb7adc069926 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3020,6 +3020,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b859d5ae7f80..b62679382799 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1870,7 +1870,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from); + written = btrfs_direct_IO(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -3568,9 +3568,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } +static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) { + struct inode *inode = file_inode(iocb->ki_filp); + + inode_lock_shared(inode); + ret = btrfs_direct_IO(iocb, to); + inode_unlock_shared(inode); + if (ret < 0) + return ret; + } + + return generic_file_buffered_read(iocb, to, ret); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = btrfs_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e8a35db53369..4f0b1dbd3240 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include "misc.h" #include "ctree.h" @@ -59,9 +59,9 @@ struct btrfs_iget_args { struct btrfs_dio_data { u64 reserve; - u64 unsubmitted_oe_range_start; - u64 unsubmitted_oe_range_end; - int overwrite; + loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -7103,7 +7103,7 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, int writing) + struct extent_state **cached_state, bool writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7241,30 +7241,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, } -static int btrfs_get_blocks_direct_read(struct extent_map *em, - struct buffer_head *bh_result, - struct inode *inode, - u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - if 
(em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return -ENOENT; - - len = min(len, em->len - (start - em->start)); - - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - return 0; -} - static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) @@ -7325,7 +7302,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } /* this will cow the extent */ - len = bh_result->b_size; free_extent_map(em); *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { @@ -7336,64 +7312,76 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); skip_cow: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); - /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ - if (!dio_data->overwrite && start + len > i_size_read(inode)) + if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; out: return ret; } -static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; - u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - u64 len = bh_result->b_size; + const bool write = !!(flags & IOMAP_WRITE); int ret = 0; + u64 len = length; + bool unlock_extents = false; - if (!create) + if (!write) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - if (current->journal_info) { - /* - * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transaction doesn't get - * confused. - */ - dio_data = current->journal_info; - current->journal_info = NULL; + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so we + * need to flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk. 
+ */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; } + dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); + if (!dio_data) + return -ENOMEM; + + dio_data->length = length; + if (write) { + dio_data->reserve = round_up(length, fs_info->sectorsize); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, dio_data->reserve); + if (ret) { + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + return ret; + } + } + iomap->private = dio_data; + + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, - create)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { ret = -ENOTBLK; goto err; } @@ -7425,36 +7413,48 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto unlock_err; } - if (create) { - ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, - dio_data, start, len); + len = min(len, em->len - (start - em->start)); + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, len); if (ret < 0) goto unlock_err; - - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); } else { - ret = btrfs_get_blocks_direct_read(em, bh_result, inode, - start, len); - /* Can be negative only if we read from a hole */ - if (ret < 0) { - ret = 0; - free_extent_map(em); - goto unlock_err; - } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + bh_result->b_size; - if (lockstart < lockend) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); - } else { - free_extent_state(cached_state); - } + lockstart = start + len; + if (lockstart < lockend) + unlock_extents = true; } + if (unlock_extents) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. 
+ */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; + free_extent_map(em); return 0; @@ -7463,8 +7463,55 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, start, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } + return ret; +} + +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(BTRFS_I(inode), pos, + length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } + + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, pos, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } +out: + kfree(dio_data); + iomap->private = NULL; + return ret; } @@ -7488,7 +7535,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) dip->logical_offset + dip->bytes - 1); } - dio_end_io(dip->dio_bio); + bio_endio(dip->dio_bio); kfree(dip); } @@ -7722,24 +7769,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); - - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - /* - * Setting range start and end to the same value means that - * no cleanup will happen in btrfs_direct_IO - */ - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } return dip; } -static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) +static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); @@ -7756,6 +7790,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, int ret; blk_status_t status; struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7764,8 +7799,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, file_offset + dio_bio->bi_iter.bi_size - 1); } dio_bio->bi_status = BLK_STS_RESOURCE; - dio_end_io(dio_bio); - return; + 
bio_endio(dio_bio); + return BLK_QC_T_NONE; } if (!write && csum) { @@ -7836,15 +7871,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; } while (submit_len > 0); - return; + return BLK_QC_T_NONE; out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -7880,37 +7917,30 @@ out: return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_submit_direct, +}; + +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; - int flags = 0; - bool wakeup = true; bool relock = false; ssize_t ret; if (check_direct_IO(fs_info, iter, offset)) return 0; - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. - */ count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need update @@ -7918,66 +7948,21 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. 
- */ - dio_data.reserve = round_up(count, fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; } - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + is_sync_kiocb(iocb)); + + if (ret == -ENOTBLK) + ret = 0; + + if (iov_iter_rw(iter) == WRITE) up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, offset, dio_data.reserve, - true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. - */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(BTRFS_I(inode), - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } -out: - if (wakeup) - inode_dio_end(inode); + if (relock) inode_lock(inode); @@ -10209,7 +10194,7 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = btrfs_direct_IO, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION

From 0eb79294dbe328debae67961df28113141825d7b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Sep 2020 11:16:51 -0400 Subject: [PATCH 042/161] btrfs: dio iomap DSYNC workaround

iomap dio will run generic_write_sync() for us if the iocb is DSYNC. This is problematic for us for 2 reasons: 1. we hold the inode_lock() during this operation, and we take it in generic_write_sync() 2. we hold a read lock on the dio_sem but take the write lock in fsync

Since we don't want to rip out this code right now, and reworking the locking is a bit much to do at this point, work around this problem with this masterpiece of a patch.

First, we clear DSYNC on the iocb so that the iomap stuff doesn't know that it needs to handle the sync. We save this fact in current->journal_info, because we need to do special things once we're in iomap_begin, and we have no way to pass private information into iomap_dio_rw().

Next we specify a separate iomap_dio_ops for sync, which implements an ->end_io() callback that gets called when the dio completes. This is important for AIO, because we really do need to run generic_write_sync() if we complete asynchronously. However, if we're still in the submitting context when we enter ->end_io(), we clear the flag so that the submitter knows they're the one that needs to run generic_write_sync().

This is meant to be temporary.
We need to work out how to eliminate the inode_lock() and the dio_sem in our fsync and use another mechanism to protect these operations. Tested-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 33 ++++++++++++++++++++++ fs/btrfs/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/transaction.h | 1 + 3 files changed, 94 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b62679382799..4bf15846efdc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2023,7 +2023,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { + /* + * 1. We must always clear IOCB_DSYNC in order to not deadlock + * in iomap, as it calls generic_write_sync() in this case. + * 2. If we are async, we can call iomap_dio_complete() either + * in + * + * 2.1. A worker thread from the last bio completed. In this + * case we need to mark the btrfs_dio_data that it is + * async in order to call generic_write_sync() properly. + * This is handled by setting BTRFS_DIO_SYNC_STUB in the + * current->journal_info. + * 2.2 The submitter context, because all IO completed + * before we exited iomap_dio_rw(). In this case we can + * just re-set the IOCB_DSYNC on the iocb and we'll do + * the sync below. If our ->end_io() gets called and + * current->journal_info is set, then we know we're in + * our current context and we will clear + * current->journal_info to indicate that we need to + * sync below. + */ + if (sync) { + ASSERT(current->journal_info == NULL); + iocb->ki_flags &= ~IOCB_DSYNC; + current->journal_info = BTRFS_DIO_SYNC_STUB; + } num_written = __btrfs_direct_write(iocb, from); + + /* + * As stated above, we cleared journal_info, so we need to do + * the sync ourselves. + */ + if (sync && current->journal_info == NULL) + iocb->ki_flags |= IOCB_DSYNC; + current->journal_info = NULL; } else { num_written = btrfs_buffered_write(iocb, from); if (num_written > 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4f0b1dbd3240..366d5b768f86 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,6 +62,7 @@ struct btrfs_dio_data { loff_t length; ssize_t submitted; struct extent_changeset *data_reserved; + bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -7337,6 +7338,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, int ret = 0; u64 len = length; bool unlock_extents = false; + bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); + + /* + * We used current->journal_info here to see if we were sync, but + * there's a lot of tests in the enospc machinery to not do flushing if + * we have a journal_info set, so we need to clear this out and re-set + * it in iomap_end. + */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_DIO_SYNC_STUB); + current->journal_info = NULL; if (!write) len = min_t(u64, len, fs_info->sectorsize); @@ -7362,6 +7374,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (!dio_data) return -ENOMEM; + dio_data->sync = sync; dio_data->length = length; if (write) { dio_data->reserve = round_up(length, fs_info->sectorsize); @@ -7509,6 +7522,14 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, extent_changeset_free(dio_data->data_reserved); } out: + /* + * We're all done, we can re-set the current->journal_info now safely + * for our endio. 
+ */ + if (dio_data->sync) { + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_DIO_SYNC_STUB; + } kfree(dio_data); iomap->private = NULL; @@ -7917,6 +7938,30 @@ out: return retval; } +static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned flags) +{ + /* + * Now if we're still in the context of our submitter we know we can't + * safely run generic_write_sync(), so clear our flag here so that the + * caller knows to follow up with a sync. + */ + if (current->journal_info == BTRFS_DIO_SYNC_STUB) { + current->journal_info = NULL; + return error; + } + + if (error) + return error; + + if (size) { + iocb->ki_flags |= IOCB_DSYNC; + return generic_write_sync(iocb, size); + } + + return 0; +} + static const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, @@ -7926,6 +7971,11 @@ static const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, }; +static const struct iomap_dio_ops btrfs_sync_dops = { + .submit_io = btrfs_submit_direct, + .end_io = btrfs_maybe_fsync_end_io, +}; + ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -7954,8 +8004,16 @@ ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) down_read(&BTRFS_I(inode)->dio_sem); } - ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - is_sync_kiocb(iocb)); + /* + * If we are actually a sync iocb, we need our fancy endio to know + * if we need to sync. + */ + if (current->journal_info) + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_sync_dops, is_sync_kiocb(iocb)); + else + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8241c050ba71..858d9153a1cd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -112,6 +112,7 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define BTRFS_SEND_TRANS_STUB ((void *)1) +#define BTRFS_DIO_SYNC_STUB ((void *)2) struct btrfs_trans_handle { u64 transid; From 1028d1c48b95c37336bed24a1dc1594e70aa67c6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Aug 2020 12:58:46 +0300 Subject: [PATCH 043/161] btrfs: remove err variable from btrfs_get_extent There's no practical reason to use 'err' as a variable to convey errors. In fact its value is either set explicitly at the beginning of the function or it simply takes the value of 'ret'. Not conforming to the usual pattern of having ret be the only variable used to convey errors makes the code more error prone. In fact one such bug was introduced by 6bf9e4bd6a27 ("btrfs: inode: Verify inode mode to avoid NULL pointer dereference") by assigning the error value to 'ret' and not 'err'. Let's fix that issue and make the function less tricky by leaving only ret to convey error values.
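To make the bug class concrete, here is a minimal standalone sketch of the two patterns (plain C with invented names, not btrfs code); the two-variable shape is what let the bug fixed by 6bf9e4bd6a27 slip through:

  #include <errno.h>

  /* Stand-in for a search helper that can fail. */
  static int lookup_item(int key)
  {
          return key < 0 ? -EINVAL : 0;
  }

  /* Anti-pattern: two error variables. Any path that forgets the
   * err = ret hop silently drops the error. */
  static int get_item_two_vars(int key)
  {
          int err = 0;
          int ret;

          ret = lookup_item(key);
          if (ret < 0)
                  err = ret;
          return err;
  }

  /* Pattern after the patch: one variable conveys all errors. */
  static int get_item_one_var(int key)
  {
          int ret;

          ret = lookup_item(key);
          if (ret < 0)
                  goto out;
          ret = 0;
  out:
          return ret;
  }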
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 366d5b768f86..a50a40f8bef2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6534,8 +6534,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - int err = 0; + int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -6563,7 +6562,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, } em = alloc_extent_map(); if (!em) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; @@ -6573,7 +6572,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -6588,12 +6587,12 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { - err = ret; goto out; } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; + ret = 0; } leaf = path->nodes[0]; @@ -6619,7 +6618,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); @@ -6637,12 +6636,11 @@ next: path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } else if (ret > 0) { + else if (ret > 0) goto not_found; - } + leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -6693,10 +6691,8 @@ next: BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, extent_offset, item); - if (ret) { - err = ret; + if (ret) goto out; - } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6720,27 +6716,27 @@ not_found: em->len = len; em->block_start = EXTENT_MAP_HOLE; insert: + ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); - err = -EIO; + ret = -EIO; goto out; } - err = 0; write_lock(&em_tree->lock); - err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - if (err) { + if (ret) { free_extent_map(em); - return ERR_PTR(err); + return ERR_PTR(ret); } return em; } From dc0ab488d2cb514e08276cbbc846b2ebb6056c2a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Jul 2020 13:48:46 +0300 Subject: [PATCH 044/161] btrfs: factor out reada loop in __reada_start_machine This is in preparation for moving fs_devices to proper lists. 
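The shape of this refactoring, as an illustrative standalone C sketch (invented types; the real code walks btrfs_fs_devices and its seed chain):

  struct dev_group {
          struct dev_group *next; /* stand-in for the seed chain */
          int pending;            /* work not yet enqueued */
  };

  /* Factored-out helper: drive a single group up to a global cap. */
  static int start_for_group(struct dev_group *grp)
  {
          int enqueued, total = 0;

          do {
                  enqueued = grp->pending > 8 ? 8 : grp->pending;
                  grp->pending -= enqueued;
                  total += enqueued;
          } while (enqueued && total < 10000);

          return total;
  }

  /* The caller iterates groups instead of open-coding a goto loop. */
  static int start_all(struct dev_group *main_grp)
  {
          struct dev_group *g;
          int total = start_for_group(main_grp);

          for (g = main_grp->next; g; g = g->next)
                  total += start_for_group(g);

          return total;
  }

Once the per-group loop is a helper, replacing the hand-rolled 'goto again' chain with a proper list walk in the follow-up patches is mechanical.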
Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/reada.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 243a2e44526e..a6bfe7ec14cb 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -767,15 +767,13 @@ static void reada_start_machine_worker(struct btrfs_work *work) kfree(rmw); } -static void __reada_start_machine(struct btrfs_fs_info *fs_info) +/* Try to start up to 10k READA requests for a group of devices */ +static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 enqueued; u64 total = 0; - int i; + struct btrfs_device *device; -again: do { enqueued = 0; mutex_lock(&fs_devices->device_list_mutex); @@ -787,6 +785,18 @@ again: mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); + + return total; +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + int i; + u64 enqueued = 0; + +again: + enqueued += reada_start_for_fsdevs(fs_devices); if (fs_devices->seed) { fs_devices = fs_devices->seed; goto again; From 3712ccb7f1cccd9371efdaaac4775aa79bdf5f40 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 16 Jul 2020 10:17:04 +0300 Subject: [PATCH 045/161] btrfs: factor out loop logic from btrfs_free_extra_devids This prepares the code to switching seeds devices to a proper list. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a028a64b4add..309b575613fe 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1036,28 +1036,21 @@ error: return ERR_PTR(ret); } -/* - * After we have read the system tree and know devids belonging to - * this filesystem, remove the device which does not belong there. - */ -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, + int step, struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; - struct btrfs_device *latest_dev = NULL; - mutex_lock(&uuid_mutex); -again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state) && + &device->dev_state) && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) && - (!latest_dev || - device->generation > latest_dev->generation)) { - latest_dev = device; + (!*latest_dev || + device->generation > (*latest_dev)->generation)) { + *latest_dev = device; } continue; } @@ -1095,6 +1088,19 @@ again: btrfs_free_device(device); } +} + +/* + * After we have read the system tree and know devids belonging to this + * filesystem, remove the device which does not belong there. 
+ */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +{ + struct btrfs_device *latest_dev = NULL; + + mutex_lock(&uuid_mutex); +again: + __btrfs_free_extra_devids(fs_devices, step, &latest_dev); if (fs_devices->seed) { fs_devices = fs_devices->seed; goto again; From 54eed6ae8d8e9df81202da64f0817d8e40d5db0e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Jul 2020 13:48:48 +0300 Subject: [PATCH 046/161] btrfs: make close_fs_devices return void The return value of this function conveys absolutely no information. All callers already check the state of fs_devices->opened to decide how to proceed. So convert the function to returning void. While at it make btrfs_close_devices also return void. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 ++++-------- fs/btrfs/volumes.h | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 309b575613fe..e94a0eeeeb52 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1156,12 +1156,12 @@ static void btrfs_close_one_device(struct btrfs_device *device) ASSERT(atomic_read(&device->reada_in_flight) == 0); } -static int close_fs_devices(struct btrfs_fs_devices *fs_devices) +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; if (--fs_devices->opened > 0) - return 0; + return; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { @@ -1173,17 +1173,14 @@ static int close_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; - - return 0; } -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_fs_devices *seed_devices = NULL; - int ret; mutex_lock(&uuid_mutex); - ret = close_fs_devices(fs_devices); + close_fs_devices(fs_devices); if (!fs_devices->opened) { seed_devices = fs_devices->seed; fs_devices->seed = NULL; @@ -1196,7 +1193,6 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) close_fs_devices(fs_devices); free_fs_devices(fs_devices); } - return ret; } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 302c9234f7d0..d0e93e01d60b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); int btrfs_forget_devices(const char *path); -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); From c4989c2fd0eba6e164e9a29c4a865e57dd644451 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Jul 2020 13:48:49 +0300 Subject: [PATCH 047/161] btrfs: simplify setting/clearing fs_info to btrfs_fs_devices It makes no sense to have sysfs-related routines be responsible for properly initialising the fs_info pointer of struct btrfs_fs_device. Instead this can be streamlined by making it the responsibility of btrfs_init_devices_late to initialize it. 
That function already initializes fs_info of every individual device in btrfs_fs_devices. As far as clearing it is concerned, it makes sense to move it to close_fs_devices. That function is only called when struct btrfs_fs_devices is no longer in use - either for holding seeds or main devices for a mounted filesystem. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 4 ---- fs/btrfs/volumes.c | 20 ++------------------ fs/btrfs/volumes.h | 2 -- 3 files changed, 2 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 81496ac43d09..438a367371c4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -939,8 +939,6 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; - btrfs_reset_fs_info_ptr(fs_info); - sysfs_remove_link(fsid_kobj, "bdi"); if (fs_info->space_info_kobj) { @@ -1404,8 +1402,6 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - btrfs_set_fs_info_ptr(fs_info); - error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); if (error) return error; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e94a0eeeeb52..91a413b73b64 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1173,6 +1173,7 @@ static void close_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; + fs_devices->fs_info = NULL; } void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) @@ -7201,6 +7202,7 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) device->fs_info = fs_info; mutex_unlock(&fs_devices->device_list_mutex); + fs_devices->fs_info = fs_info; fs_devices = fs_devices->seed; } } @@ -7499,24 +7501,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) mutex_unlock(&trans->fs_info->chunk_mutex); } -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; - } -} - -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = NULL; - fs_devices = fs_devices->seed; - } -} - /* * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d0e93e01d60b..524aa5eb5667 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -569,8 +569,6 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, From 944d3f9fac61e24e13a056b25974df3831994f29 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 16 Jul 2020 10:25:33 +0300 Subject: [PATCH 048/161] btrfs: switch seed device to list api While this patch touches a bunch of files, the conversion is straightforward.
Instead of using the implicit linked list anchored at btrfs_fs_devices::seed the code is switched to using list_for_each_entry. Previous patches in the series already factored out code that processed both main and seed devices so in those cases the factored out functions are called on the main fs_devices and then on every seed dev inside list_for_each_entry. Using list api also allows to simplify deletion from the seed dev list performed in btrfs_rm_device and btrfs_rm_dev_replace_free_srcdev by substituting a while() loop with a simple list_del_init. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 39 ++++++++--------- fs/btrfs/reada.c | 9 ++-- fs/btrfs/volumes.c | 105 +++++++++++++++++++++------------------------ fs/btrfs/volumes.h | 2 +- 4 files changed, 71 insertions(+), 84 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a496219baa39..84799ef76758 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -545,33 +545,30 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - int ret = 1; + u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); - while (fs_devices) { - u8 *metadata_uuid; + /* + * Checking the incompat flag is only valid for the current fs. For + * seed devices it's forbidden to have their uuid changed so reading + * ->fsid in this case is fine + */ + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; - /* - * Checking the incompat flag is only valid for the current - * fs. 
For seed devices it's forbidden to have their uuid - * changed so reading ->fsid in this case is fine - */ - if (fs_devices == fs_info->fs_devices && - btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + return 0; - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { - ret = 0; - break; - } - fs_devices = fs_devices->seed; - } - return ret; + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) + return 0; + + return 1; } static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a6bfe7ec14cb..e20972230823 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -791,16 +791,13 @@ static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) static void __reada_start_machine(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; int i; u64 enqueued = 0; -again: enqueued += reada_start_for_fsdevs(fs_devices); - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + enqueued += reada_start_for_fsdevs(seed_devs); if (enqueued == 0) return; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 91a413b73b64..2d7c845b8e67 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -356,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); + INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); @@ -1097,14 +1098,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *latest_dev = NULL; + struct btrfs_fs_devices *seed_dev; mutex_lock(&uuid_mutex); -again: __btrfs_free_extra_devids(fs_devices, step, &latest_dev); - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } + + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) + __btrfs_free_extra_devids(seed_dev, step, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; @@ -1178,20 +1178,18 @@ static void close_fs_devices(struct btrfs_fs_devices *fs_devices) void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_devices *seed_devices = NULL; + LIST_HEAD(list); + struct btrfs_fs_devices *tmp; mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) { - seed_devices = fs_devices->seed; - fs_devices->seed = NULL; - } + if (!fs_devices->opened) + list_splice_init(&fs_devices->seed_list, &list); mutex_unlock(&uuid_mutex); - while (seed_devices) { - fs_devices = seed_devices; - seed_devices = fs_devices->seed; + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); + list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } } @@ -2169,14 +2167,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, btrfs_free_device(device); if (cur_devices->open_devices == 0) { - while (fs_devices) { - if (fs_devices->seed == cur_devices) { - fs_devices->seed = cur_devices->seed; - break; - } - fs_devices = fs_devices->seed; - } - cur_devices->seed = 
NULL; + list_del_init(&cur_devices->seed_list); close_fs_devices(cur_devices); free_fs_devices(cur_devices); } @@ -2236,8 +2227,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { - struct btrfs_fs_devices *tmp_fs_devices; - /* * On a mounted FS, num_devices can't be zero unless it's a * seed. In case of a seed device being replaced, the replace @@ -2246,15 +2235,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) */ ASSERT(fs_devices->seeding); - tmp_fs_devices = fs_info->fs_devices; - while (tmp_fs_devices) { - if (tmp_fs_devices->seed == fs_devices) { - tmp_fs_devices->seed = fs_devices->seed; - break; - } - tmp_fs_devices = tmp_fs_devices->seed; - } - fs_devices->seed = NULL; + list_del_init(&fs_devices->seed_list); close_fs_devices(fs_devices); free_fs_devices(fs_devices); } @@ -2409,7 +2390,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) fs_devices->open_devices = 0; fs_devices->missing_devices = 0; fs_devices->rotating = false; - fs_devices->seed = seed_devices; + list_add(&seed_devices->seed_list, &fs_devices->seed_list); generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -6465,11 +6446,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, bool seed) { struct btrfs_device *device; + struct btrfs_fs_devices *seed_devs; - while (fs_devices) { + if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->devid == devid && + (!uuid || memcmp(device->uuid, uuid, + BTRFS_UUID_SIZE) == 0)) + return device; + } + } + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!fsid || - !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &fs_devices->devices, + !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { if (device->devid == devid && (!uuid || memcmp(device->uuid, uuid, @@ -6477,11 +6468,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, return device; } } - if (seed) - fs_devices = fs_devices->seed; - else - return NULL; } + return NULL; } @@ -6732,13 +6720,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, lockdep_assert_held(&uuid_mutex); ASSERT(fsid); - fs_devices = fs_info->fs_devices->seed; - while (fs_devices) { + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; - fs_devices = fs_devices->seed; - } fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { @@ -6772,8 +6757,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, goto out; } - fs_devices->seed = fs_info->fs_devices->seed; - fs_info->fs_devices->seed = fs_devices; + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); out: return fs_devices; } @@ -7193,17 +7177,23 @@ error: void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; - while (fs_devices) { - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - device->fs_info = fs_info; - 
mutex_unlock(&fs_devices->device_list_mutex); + fs_devices->fs_info = fs_info; - fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->fs_info = fs_info; + mutex_unlock(&fs_devices->device_list_mutex); + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + mutex_lock(&seed_devs->device_list_mutex); + list_for_each_entry(device, &seed_devs->devices, dev_list) + device->fs_info = fs_info; + mutex_unlock(&seed_devs->device_list_mutex); + + seed_devs->fs_info = fs_info; } } @@ -7581,8 +7571,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* It's possible this device is a dummy for seed device */ if (dev->disk_total_bytes == 0) { - dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL, - NULL, false); + struct btrfs_fs_devices *devs; + + devs = list_first_entry(&fs_info->fs_devices->seed_list, + struct btrfs_fs_devices, seed_list); + dev = btrfs_find_device(devs, devid, NULL, NULL, false); if (!dev) { btrfs_err(fs_info, "failed to find seed devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 524aa5eb5667..48bdca01e237 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -246,7 +246,7 @@ struct btrfs_fs_devices { */ struct list_head alloc_list; - struct btrfs_fs_devices *seed; + struct list_head seed_list; bool seeding; int opened; From 427c8fddb1296dd013d8af5a4957d9d88fb8ab4e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 12 Aug 2020 17:04:36 +0300 Subject: [PATCH 049/161] btrfs: document some invariants of seed code Without good understanding of how seed devices works it's hard to grok some of what the code in open_seed_devices or btrfs_prepare_sprout does. Add comments hopefully reducing some of the cognitive load. Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2d7c845b8e67..7a39f3aebc53 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2357,10 +2357,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; + /* + * Private copy of the seed devices, anchored at + * fs_info->fs_devices->seed_list + */ seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); + /* + * It's necessary to retain a copy of the original seed fs_devices in + * fs_uuids so that filesystems which have been seeded can successfully + * reference the seed device from open_seed_devices. This also supports + * multiple fs seed. 
+ */ old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); @@ -6720,6 +6730,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, lockdep_assert_held(&uuid_mutex); ASSERT(fsid); + /* This will match only for multi-device seed fs */ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; @@ -6739,6 +6750,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, return fs_devices; } + /* + * Upon first call for a seed fs fsid, just create a private copy of the + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list + */ fs_devices = clone_fs_devices(fs_devices); if (IS_ERR(fs_devices)) return fs_devices; From 68abf360160ca085bb5109d35692ea0f9581f8d1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 12 Aug 2020 16:26:46 +0300 Subject: [PATCH 050/161] btrfs: remove alloc_list splice in btrfs_prepare_sprout btrfs_prepare_sprout is called when the first rw device is added to a seed filesystem. This means the filesystem can't have its alloc_list be non-empty, since seed filesystems are read only. Simply remove the code altogether. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7a39f3aebc53..9800e966ef6e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2391,10 +2391,6 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; - mutex_lock(&fs_info->chunk_mutex); - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - mutex_unlock(&fs_info->chunk_mutex); - fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; From 62cf5391209ac7b258a82316c4d703342863fd37 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:18:27 -0400 Subject: [PATCH 051/161] btrfs: move btrfs_rm_dev_replace_free_srcdev outside of all locks When closing and freeing the source device we could end up doing our final blkdev_put() on the bdev, which will grab the bd_mutex. As such we want to be holding as few locks as possible, so move this call outside of the dev_replace->lock_finishing_cancel_unmount lock. Since we're modifying the fs_devices we need to make sure we're holding the uuid_mutex here, so take that as well. 
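As a hedged illustration of the pattern (userspace C with pthread mutexes standing in for the kernel locks; all names invented): unlink under the narrow lock, then do the final put with no locks held, while a broader outer lock still excludes concurrent modification of the list:

  #include <pthread.h>
  #include <stdlib.h>

  static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

  struct resource {
          struct resource *next;
          void (*final_put)(struct resource *); /* may take other locks */
  };

  static struct resource *head;

  static void release_one(struct resource *res)
  {
          struct resource **pp;

          /* Keep the critical section narrow: only unlink under the lock. */
          pthread_mutex_lock(&list_lock);
          for (pp = &head; *pp; pp = &(*pp)->next) {
                  if (*pp == res) {
                          *pp = res->next;
                          break;
                  }
          }
          pthread_mutex_unlock(&list_lock);

          /*
           * The final put may itself acquire locks (as blkdev_put() takes
           * bd_mutex), so it runs only after the list lock is dropped.
           */
          res->final_put(res);
          free(res);
  }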
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9800e966ef6e..503ccc3a091e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2216,7 +2216,6 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { - struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; mutex_lock(&uuid_mutex); From 425c6ed6486f1e2ed892eadb94fd4829c299493e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 10 Aug 2020 11:42:28 -0400 Subject: [PATCH 052/161] btrfs: do not hold device_list_mutex when closing devices The following lockdep splat ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc7-00169-g87212851a027-dirty #929 Not tainted ------------------------------------------------------ fsstress/8739 is trying to acquire lock: ffff88bfd0eb0c90 (&fs_info->reloc_mutex){+.+.}-{3:3}, at: btrfs_record_root_in_trans+0x43/0x70 but task is already holding lock: ffff88bfbd16e538 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x6a/0x4a0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #10 (sb_pagefaults){.+.+}-{0:0}: __sb_start_write+0x129/0x210 btrfs_page_mkwrite+0x6a/0x4a0 do_page_mkwrite+0x4d/0xc0 handle_mm_fault+0x103c/0x1730 exc_page_fault+0x340/0x660 asm_exc_page_fault+0x1e/0x30 -> #9 (&mm->mmap_lock#2){++++}-{3:3}: __might_fault+0x68/0x90 _copy_to_user+0x1e/0x80 perf_read+0x141/0x2c0 vfs_read+0xad/0x1b0 ksys_read+0x5f/0xe0 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #8 (&cpuctx_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 perf_event_init_cpu+0x88/0x150 perf_event_init+0x1db/0x20b start_kernel+0x3ae/0x53c secondary_startup_64+0xa4/0xb0 -> #7 (pmus_lock){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 perf_event_init_cpu+0x4f/0x150 cpuhp_invoke_callback+0xb1/0x900 _cpu_up.constprop.26+0x9f/0x130 cpu_up+0x7b/0xc0 bringup_nonboot_cpus+0x4f/0x60 smp_init+0x26/0x71 kernel_init_freeable+0x110/0x258 kernel_init+0xa/0x103 ret_from_fork+0x1f/0x30 -> #6 (cpu_hotplug_lock){++++}-{0:0}: cpus_read_lock+0x39/0xb0 kmem_cache_create_usercopy+0x28/0x230 kmem_cache_create+0x12/0x20 bioset_init+0x15e/0x2b0 init_bio+0xa3/0xaa do_one_initcall+0x5a/0x2e0 kernel_init_freeable+0x1f4/0x258 kernel_init+0xa/0x103 ret_from_fork+0x1f/0x30 -> #5 (bio_slab_lock){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 bioset_init+0xbc/0x2b0 __blk_alloc_queue+0x6f/0x2d0 blk_mq_init_queue_data+0x1b/0x70 loop_add+0x110/0x290 [loop] fq_codel_tcf_block+0x12/0x20 [sch_fq_codel] do_one_initcall+0x5a/0x2e0 do_init_module+0x5a/0x220 load_module+0x2459/0x26e0 __do_sys_finit_module+0xba/0xe0 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #4 (loop_ctl_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 lo_open+0x18/0x50 [loop] __blkdev_get+0xec/0x570 blkdev_get+0xe8/0x150 do_dentry_open+0x167/0x410 path_openat+0x7c9/0xa80 do_filp_open+0x93/0x100 do_sys_openat2+0x22a/0x2e0 do_sys_open+0x4b/0x80 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #3 (&bdev->bd_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 blkdev_put+0x1d/0x120 close_fs_devices.part.31+0x84/0x130 btrfs_close_devices+0x44/0xb0 close_ctree+0x296/0x2b2 generic_shutdown_super+0x69/0x100 kill_anon_super+0xe/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 
task_work_run+0x6d/0xb0 __prepare_exit_to_usermode+0x1cc/0x1e0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 btrfs_run_dev_stats+0x49/0x480 commit_cowonly_roots+0xb5/0x2a0 btrfs_commit_transaction+0x516/0xa60 sync_filesystem+0x6b/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0xe/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x6d/0xb0 __prepare_exit_to_usermode+0x1cc/0x1e0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->tree_log_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 btrfs_commit_transaction+0x4bb/0xa60 sync_filesystem+0x6b/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0xe/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x6d/0xb0 __prepare_exit_to_usermode+0x1cc/0x1e0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_info->reloc_mutex){+.+.}-{3:3}: __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 __mutex_lock+0x9f/0x930 btrfs_record_root_in_trans+0x43/0x70 start_transaction+0xd1/0x5d0 btrfs_dirty_inode+0x42/0xd0 file_update_time+0xc8/0x110 btrfs_page_mkwrite+0x10c/0x4a0 do_page_mkwrite+0x4d/0xc0 handle_mm_fault+0x103c/0x1730 exc_page_fault+0x340/0x660 asm_exc_page_fault+0x1e/0x30 other info that might help us debug this: Chain exists of: &fs_info->reloc_mutex --> &mm->mmap_lock#2 --> sb_pagefaults Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_pagefaults); lock(&mm->mmap_lock#2); lock(sb_pagefaults); lock(&fs_info->reloc_mutex); *** DEADLOCK *** 3 locks held by fsstress/8739: #0: ffff88bee66eeb68 (&mm->mmap_lock#2){++++}-{3:3}, at: exc_page_fault+0x173/0x660 #1: ffff88bfbd16e538 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x6a/0x4a0 #2: ffff88bfbd16e630 (sb_internal){.+.+}-{0:0}, at: start_transaction+0x3da/0x5d0 stack backtrace: CPU: 17 PID: 8739 Comm: fsstress Kdump: loaded Not tainted 5.8.0-rc7-00169-g87212851a027-dirty #929 Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018 Call Trace: dump_stack+0x78/0xa0 check_noncircular+0x165/0x180 __lock_acquire+0x1272/0x2310 ? btrfs_get_alloc_profile+0x150/0x210 lock_acquire+0x9e/0x360 ? btrfs_record_root_in_trans+0x43/0x70 __mutex_lock+0x9f/0x930 ? btrfs_record_root_in_trans+0x43/0x70 ? lock_acquire+0x9e/0x360 ? join_transaction+0x5d/0x450 ? find_held_lock+0x2d/0x90 ? btrfs_record_root_in_trans+0x43/0x70 ? join_transaction+0x3d5/0x450 ? btrfs_record_root_in_trans+0x43/0x70 btrfs_record_root_in_trans+0x43/0x70 start_transaction+0xd1/0x5d0 btrfs_dirty_inode+0x42/0xd0 file_update_time+0xc8/0x110 btrfs_page_mkwrite+0x10c/0x4a0 ? handle_mm_fault+0x5e/0x1730 do_page_mkwrite+0x4d/0xc0 ? __do_fault+0x32/0x150 handle_mm_fault+0x103c/0x1730 exc_page_fault+0x340/0x660 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x7faa6c9969c4 was seen in testing. The fix is similar to that of "btrfs: open device without device_list_mutex", where we're holding the device_list_mutex and then grab the bd_mutex, which pulls in a bunch of dependencies under the bd_mutex. We only ever call btrfs_close_devices() on mount failure or unmount, so we're safe to not hold the device_list_mutex here. We're already holding the uuid_mutex which keeps us safe from any external modification of the fs_devices.
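Reduced to essentials, the fix prunes a lock nesting rather than reordering anything; a minimal sketch (invented names, pthread mutexes in place of uuid_mutex and device_list_mutex):

  #include <pthread.h>

  static pthread_mutex_t uuid_lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_mutex_t devlist_lock = PTHREAD_MUTEX_INITIALIZER;

  /*
   * Before: devlist_lock nests under uuid_lock on the close path, and the
   * final put below it reaches for more locks (bd_mutex in the real chain),
   * creating the dependency edges lockdep complained about.
   */
  static void close_devices_old(void)
  {
          pthread_mutex_lock(&uuid_lock);
          pthread_mutex_lock(&devlist_lock);
          /* ... blkdev_put() analogue, taking further locks ... */
          pthread_mutex_unlock(&devlist_lock);
          pthread_mutex_unlock(&uuid_lock);
  }

  /*
   * After: uuid_lock alone already excludes concurrent modification on
   * this path (mount failure or unmount), so the inner lock and its whole
   * dependency chain are simply never taken here.
   */
  static void close_devices_new(void)
  {
          pthread_mutex_lock(&uuid_lock);
          /* ... close devices without devlist_lock ... */
          pthread_mutex_unlock(&uuid_lock);
  }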
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 503ccc3a091e..29c6f0154c62 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1160,14 +1160,13 @@ static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; + lockdep_assert_held(&uuid_mutex); + if (--fs_devices->opened > 0) return; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) btrfs_close_one_device(device); - } - mutex_unlock(&fs_devices->device_list_mutex); WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); @@ -1185,13 +1184,13 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) close_fs_devices(fs_devices); if (!fs_devices->opened) list_splice_init(&fs_devices->seed_list, &list); - mutex_unlock(&uuid_mutex); list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } + mutex_unlock(&uuid_mutex); } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, From b4c5d8fdfff3e2b6c4fa4a5043e8946dff500f8c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 24 Jul 2020 14:46:09 +0800 Subject: [PATCH 053/161] btrfs: qgroup: fix wrong qgroup metadata reserve for delayed inode For the delayed inode facility, qgroup metadata is reserved for it, and later freed. However we're freeing more bytes than we reserved. In btrfs_delayed_inode_reserve_metadata(): num_bytes = btrfs_calc_metadata_size(fs_info, 1); ... ret = btrfs_qgroup_reserve_meta_prealloc(root, fs_info->nodesize, true); ... if (!ret) { node->bytes_reserved = num_bytes; But in btrfs_delayed_inode_release_metadata(): if (qgroup_free) btrfs_qgroup_free_meta_prealloc(node->root, node->bytes_reserved); else btrfs_qgroup_convert_reserved_meta(node->root, node->bytes_reserved); This means we're always releasing more qgroup metadata rsv than we have reserved. This won't trigger a selftest warning, as btrfs qgroup metadata rsv has extra protection against cases like quota enabled half-way. But we still need to fix this problem anyway. This patch uses the same num_bytes for the qgroup metadata rsv so that it is handled correctly.
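The arithmetic of the mismatch can be modeled in a few lines of standalone C (a toy counter, not the btrfs qgroup machinery; the 8x item size is an assumed value for illustration):

  #include <assert.h>

  static long rsv; /* toy stand-in for the qgroup meta prealloc counter */

  static void reserve(long bytes) { rsv += bytes; }
  static void release(long bytes) { rsv -= bytes; }

  int main(void)
  {
          const long nodesize = 16384;
          const long num_bytes = 8 * nodesize; /* assumed size for 1 item */

          /* Buggy pairing: reserve nodesize, but record num_bytes in
           * node->bytes_reserved and later free that larger amount. */
          reserve(nodesize);
          release(num_bytes);
          assert(rsv == nodesize - num_bytes); /* negative: over-release */

          /* Fixed pairing: the same value flows through both calls. */
          rsv = 0;
          reserve(num_bytes);
          release(num_bytes);
          assert(rsv == 0);
          return 0;
  }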
Fixes: f218ea6c4792 ("btrfs: delayed-inode: Remove wrong qgroup meta reservation calls") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bf1595a42a98..0727b10a9a89 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_qgroup_reserve_meta_prealloc(root, - fs_info->nodesize, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); if (ret < 0) return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, From e85fde5162bf1b242cbd6daf7dba0f9b457d592b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 24 Jul 2020 14:46:10 +0800 Subject: [PATCH 054/161] btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations [BUG] When quota is enabled for TEST_DEV, generic/013 sometimes fails like this: generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg) And with the following metadata leak: BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs] Call Trace: btrfs_put_super+0x15/0x17 [btrfs] generic_shutdown_super+0x72/0x110 kill_anon_super+0x18/0x30 btrfs_kill_super+0x17/0x30 [btrfs] deactivate_locked_super+0x3b/0xa0 deactivate_super+0x40/0x50 cleanup_mnt+0x135/0x190 __cleanup_mnt+0x12/0x20 task_work_run+0x64/0xb0 __prepare_exit_to_usermode+0x1bc/0x1c0 __syscall_return_slowpath+0x47/0x230 do_syscall_64+0x64/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace a6cfd45ba80e4e06 ]--- BTRFS error (device dm-3): qgroup reserved space leaked BTRFS info (device dm-3): disk space caching is enabled BTRFS info (device dm-3): has skinny extents [CAUSE] The qgroup preallocated meta rsv operations of that offending root are: btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072 btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072 btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152 btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072 btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072 It's pretty obvious that we reserve a qgroup meta rsv in btrfs_subvolume_reserve_metadata(), but there are no corresponding release/convert calls in btrfs_subvolume_release_metadata(). This leads to the leak. [FIX] To fix this bug, we should follow what we're doing in btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and add it to block_rsv->qgroup_rsv_reserved. And free the qgroup reserved metadata space when releasing the block_rsv. To do this, we need to change btrfs_subvolume_release_metadata() to accept a btrfs_root, record the qgroup_to_release number, and call btrfs_qgroup_convert_reserved_meta() for it.
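The fixed flow described under [FIX] can be sketched as follows (illustrative standalone C with invented types; the real fields live in struct btrfs_block_rsv and the qgroup code):

  struct toy_block_rsv {
          long qgroup_rsv_reserved; /* prealloc bytes tied to this rsv */
  };

  static long meta_prealloc, meta_pertrans;

  /* Reserve path: take qgroup prealloc space and remember it on the rsv. */
  static void subvol_reserve(struct toy_block_rsv *rsv, long qgroup_bytes)
  {
          meta_prealloc += qgroup_bytes;
          rsv->qgroup_rsv_reserved += qgroup_bytes;
  }

  /* Release path: the block rsv reports how much qgroup rsv it carried... */
  static long toy_block_rsv_release(struct toy_block_rsv *rsv)
  {
          long to_release = rsv->qgroup_rsv_reserved;

          rsv->qgroup_rsv_reserved = 0;
          return to_release;
  }

  /* ...and the caller converts it to pertrans instead of leaking it. */
  static void subvol_release(struct toy_block_rsv *rsv)
  {
          long qgroup_to_release = toy_block_rsv_release(rsv);

          meta_prealloc -= qgroup_to_release; /* convert analogue */
          meta_pertrans += qgroup_to_release;
  }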
Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 6 +++--- fs/btrfs/root-tree.c | 13 +++++++++++-- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eb7adc069926..f9d4e0958e2e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2622,7 +2622,7 @@ enum btrfs_flush_state { int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a50a40f8bef2..123521aa5595 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4051,7 +4051,7 @@ out_end_trans: err = ret; inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); if (err) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a5355a16eabb..3779a6c12184 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -618,7 +618,7 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); goto fail_free; } trans->block_rsv = &block_rsv; @@ -742,7 +742,7 @@ fail: kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); err = btrfs_commit_transaction(trans); if (err && !ret) @@ -856,7 +856,7 @@ fail: if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); + btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index c89697486366..702dc5441f03 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret && qgroup_num_bytes) btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + if (!ret) { + spin_lock(&rsv->lock); + rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&rsv->lock); + } return ret; } -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); + struct btrfs_fs_info *fs_info = root->fs_info; + u64 qgroup_to_release; + + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); + btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); } From 217f5004fee678bf238fbbc7e7a01c55df92e732 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 12 Aug 2020 16:16:35 +0300 Subject: [PATCH 055/161] btrfs: rework error detection in init_tree_roots To avoid duplicating 3 lines of code the error 
detection logic in init_tree_roots is somewhat quirky. It first checks for the presence of any error condition, then checks for the specific condition to perform any specific actions. That's spurious because directly checking for each respective error condition and doing the necessary steps is more obvious. While at it change the -EUCLEAN to -EIO in case the extent buffer is not read correctly, this is in line with other sites which return -EIO when the eb couldn't be read. Additionally it results in smaller code and the code reads more linearly: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-95 (-95) Function old new delta open_ctree 17243 17148 -95 Total: Before=113104, After=113009, chg -0.08% Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 84799ef76758..4eb56f00ee81 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2623,18 +2623,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) level = btrfs_super_root_level(sb); tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), generation, level, NULL); - if (IS_ERR(tree_root->node) || - !extent_buffer_uptodate(tree_root->node)) { + if (IS_ERR(tree_root->node)) { handle_error = true; + ret = PTR_ERR(tree_root->node); + tree_root->node = NULL; + btrfs_warn(fs_info, "couldn't read tree root"); + continue; - if (IS_ERR(tree_root->node)) { - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - } else if (!extent_buffer_uptodate(tree_root->node)) { - ret = -EUCLEAN; - } - - btrfs_warn(fs_info, "failed to read tree root"); + } else if (!extent_buffer_uptodate(tree_root->node)) { + handle_error = true; + ret = -EIO; + btrfs_warn(fs_info, "error while reading tree root"); continue; } From f98b6215d7d1cda6ac552b7bc58a6ad092aa517c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Aug 2020 14:35:47 +0800 Subject: [PATCH 056/161] btrfs: extent_io: do extra check for extent buffer read write functions Although we have start, len check for extent buffer reader/write (e.g. read_extent_buffer()), these checks have limitations: - No overflow check Values like start = 1024 len = -1024 can still pass the basic (start + len) > eb->len check. - Checks are not consistent For read_extent_buffer() we only check (start + len) against eb->len. While for memcmp_extent_buffer() we also check start against eb->len. - Different error reporting mechanism We use WARN() in read_extent_buffer() but BUG() in memcpy_extent_buffer(). - Still modify memory if the request is obviously wrong In read_extent_buffer() even we find (start + len) > eb->len, we still call memset(dst, 0, len), which can easily cause memory access error if start + len overflows. To address above problems, this patch creates a new common function to check such access, check_eb_range(). - Add overflow check This function checks start, start + len against eb->len and overflow check. - Unified checks - Unified error reports Will call WARN() if CONFIG_BTRFS_DEBUG is configured. And also do btrfs_warn() message for non-debug build. - Exit ASAP if check fails No more possible memory corruption. 
- Add extra comment for @start @len used in those functions as it's sometimes confused with the logical addressing instead of a range inside the eb space Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202817 [ Inspired by above report, the report itself is already addressed ] Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ use check_add_overflow ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 84 +++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 37 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0320ddb0133e..a28d442d65b5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5622,6 +5622,36 @@ unlock_exit: return ret; } +static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + btrfs_warn(eb->fs_info, + "access to eb bytenr %llu len %lu out of range start %lu len %lu", + eb->start, eb->len, start, len); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + + return true; +} + +/* + * Check if the [start, start + len) range is valid before reading/writing + * the eb. + * NOTE: @start and @len are offset inside the eb, not logical address. + * + * Caller should not touch the dst/src memory if this function returns error. + */ +static inline int check_eb_range(const struct extent_buffer *eb, + unsigned long start, unsigned long len) +{ + unsigned long offset; + + /* start, start + len should not go beyond eb->len nor overflow */ + if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) + return report_eb_range(eb, start, len); + + return false; +} + void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { @@ -5632,12 +5662,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = start >> PAGE_SHIFT; - if (start + len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, len); - memset(dst, 0, len); + if (check_eb_range(eb, start, len)) return; - } offset = offset_in_page(start); @@ -5702,8 +5728,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long i = start >> PAGE_SHIFT; int ret = 0; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return -EINVAL; offset = offset_in_page(start); @@ -5756,8 +5782,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5785,8 +5811,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, char *kaddr; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5830,6 +5856,10 @@ void copy_extent_buffer(const struct extent_buffer *dst, char *kaddr; unsigned long i = dst_offset >> PAGE_SHIFT; + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(src, src_offset, len)) + return; + WARN_ON(src->len != dst_len); offset = offset_in_page(dst_offset); @@ -6019,25 +6049,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - 
struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu dst len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu dst len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; while (len > 0) { dst_off_in_page = offset_in_page(dst_offset); @@ -6064,7 +6084,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; @@ -6073,18 +6092,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; From 1c2a07f598d526e39acbf1837b8d521fc0dab9c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Aug 2020 14:35:48 +0800 Subject: [PATCH 057/161] btrfs: extent-tree: kill BUG_ON() in __btrfs_free_extent() __btrfs_free_extent() is doing two things: 1. Reduce the refs number of an extent backref Either it's an inline extent backref (inside EXTENT/METADATA item) or a keyed extent backref (SHARED_* item). We only need to locate that backref line, then either reduce the count or remove the backref line completely. 2. Update the refs count in EXTENT/METADATA_ITEM During step 1), we will try to locate the EXTENT/METADATA_ITEM without triggering another btrfs_search_slot() as a fast path. Only when we fail to locate that item do we trigger another btrfs_search_slot() to get that EXTENT/METADATA_ITEM after we have updated/deleted the backref line. And we have a lot of strict checks on things like refs_to_drop against extent refs and special case checks for single ref extents. There are 7 BUG_ON()s; although they perform correct checks, they can be triggered by crafted images. This patch improves the function: - Introduce two examples to show what __btrfs_free_extent() is doing: one inline backref case and one keyed case, which should cover most cases - Kill all BUG_ON()s with proper error messages and an optional leaf dump - Add a comment to show the overall flow Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202819 [ The report triggers one BUG_ON() in __btrfs_free_extent() ] Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 160 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 147 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 79c4432e6a3f..40e5b41e91cc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2934,6 +2934,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Drop one or more refs of @node. + * + * 1.
Locate the extent refs. + * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. + * Locate it, then reduce the refs number or remove the ref line completely. + * + * 2. Update the refs count in EXTENT/METADATA_ITEM + * + * Inline backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 2 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * extent data backref root FS_TREE objectid 257 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 257 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get something like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 1 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * + * Keyed backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 754 gen 6 flags DATA + * [...] + * item 2 key (13631488 EXTENT_DATA_REF ) itemoff 3915 itemsize 28 + * extent data backref root FS_TREE objectid 866 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 866 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get something like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 753 gen 6 flags DATA + * + * And that (13631488 EXTENT_DATA_REF ) gets removed. + */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -2966,7 +3025,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); + + if (!is_data && refs_to_drop != 1) { + btrfs_crit(info, +"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", + node->bytenr, refs_to_drop); + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } if (is_data) skinny_metadata = false; @@ -2975,6 +3042,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, owner_objectid, owner_offset); if (ret == 0) { + /* + * Either the inline backref or the SHARED_DATA_REF/ + * SHARED_BLOCK_REF is found + * + * Here is a quick path to locate EXTENT/METADATA_ITEM. + * It's possible the EXTENT/METADATA_ITEM is near current slot.
+ */ extent_slot = path->slots[0]; while (extent_slot >= 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -2991,13 +3065,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + + /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { - BUG_ON(iref); + if (iref) { + btrfs_crit(info, +"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } + /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, path, NULL, refs_to_drop, is_data, &last_ref); @@ -3008,6 +3090,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); path->leave_spinning = 1; + /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; @@ -3082,19 +3165,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + if (item_size < sizeof(*ei) + sizeof(*bi)) { + btrfs_crit(info, +"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu", + key.objectid, key.type, key.offset, + owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_err(info, - "trying to drop %d refs but we only have %Lu for bytenr %Lu", + btrfs_crit(info, + "trying to drop %d refs but we only have %llu for bytenr %llu", refs_to_drop, refs, bytenr); - ret = -EINVAL; - btrfs_abort_transaction(trans, ret); - goto out; + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; } refs -= refs_to_drop; @@ -3106,7 +3196,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - BUG_ON(!found_extent); + if (!found_extent) { + btrfs_crit(info, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { btrfs_set_extent_refs(leaf, ei, refs); btrfs_mark_buffer_dirty(leaf); @@ -3121,13 +3216,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + /* In this branch refs == 1 */ if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(path, iref)); + if (is_data && refs_to_drop != + extent_data_ref_count(path, iref)) { + btrfs_crit(info, + "invalid refs_to_drop, current refs %u refs_to_drop %u", + extent_data_ref_count(path, iref), + refs_to_drop); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } if (iref) { - BUG_ON(path->slots[0] != extent_slot); + if (path->slots[0] != extent_slot) { + btrfs_crit(info, +"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", + key.objectid, key.type, + key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { - BUG_ON(path->slots[0] != extent_slot + 1); + /* + * No inline ref, we must be at SHARED_* item, + * And it's single ref, it must be: + * | extent_slot ||extent_slot + 1| + * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] + */ + if (path->slots[0] != extent_slot + 1) { +
btrfs_crit(info, + "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } path->slots[0] = extent_slot; num_to_del = 2; } @@ -3168,6 +3289,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; +err_dump: + /* + * Leaf dump can take up a lot of log buffer, so we only do full leaf + * dump for debug build. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", + path->slots[0], extent_slot); + btrfs_print_leaf(path->nodes[0]); + } + + btrfs_free_path(path); + return -EUCLEAN; } /* From 07cce5cf3b489419aa8e87f48a55f4e190a30876 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Aug 2020 14:35:49 +0800 Subject: [PATCH 058/161] btrfs: extent-tree: kill the BUG_ON() in insert_inline_extent_backref() [BUG] With a crafted image, btrfs can panic at insert_inline_extent_backref(): kernel BUG at fs/btrfs/extent-tree.c:1857! invalid opcode: 0000 [#1] SMP PTI CPU: 0 PID: 1117 Comm: btrfs-transacti Not tainted 5.0.0-rc8+ #9 RIP: 0010:insert_inline_extent_backref+0xcc/0xe0 RSP: 0018:ffffac4dc1287be8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000007 RCX: 0000000000000001 RDX: 0000000000001000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffac4dc1287c28 R08: ffffac4dc1287ab8 R09: ffffac4dc1287ac0 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff8febef88a540 R14: ffff8febeaa7bc30 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8febf7a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f663ace94c0 CR3: 0000000235698006 CR4: 00000000000206f0 Call Trace: ? _cond_resched+0x1a/0x50 __btrfs_inc_extent_ref.isra.64+0x7e/0x240 ? btrfs_merge_delayed_refs+0xa5/0x330 __btrfs_run_delayed_refs+0x653/0x1120 btrfs_run_delayed_refs+0xdb/0x1b0 btrfs_commit_transaction+0x52/0x950 ? start_transaction+0x94/0x450 transaction_kthread+0x163/0x190 kthread+0x105/0x140 ? btrfs_cleanup_transaction+0x560/0x560 ? kthread_destroy_worker+0x50/0x50 ret_from_fork+0x35/0x40 Modules linked in: ---[ end trace 2ad8b3de903cf825 ]--- [CAUSE] Due to extent tree corruption (still valid by itself, but bad cross ref), we can allocate an extent which is still in the extent tree. The offending tree block in that case is from the csum tree, and the newly allocated tree block is also for the csum tree. We then try to insert a tree block ref which already exists for that tree block. For a tree extent item, a tree block can never be shared directly by the same tree twice. There is a BUG_ON() to catch this, but it is not proper error handling. [FIX] Replace that BUG_ON() with a proper error message, plus a leaf dump for debug builds. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202829 Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 40e5b41e91cc..7b61b7b5122f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1177,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + /* + * We're adding refs to a tree block we already own, this + * should not happen at all.
+ */ + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_crit(trans->fs_info, +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", + bytenr, num_bytes, root_objectid); + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + WARN_ON(1); + btrfs_crit(trans->fs_info, + "path->slots[0]=%d path->nodes[0]:", path->slots[0]); + btrfs_print_leaf(path->nodes[0]); + } + return -EUCLEAN; + } update_inline_extent_backref(path, iref, refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { @@ -1397,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* * __btrfs_inc_extent_ref - insert backreference for a given extent * + * The counterpart is in __btrfs_free_extent(), with examples and more details + * how it works. + * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for From d16c702fe4f274bd77b47d3ab737eadcf24e0b93 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Aug 2020 14:35:50 +0800 Subject: [PATCH 059/161] btrfs: ctree: check key order before merging tree blocks [BUG] With a crafted image, btrfs can panic at btrfs_del_csums(): kernel BUG at fs/btrfs/ctree.c:3188! invalid opcode: 0000 [#1] SMP PTI CPU: 0 PID: 1156 Comm: btrfs-transacti Not tainted 5.0.0-rc8+ #9 RIP: 0010:btrfs_set_item_key_safe+0x16c/0x180 RSP: 0018:ffff976141257ab8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff898a6b890930 RCX: 0000000004b70000 RDX: 0000000000000000 RSI: ffff976141257bae RDI: ffff976141257acf RBP: ffff976141257b10 R08: 0000000000001000 R09: ffff9761412579a8 R10: 0000000000000000 R11: 0000000000000000 R12: ffff976141257abe R13: 0000000000000003 R14: ffff898a6a8be578 R15: ffff976141257bae FS: 0000000000000000(0000) GS:ffff898a77a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f779d9cd624 CR3: 000000022b2b4006 CR4: 00000000000206f0 Call Trace: truncate_one_csum+0xac/0xf0 btrfs_del_csums+0x24f/0x3a0 __btrfs_free_extent.isra.72+0x5a7/0xbe0 __btrfs_run_delayed_refs+0x539/0x1120 btrfs_run_delayed_refs+0xdb/0x1b0 btrfs_commit_transaction+0x52/0x950 ? start_transaction+0x94/0x450 transaction_kthread+0x163/0x190 kthread+0x105/0x140 ? btrfs_cleanup_transaction+0x560/0x560 ? kthread_destroy_worker+0x50/0x50 ret_from_fork+0x35/0x40 Modules linked in: ---[ end trace 93bf9db00e6c374e ]--- [CAUSE] This crafted image has a tricky key order corruption: checksum tree key (CSUM_TREE ROOT_ITEM 0) node 29741056 level 1 items 14 free 107 generation 19 owner CSUM_TREE ... key (EXTENT_CSUM EXTENT_CSUM 73785344) block 29757440 gen 19 key (EXTENT_CSUM EXTENT_CSUM 77594624) block 29753344 gen 19 ... leaf 29757440 items 5 free space 150 generation 19 owner CSUM_TREE item 0 key (EXTENT_CSUM EXTENT_CSUM 73785344) itemoff 2323 itemsize 1672 range start 73785344 end 75497472 length 1712128 item 1 key (EXTENT_CSUM EXTENT_CSUM 75497472) itemoff 2319 itemsize 4 range start 75497472 end 75501568 length 4096 item 2 key (EXTENT_CSUM EXTENT_CSUM 75501568) itemoff 579 itemsize 1740 range start 75501568 end 77283328 length 1781760 item 3 key (EXTENT_CSUM EXTENT_CSUM 77283328) itemoff 575 itemsize 4 range start 77283328 end 77287424 length 4096 item 4 key (EXTENT_CSUM EXTENT_CSUM 4120596480) itemoff 275 itemsize 300 <<< range start 4120596480 end 4120903680 length 307200 leaf 29753344 items 3 free space 1936 generation 19 owner CSUM_TREE item 0 key (18446744073457893366 EXTENT_CSUM 77594624) itemoff 2323 itemsize 1672 range start 77594624 end 79306752 length 1712128 ... 
Note the item 4 key of leaf 29757440, which is obviously too large, and even larger than the first key of the next leaf. However it still follows the key order inside that tree block, so the tree checker is unable to detect it at read time: the tree checker can only work inside one leaf, thus such cross-leaf corruption can't be detected in advance. [FIX] The next chance to detect such a problem is at tree block merge time, i.e. in push_node_left(), balance_node_right(), push_leaf_left() or push_leaf_right(). Now we check whether the right-most key of the left node is larger than the left-most key of the right node. This way we don't need to run the full tree checker while still keeping the key order correct: the key order inside each node is already verified by the tree checker, so we only need to check these two slots. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202833 Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e0fd8a34efcc..be7d01055118 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3163,6 +3163,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, fixup_low_keys(path, &disk_key, 1); } +/* + * Check key order of two sibling extent buffers. + * + * Return true if something is wrong. + * Return false if everything is fine. + * + * Tree-checker only works inside one tree block, thus the following + * corruption can not be detected by tree-checker: + * + * Leaf @left | Leaf @right + * -------------------------------------------------------------- + * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | + * + * Key f6 in leaf @left itself is valid, but not valid when the next + * key in leaf @right is 7. + * This can only be checked at tree block merge time. + * And since tree checker has ensured all key order in each tree block + * is correct, we only need to bother the last key of @left and the first + * key of @right. + */ +static bool check_sibling_keys(struct extent_buffer *left, + struct extent_buffer *right) +{ + struct btrfs_key left_last; + struct btrfs_key right_first; + int level = btrfs_header_level(left); + int nr_left = btrfs_header_nritems(left); + int nr_right = btrfs_header_nritems(right); + + /* No key to check in one of the tree blocks */ + if (!nr_left || !nr_right) + return false; + + if (level) { + btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_node_key_to_cpu(right, &right_first, 0); + } else { + btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_item_key_to_cpu(right, &right_first, 0); + } + + if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + btrfs_crit(left->fs_info, +"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", + left_last.objectid, left_last.type, + left_last.offset, right_first.objectid, + right_first.type, right_first.offset); + return true; + } + return false; +} + /* * try to push data from one node into the next node left in the * tree.
@@ -3207,6 +3259,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + /* dst is the left eb, src is the middle eb */ + if (check_sibling_keys(dst, src)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3275,6 +3333,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + /* dst is the right eb, src is the middle eb */ + if (check_sibling_keys(src, dst)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), @@ -3751,6 +3815,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (path->slots[0] == left_nritems && !empty) { /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete @@ -3988,6 +4058,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + goto out; + } return __push_leaf_left(path, min_data_size, empty, left, free_space, right_nritems, max_slot); From f4cfa9bdd40c038ac901fbf3c57ab63e9e8eb949 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 22 Jul 2020 11:09:22 +0300 Subject: [PATCH 060/161] btrfs: use RCU for quick device check in btrfs_init_new_device When adding a new device there's a mandatory check to see if the device is being duplicated into the filesystem it's added to. Since this is a read-only operation it's not necessary to take device_list_mutex; we can simply make do with an RCU read lock. Using just RCU is safe because there won't be another device add/delete running in parallel, as btrfs_init_new_device is called only from btrfs_ioctl_add_dev. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 29c6f0154c62..5449e3899e67 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2515,16 +2515,15 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path filemap_write_and_wait(bdev->bd_inode->i_mapping); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; - mutex_unlock( - &fs_devices->device_list_mutex); + rcu_read_unlock(); goto error; } } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); device = btrfs_alloc_device(fs_info, NULL, NULL); if (IS_ERR(device)) { From 44cab9ba374a0341e12c024f43a83e12ec4978fd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 22 Jul 2020 11:09:23 +0300 Subject: [PATCH 061/161] btrfs: refactor locked condition in btrfs_init_new_device Invert unlocked to locked and exploit the fact it can only ever be modified if we are adding a new device to a seed filesystem. This allows us to simplify the check in the error: label.
No semantic changes. Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5449e3899e67..3e3aacaedf47 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2497,7 +2497,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 orig_super_num_devices; int seeding_dev = 0; int ret = 0; - bool unlocked = false; + bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; @@ -2511,6 +2511,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); + locked = true; } filemap_write_and_wait(bdev->bd_inode->i_mapping); @@ -2645,7 +2646,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); - unlocked = true; + locked = false; if (ret) /* transaction commit */ return ret; @@ -2706,7 +2707,7 @@ error_free_device: btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev && !unlocked) { + if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } From 4ae312e9728fe568ce0d863421c399ec829d6df9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 22 Jul 2020 11:09:24 +0300 Subject: [PATCH 062/161] btrfs: remove redundant code from btrfs_free_stale_devices Following the refactor of btrfs_free_stale_devices in 7bcb8164ad94 ("btrfs: use device_list_mutex when removing stale devices") fs_devices are freed after they have been iterated by the inner list_for_each, so the break introduced in fd649f10c3d2 ("btrfs: Fix use-after-free when cleaning up fs_devs with a single stale device") to avoid a use-after-free is no longer necessary. Just remove it altogether. No functional changes. Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3e3aacaedf47..9355f5c78c15 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -595,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path, btrfs_free_device(device); ret = 0; - if (fs_devices->num_devices == 0) - break; } mutex_unlock(&fs_devices->device_list_mutex); From b9ba017fb0771f67903d70f57908899d7b0020b1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 22 Jul 2020 11:09:25 +0300 Subject: [PATCH 063/161] btrfs: don't opencode sync_blockdev in btrfs_init_new_device Instead of open coding filemap_write_and_wait, simply call sync_blockdev as it makes it abundantly clear what's going on and why this is used. No semantic changes.
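[ For reference, sync_blockdev() bottoms out in the same filemap call; this is a paraphrased sketch of the fs/block_dev.c helpers around the time of this series, so verify against the exact tree: ]

static int __sync_blockdev(struct block_device *bdev, int wait)
{
	if (!bdev)
		return 0;
	/* Without @wait this only starts writeback. */
	if (!wait)
		return filemap_flush(bdev->bd_inode->i_mapping);
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}

int sync_blockdev(struct block_device *bdev)
{
	/* Write and wait, i.e. exactly what the open coded call did. */
	return __sync_blockdev(bdev, 1);
}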
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9355f5c78c15..9d169cba8514 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2512,7 +2512,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { From 329ced799be881feab445b68163cd3869340cc08 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:00 -0400 Subject: [PATCH 064/161] btrfs: rename extent_buffer::lock_nested to extent_buffer::lock_recursed Nested locking with lockdep and everything else refers to lock hierarchy within the same lock map. This is how we indicate the same locks for different objects are ok to take in a specific order, for our use case that would be to take the lock on a leaf and then take a lock on an adjacent leaf. What ->lock_nested _actually_ refers to is if we happen to already be holding the write lock on the extent buffer and we're allowing a read lock to be taken on that extent buffer, which is recursion. Rename this so we don't get confused when we switch to a rwsem and have to start using the _nested helpers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/locking.c | 24 ++++++++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a28d442d65b5..f971e9d689df 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4990,7 +4990,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, rwlock_init(&eb->lock); atomic_set(&eb->blocking_readers, 0); eb->blocking_writers = 0; - eb->lock_nested = false; + eb->lock_recursed = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 30794ae58498..9e1e22f1586a 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -102,7 +102,7 @@ struct extent_buffer { int blocking_writers; atomic_t blocking_readers; - bool lock_nested; + bool lock_recursed; /* >= 0 if eb belongs to a log tree, -1 otherwise */ short log_index; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index f75612e18a82..14ceb2ce33ac 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,8 +57,8 @@ * performance reasons. * * - * Lock nesting - * ------------ + * Lock recursion + * -------------- * * A write operation on a tree might indirectly start a look up on the same * tree. This can happen when btrfs_cow_block locks the tree and needs to @@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; btrfs_assert_tree_read_locked(eb); atomic_inc(&eb->blocking_readers); @@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. 
*/ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); @@ -263,8 +263,8 @@ again: * depends on this as it may be called on a partly * (write-)locked tree. */ - BUG_ON(eb->lock_nested); - eb->lock_nested = true; + BUG_ON(eb->lock_recursed); + eb->lock_recursed = true; read_unlock(&eb->lock); trace_btrfs_tree_read_lock(eb, start_ns); return; @@ -362,11 +362,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -388,11 +388,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); From 51899412dd95b2d9f50297c527b22ada3da0000c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:01 -0400 Subject: [PATCH 065/161] btrfs: introduce btrfs_path::recurse Our current tree locking stuff allows us to recurse with read locks if we're already holding the write lock. This is necessary for the space cache inode, as we could be holding a lock on the root_tree root when we need to cache a block group, and thus need to be able to read down the root_tree to read in the inode cache. We can get away with this in our current locking, but we won't be able to with a rwsem. Handle this by purposefully annotating the places where we require recursion, so that in the future we can maybe come up with a way to avoid the recursion. In the case of the free space inode, this will be superseded by the free space tree. 
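[ Illustration, not part of the patch: how a caller would opt in to the recursion annotation. example_lookup() is a hypothetical helper; the only real opt-in added here is btrfs_get_extent() for the free space inode. ]

static int example_lookup(struct btrfs_root *root, struct btrfs_inode *inode,
			  const struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Annotate that this search may legitimately take a read lock on
	 * an eb whose write lock we already hold (free space inode only).
	 */
	path->recurse = btrfs_is_free_space_inode(inode);

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}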
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 ++++---- fs/btrfs/ctree.h | 3 +-- fs/btrfs/inode.c | 2 ++ fs/btrfs/locking.c | 13 ++++++++++--- fs/btrfs/locking.h | 9 +++++++++ 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index be7d01055118..7c99bf112960 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2601,7 +2601,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = btrfs_read_lock_root_node(root); + b = __btrfs_read_lock_root_node(root, p->recurse); level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -2875,7 +2875,7 @@ cow_done: } else { if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); + __btrfs_tree_read_lock(b, p->recurse); } p->locks[level] = BTRFS_READ_LOCK; } @@ -5453,7 +5453,7 @@ again: } if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5488,7 +5488,7 @@ again: ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9d4e0958e2e..1339e390a757 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -374,6 +374,7 @@ struct btrfs_path { unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; + unsigned int recurse:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) @@ -2654,8 +2655,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 123521aa5595..1fbc7d9e5103 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6585,6 +6585,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, */ path->leave_spinning = 1; + path->recurse = btrfs_is_free_space_inode(inode); + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { goto out; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 14ceb2ce33ac..740a0cde5731 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -244,7 +244,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * * The rwlock is held upon exit. */ -void btrfs_tree_read_lock(struct extent_buffer *eb) +void __btrfs_tree_read_lock(struct extent_buffer *eb, bool recurse) { u64 start_ns = 0; @@ -263,6 +263,7 @@ again: * depends on this as it may be called on a partly * (write-)locked tree. 
*/ + WARN_ON(!recurse); BUG_ON(eb->lock_recursed); eb->lock_recursed = true; read_unlock(&eb->lock); @@ -279,6 +280,11 @@ again: trace_btrfs_tree_read_lock(eb, start_ns); } +void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + __btrfs_tree_read_lock(eb, false); +} + /* * Lock extent buffer for read, optimistically expecting that there are no * contending blocking writers. If there are, don't wait. @@ -552,13 +558,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * * Return: root extent buffer with read lock held */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); + __btrfs_tree_read_lock(eb, recurse); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index d715846c10b8..31f6d6405c1d 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -21,6 +21,7 @@ struct btrfs_path; void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); +void __btrfs_tree_read_lock(struct extent_buffer *eb, bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); @@ -29,6 +30,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse); + +static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + return __btrfs_read_lock_root_node(root, false); +} #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { From fd7ba1c1202d15ac96d7b42256550973e12686b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:02 -0400 Subject: [PATCH 066/161] btrfs: add nesting tags to the locking helpers We will need these when we switch to an rwsem, so plumb in the infrastructure here to use later on. I violate the 80 character limit some here because it'll be cleaned up later. 
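[ Sketch of where the nesting tag is headed, assuming the later rwsem conversion; this is not part of this patch. Once eb->lock becomes a rw_semaphore, the tag turns directly into a lockdep subclass: ]

void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
	/* Each enum btrfs_lock_nesting value is a distinct lockdep subclass. */
	down_write_nested(&eb->lock, nest);
	eb->lock_owner = current->pid;
}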
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 ++++++++--- fs/btrfs/locking.c | 14 ++++++++++---- fs/btrfs/locking.h | 25 ++++++++++++++++++++++++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7c99bf112960..71e3fa30082c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2875,7 +2875,8 @@ cow_done: } else { if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); - __btrfs_tree_read_lock(b, p->recurse); + __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, + p->recurse); } p->locks[level] = BTRFS_READ_LOCK; } @@ -5453,7 +5454,9 @@ again: } if (!ret) { btrfs_set_path_blocking(path); - __btrfs_tree_read_lock(next, path->recurse); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_NORMAL, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5488,7 +5491,9 @@ again: ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - __btrfs_tree_read_lock(next, path->recurse); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_NORMAL, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 740a0cde5731..66e02ebdd340 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * * The rwlock is held upon exit. */ -void __btrfs_tree_read_lock(struct extent_buffer *eb, bool recurse) +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse) { u64 start_ns = 0; @@ -282,7 +283,7 @@ again: void btrfs_tree_read_lock(struct extent_buffer *eb) { - __btrfs_tree_read_lock(eb, false); + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); } /* @@ -415,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * * The rwlock is held for write upon exit. */ -void btrfs_tree_lock(struct extent_buffer *eb) +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; @@ -440,6 +441,11 @@ again: trace_btrfs_tree_lock(eb, start_ns); } +void btrfs_tree_lock(struct extent_buffer *eb) +{ + __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); +} + /* * Release the write lock, either blocking or spinning (ie. there's no need * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). @@ -565,7 +571,7 @@ struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, while (1) { eb = btrfs_root_node(root); - __btrfs_tree_read_lock(eb, recurse); + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 31f6d6405c1d..5fabda8f0a80 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -16,12 +16,35 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 +/* + * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at + * the time of this patch is 8, which is how many we use. Keep this in mind if + * you decide you want to add another subclass. + */ +enum btrfs_lock_nesting { + BTRFS_NESTING_NORMAL, + + /* + * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so + * add this in here and add a static_assert to keep us from going over + * the limit. As of this writing we're limited to 8, and we're + * definitely using 8, hence this check to keep us from messing up in + * the future. 
+ */ + BTRFS_NESTING_MAX, +}; + +static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, + "too many lock subclasses defined"); + struct btrfs_path; +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); -void __btrfs_tree_read_lock(struct extent_buffer *eb, bool recurse); +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); From 9631e4cc1a030aa71dea588cc9a57533b657489f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:03 -0400 Subject: [PATCH 067/161] btrfs: introduce BTRFS_NESTING_COW for cow'ing blocks When we COW a block we are holding a lock on the original block, and then we lock the new COW block. Because our lockdep maps are based on root + level, this will make lockdep complain. We need a way to indicate a subclass for locking the COW'ed block, so plumb through our btrfs_lock_nesting from btrfs_cow_block down to the btrfs_init_buffer, and then introduce BTRFS_NESTING_COW to be used for cow'ing blocks. The reason I've added all this extra infrastructure is because there will be need of different nesting classes in follow up patches. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 52 ++++++++++++++++++++++++++---------------- fs/btrfs/ctree.h | 6 +++-- fs/btrfs/disk-io.c | 5 ++-- fs/btrfs/extent-tree.c | 12 ++++++---- fs/btrfs/ioctl.c | 3 ++- fs/btrfs/locking.h | 8 +++++++ fs/btrfs/relocation.c | 11 +++++---- fs/btrfs/transaction.c | 5 ++-- 8 files changed, 66 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 71e3fa30082c..48178fd8740f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -198,7 +198,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_node_key(buf, &disk_key, 0); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, - &disk_key, level, buf->start, 0); + &disk_key, level, buf->start, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -957,7 +957,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( const struct btrfs_disk_key *disk_key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *ret; @@ -986,7 +987,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( ret = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, disk_key, level, - hint, empty_size); + hint, empty_size, nest); trans->can_flush_pending_bgs = true; return ret; @@ -1009,7 +1010,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -1040,7 +1042,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = parent->start; cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1446,7 
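[ Condensed illustration, a hypothetical helper with simplified arguments: the source block and its fresh copy sit in the same rootid+level lockdep map, so the copy must be locked with its own subclass. ]

static void example_cow_lock_order(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	struct extent_buffer *cow;

	/* The block being COWed is locked with the default subclass. */
	__btrfs_tree_lock(buf, BTRFS_NESTING_NORMAL);

	/*
	 * btrfs_init_new_buffer() locks the copy for us; with this patch
	 * it does __btrfs_tree_lock(cow, BTRFS_NESTING_COW) instead of a
	 * plain btrfs_tree_lock(), so lockdep sees two subclasses.
	 */
	cow = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				     NULL, btrfs_header_level(buf),
				     buf->start, 0, BTRFS_NESTING_COW);
	if (IS_ERR(cow)) {
		btrfs_tree_unlock(buf);
		return;
	}
	/* ... copy items into @cow, update the parent, unlock both ... */
}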
+1448,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; @@ -1485,7 +1488,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); + parent_slot, cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); @@ -1657,7 +1660,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize)); + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -1855,7 +1859,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(child); btrfs_set_lock_blocking_write(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); @@ -1894,7 +1899,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(left); btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); + parent, pslot - 1, &left, + BTRFS_NESTING_COW); if (wret) { ret = wret; goto enospc; @@ -1909,7 +1915,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(right); btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); + parent, pslot + 1, &right, + BTRFS_NESTING_COW); if (wret) { ret = wret; goto enospc; @@ -2077,7 +2084,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); + pslot - 1, &left, + BTRFS_NESTING_COW); if (ret) wret = 1; else { @@ -2132,7 +2140,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right); + &right, BTRFS_NESTING_COW); if (ret) wret = 1; else { @@ -2740,11 +2748,13 @@ again: btrfs_set_path_blocking(p); if (last_level) err = btrfs_cow_block(trans, root, b, NULL, 0, - &b); + &b, + BTRFS_NESTING_COW); else err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], &b); + p->slots[level + 1], &b, + BTRFS_NESTING_COW); if (err) { ret = err; goto done; @@ -3396,7 +3406,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, - root->node->start, 0); + root->node->start, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(c)) return PTR_ERR(c); @@ -3526,7 +3537,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_node_key(c, &disk_key, mid); split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, - c->start, 0); + c->start, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(split)) return PTR_ERR(split); @@ -3804,7 +3815,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* 
cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); + slot + 1, &right, BTRFS_NESTING_COW); if (ret) goto out_unlock; @@ -4045,7 +4056,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_COW); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ if (ret == -ENOSPC) @@ -4312,7 +4324,7 @@ again: btrfs_item_key(l, &disk_key, mid); right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0); + l->start, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(right)) return PTR_ERR(right); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1339e390a757..b4d3a4d82053 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2526,7 +2526,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size); + u64 empty_size, + enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2667,7 +2668,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret); + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4eb56f00ee81..ed887242f348 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1205,7 +1205,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -1277,7 +1278,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0); + NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { btrfs_put_root(root); return ERR_CAST(leaf); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7b61b7b5122f..3b21fee13e77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4656,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, int level, u64 owner) + u64 bytenr, int level, u64 owner, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; @@ -4679,7 +4680,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, } btrfs_set_buffer_lockdep_class(owner, buf, level); - btrfs_tree_lock(buf); + __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); @@ -4725,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size) + u64 
empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; @@ -4741,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - level, root_objectid); + level, root_objectid, nest); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -4758,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, - root_objectid); + root_objectid, nest); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3779a6c12184..b91444e810a5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -628,7 +628,8 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 5fabda8f0a80..8b47ba34fb03 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -24,6 +24,14 @@ enum btrfs_lock_nesting { BTRFS_NESTING_NORMAL, + /* + * When we COW a block we are holding the lock on the original block, + * and since our lockdep maps are rootid+level, this confuses lockdep + * when we lock the newly allocated COW'd block. Handle this by having + * a subclass for COW'ed blocks so that lockdep doesn't complain. + */ + BTRFS_NESTING_COW, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4ba1ab9cc76d..3602806d71bd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1206,7 +1206,8 @@ again: } if (cow) { - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1274,7 +1275,8 @@ again: btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); + slot, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * relocated and the block is tree root. 
*/ leaf = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, + BTRFS_NESTING_COW); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); if (ret < 0) @@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, - slot, &eb); + slot, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); if (ret < 0) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f8265dbec9c3..52ada47aff50 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1184,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, - 0, &eb); + 0, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -1589,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); From bf77467a93bdf11d815f5c5f25206a0d058da8de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:04 -0400 Subject: [PATCH 068/161] btrfs: introduce BTRFS_NESTING_LEFT/BTRFS_NESTING_RIGHT Our lockdep maps are based on rootid+level, however in some cases we will lock adjacent blocks on the same level, namely in searching forward or in split/balance. Because of this lockdep will complain, so we need a separate subclass to indicate to lockdep that these are different locks. lock leaf -> BTRFS_NESTING_NORMAL cow leaf -> BTRFS_NESTING_COW split leaf lock left -> BTRFS_NESTING_LEFT lock right -> BTRFS_NESTING_RIGHT The above graph illustrates the need for this new nesting subclass. 
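[ Illustration only, a hypothetical helper: all three blocks are on the same level, hence in the same rootid+level lockdep map, so each sibling gets its own subclass and the acquisitions look ordered to lockdep. ]

static void example_lock_adjacent(struct extent_buffer *middle,
				  struct extent_buffer *left,
				  struct extent_buffer *right)
{
	/* @middle is assumed already locked with BTRFS_NESTING_NORMAL. */
	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

	/* ... push items from @middle to the siblings ... */

	btrfs_tree_unlock(right);
	btrfs_tree_unlock(left);
}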
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/locking.h | 12 ++++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 48178fd8740f..94af1f385ea6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1896,7 +1896,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = NULL; if (left) { - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, @@ -1912,7 +1912,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; if (right) { - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, @@ -2076,7 +2076,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (left) { u32 left_nr; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); @@ -2131,7 +2131,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (right) { u32 right_nr; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); @@ -3806,7 +3806,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(right)) return 1; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(right); @@ -4045,7 +4045,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(left)) return 1; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(left); @@ -5467,7 +5467,7 @@ again: if (!ret) { btrfs_set_path_blocking(path); __btrfs_tree_read_lock(next, - BTRFS_NESTING_NORMAL, + BTRFS_NESTING_RIGHT, path->recurse); } next_rw_lock = BTRFS_READ_LOCK; @@ -5504,7 +5504,7 @@ again: if (!ret) { btrfs_set_path_blocking(path); __btrfs_tree_read_lock(next, - BTRFS_NESTING_NORMAL, + BTRFS_NESTING_RIGHT, path->recurse); } next_rw_lock = BTRFS_READ_LOCK; diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 8b47ba34fb03..5844bc1c8410 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -32,6 +32,18 @@ enum btrfs_lock_nesting { */ BTRFS_NESTING_COW, + /* + * Oftentimes we need to lock adjacent nodes on the same level while + * still holding the lock on the original node we searched to, such as + * for searching forward or for split/balance. + * + * Because of this we need to indicate to lockdep that this is + * acceptable by having a different subclass for each of these + * operations. + */ + BTRFS_NESTING_LEFT, + BTRFS_NESTING_RIGHT, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over From bf59a5a21604ef79da9105bef1bae817fd053e75 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:05 -0400 Subject: [PATCH 069/161] btrfs: introduce BTRFS_NESTING_LEFT/RIGHT_COW For similar reasons as BTRFS_NESTING_COW, we need BTRFS_NESTING_LEFT/RIGHT_COW. 
The pattern is this lock leaf -> BTRFS_NESTING_NORMAL cow leaf -> BTRFS_NESTING_COW split leaf lock left -> BTRFS_NESTING_LEFT cow left -> BTRFS_NESTING_LEFT_COW We need this in order to indicate to lockdep that these locks are discrete and are being taken in a safe order. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 12 ++++++------ fs/btrfs/locking.h | 8 ++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 94af1f385ea6..ecf8bf240d73 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1900,7 +1900,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, - BTRFS_NESTING_COW); + BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; goto enospc; @@ -1916,7 +1916,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, - BTRFS_NESTING_COW); + BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; goto enospc; @@ -2085,7 +2085,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, - BTRFS_NESTING_COW); + BTRFS_NESTING_LEFT_COW); if (ret) wret = 1; else { @@ -2140,7 +2140,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right, BTRFS_NESTING_COW); + &right, BTRFS_NESTING_RIGHT_COW); if (ret) wret = 1; else { @@ -3815,7 +3815,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right, BTRFS_NESTING_COW); + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; @@ -4057,7 +4057,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, left, path->nodes[1], slot - 1, &left, - BTRFS_NESTING_COW); + BTRFS_NESTING_LEFT_COW); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ if (ret == -ENOSPC) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 5844bc1c8410..16563bc486c7 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -44,6 +44,14 @@ enum btrfs_lock_nesting { BTRFS_NESTING_LEFT, BTRFS_NESTING_RIGHT, + /* + * When splitting we will be holding a lock on the left/right node when + * we need to cow that node, thus we need a new set of subclasses for + * these two operations. 
+ */ + BTRFS_NESTING_LEFT_COW, + BTRFS_NESTING_RIGHT_COW, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over From 4dff97e69005ea90266f3e3dda295264e854c15d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:06 -0400 Subject: [PATCH 070/161] btrfs: introduce BTRFS_NESTING_SPLIT for split blocks If we are splitting a leaf/node, we could do something like the following: lock(leaf) BTRFS_NESTING_NORMAL lock(left) BTRFS_NESTING_LEFT + BTRFS_NESTING_COW push from leaf -> left reset path to point to left split left allocate new block, lock block BTRFS_NESTING_SPLIT At the point of the new block we need a different nesting level, because we have already used either BTRFS_NESTING_LEFT or BTRFS_NESTING_RIGHT when pushing items from the original leaf into the adjacent leaves. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++-- fs/btrfs/locking.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ecf8bf240d73..884c8b5a0e62 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3537,7 +3537,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_node_key(c, &disk_key, mid); split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, - c->start, 0, BTRFS_NESTING_NORMAL); + c->start, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); @@ -4324,7 +4324,7 @@ again: btrfs_item_key(l, &disk_key, mid); right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0, BTRFS_NESTING_NORMAL); + l->start, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 16563bc486c7..a6b59808e046 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -52,6 +52,15 @@ enum btrfs_lock_nesting { BTRFS_NESTING_LEFT_COW, BTRFS_NESTING_RIGHT_COW, + /* + * When splitting we may push nodes to the left or right, but still use + * the subsequent nodes in our path, keeping our locks on those adjacent + * blocks. Thus when we go to allocate a new split block we've already + * used up all of our available subclasses, so this subclass exists to + * handle this case where we need to allocate a new split block. + */ + BTRFS_NESTING_SPLIT, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over From cf6f34aa3ada0be8c5f90fe93f48a75fea082c51 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:07 -0400 Subject: [PATCH 071/161] btrfs: introduce BTRFS_NESTING_NEW_ROOT for adding new roots From a locking perspective, the way we add new roots is confusing for lockdep. We generally have the rule that we lock things in order from highest level to lowest, but in the case of adding a new level to the tree we actually allocate a new block for the root, which makes the locking go in reverse. A similar issue exists for snapshotting: we cow the original root to create the root of a new tree, even though they're at the same level. Address this by using BTRFS_NESTING_NEW_ROOT for these operations.
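For illustration, a hedged sketch of why the order looks inverted. As before, plain rw_semaphores stand in for tree block locks and the demo_* names are hypothetical, not btrfs code:

    #include <linux/rwsem.h>

    enum demo_root_nesting {
            DEMO_ROOT_NORMAL,
            DEMO_ROOT_NEW_ROOT,
    };

    /*
     * Growing the tree by one level: the root we already hold is the
     * lower-level block, and the freshly allocated root above it is
     * locked second. To lockdep this looks like taking a higher-level
     * lock after a lower-level one, so the new root gets its own
     * subclass.
     */
    static void demo_insert_new_root(struct rw_semaphore *old_root,
                                     struct rw_semaphore *new_root)
    {
            down_write_nested(old_root, DEMO_ROOT_NORMAL);
            /* ... allocate the new root block ... */
            down_write_nested(new_root, DEMO_ROOT_NEW_ROOT);
            /* ... point the new root at the old one, bump the level ... */
            up_write(new_root);
            up_write(old_root);
    }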
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 5 +++-- fs/btrfs/locking.h | 9 +++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 884c8b5a0e62..d61ea238ad8a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_node_key(buf, &disk_key, 0); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, - &disk_key, level, buf->start, 0, BTRFS_NESTING_NORMAL); + &disk_key, level, buf->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -3407,7 +3408,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, root->node->start, 0, - BTRFS_NESTING_NORMAL); + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index a6b59808e046..3ea81ed3320b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -61,6 +61,15 @@ enum btrfs_lock_nesting { */ BTRFS_NESTING_SPLIT, + /* + * When promoting a new block to a root we need to have a special + * subclass so we don't confuse lockdep, as it will appear that we are + * locking a higher level node before a lower level one. Copying also + * has this problem as it appears we're locking the same block again + * when we make a snapshot of an existing root. + */ + BTRFS_NESTING_NEW_ROOT, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over From ca9d473a3e300bcddc73c00fdf2f4bf6ca43c4a2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:08 -0400 Subject: [PATCH 072/161] btrfs: use BTRFS_NESTING_NEW_ROOT for double splits I've made this change separate since it requires both of the newly added NESTING subclasses and I didn't want to slip it into one of those changes. If we do a double split of a node we can end up doing a BTRFS_NESTING_SPLIT on level 0, which throws lockdep off because it appears as a double lock. Since we're maxed out on subclasses, use BTRFS_NESTING_NEW_ROOT if we had to do a double split. This is OK because we won't have to do a double split if we had to insert a new root, and the new root would be at a higher level anyway. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d61ea238ad8a..7dbfa365eb18 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4324,8 +4324,18 @@ again: else btrfs_item_key(l, &disk_key, mid); + /* + * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double + * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES + * subclasses, which is 8 at the time of this patch, and we've maxed it + * out. In the future we could add a + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0, BTRFS_NESTING_SPLIT); + l->start, 0, num_doubles ?
+ BTRFS_NESTING_NEW_ROOT : + BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); From b79b724969ad5e5a01548c75f30fc14c893e825b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:38 +0300 Subject: [PATCH 073/161] btrfs: make inode_tree_del take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1fbc7d9e5103..2306fbaafb28 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5305,15 +5305,15 @@ static void inode_tree_add(struct inode *inode) spin_unlock(&root->inode_lock); } -static void inode_tree_del(struct inode *inode) +static void inode_tree_del(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; int empty = 0; spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + if (!RB_EMPTY_NODE(&inode->rb_node)) { + rb_erase(&inode->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&inode->rb_node); empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); @@ -8685,7 +8685,7 @@ void btrfs_destroy_inode(struct inode *inode) } } btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); - inode_tree_del(inode); + inode_tree_del(BTRFS_I(inode)); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); btrfs_put_root(BTRFS_I(inode)->root); From 6d072c8e291f4127a7a40addb7ef553f9490a600 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:39 +0300 Subject: [PATCH 074/161] btrfs: make btrfs_lookup_first_ordered_extent take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 6 ++++-- fs/btrfs/inode.c | 3 ++- fs/btrfs/ordered-data.c | 6 +++--- fs/btrfs/ordered-data.h | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4bf15846efdc..7c37a43cc2e3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2541,7 +2541,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + lockend); /* * We need to make sure we have no ordered extents in this range @@ -3400,7 +3401,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + locked_end); if (ordered && ordered->file_offset + ordered->num_bytes > alloc_start && diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2306fbaafb28..93063edc2d7b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8672,7 +8672,8 @@ void btrfs_destroy_inode(struct inode *inode) return; while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + (u64)-1); if (!ordered) break; else { diff --git a/fs/btrfs/ordered-data.c 
b/fs/btrfs/ordered-data.c index 4732c5b89460..5a7e9c5872d3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -710,7 +710,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) end = orig_end; while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); if (!ordered) break; if (ordered->file_offset > orig_end) { @@ -838,13 +838,13 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, * if none is found */ struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 644258a7dfb1..b287a2a403e6 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -178,7 +178,7 @@ void btrfs_start_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, From acbf1dd0fcbd10c67826a19958f55a053b32f532 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:40 +0300 Subject: [PATCH 075/161] btrfs: make ordered extent tracepoint take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 8 ++++---- include/trace/events/btrfs.h | 17 ++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5a7e9c5872d3..369ddad30d9c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -217,7 +217,7 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry); + trace_btrfs_ordered_extent_add(inode, entry); spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, @@ -442,7 +442,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); + trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); @@ -528,7 +528,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; - trace_btrfs_ordered_extent_remove(inode, entry); + trace_btrfs_ordered_extent_remove(BTRFS_I(inode), entry); if (!root->nr_ordered_extents) { spin_lock(&fs_info->ordered_root_lock); @@ -658,7 +658,7 @@ void btrfs_start_ordered_extent(struct inode *inode, u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; - trace_btrfs_ordered_extent_start(inode, entry); + 
trace_btrfs_ordered_extent_start(BTRFS_I(inode), entry); /* * pages in the range can be dirty, clean or writeback. We diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b9241836d4f7..8d311062d376 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -510,7 +510,7 @@ DEFINE_EVENT( DECLARE_EVENT_CLASS(btrfs__ordered_extent, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered), @@ -529,8 +529,8 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __field( u64, truncated_len ) ), - TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), - __entry->ino = btrfs_ino(BTRFS_I(inode)); + TP_fast_assign_btrfs(inode->root->fs_info, + __entry->ino = btrfs_ino(inode); __entry->file_offset = ordered->file_offset; __entry->start = ordered->disk_bytenr; __entry->len = ordered->num_bytes; @@ -539,8 +539,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __entry->flags = ordered->flags; __entry->compress_type = ordered->compress_type; __entry->refs = refcount_read(&ordered->refs); - __entry->root_objectid = - BTRFS_I(inode)->root->root_key.objectid; + __entry->root_objectid = inode->root->root_key.objectid; __entry->truncated_len = ordered->truncated_len; ), @@ -563,7 +562,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -571,7 +570,7 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -579,7 +578,7 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -587,7 +586,7 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) From 90c0304c6307a68e1e4af637d56d6613ed026875 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:41 +0300 Subject: [PATCH 076/161] btrfs: make btrfs_dec_test_ordered_pending take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ordered-data.c | 7 +++---- fs/btrfs/ordered-data.h | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 93063edc2d7b..8b50cad7ee8f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2781,8 +2781,8 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1, uptodate)) + if (!btrfs_dec_test_ordered_pending(BTRFS_I(inode), &ordered_extent, + start, end - start + 1, uptodate)) return; if (btrfs_is_free_space_inode(BTRFS_I(inode))) @@ 
-8184,8 +8184,8 @@ again: ordered->truncated_len = new_len; spin_unlock_irq(&tree->lock); - if (btrfs_dec_test_ordered_pending(inode, &ordered, - start, + if (btrfs_dec_test_ordered_pending(BTRFS_I(inode), + &ordered, start, end - start + 1, 1)) btrfs_finish_ordered_io(ordered); } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 369ddad30d9c..168a5edd939d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -378,17 +378,16 @@ out: * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used * to make sure this function only returns 1 once for a given ordered extent. */ -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; int ret; - tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { entry = *cached; @@ -409,7 +408,7 @@ have_entry: } if (io_size > entry->bytes_left) { - btrfs_crit(BTRFS_I(inode)->root->fs_info, + btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index b287a2a403e6..1610195c6097 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -153,7 +153,7 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, From 6fee248d2beb3591f6f50aeeac843b366b116e3b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:42 +0300 Subject: [PATCH 077/161] btrfs: convert btrfs_inode_sectorsize to take btrfs_inode It's counterintuitive to have a function named btrfs_inode_xxx which takes a generic inode. Also move the function to btrfs_inode.h so that it has access to the definition of struct btrfs_inode. 
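As a usage illustration, a hypothetical caller (not part of this patch) that already operates on struct btrfs_inode can now query the sector size directly, with no BTRFS_I()/vfs_inode round trip:

    /*
     * Hypothetical helper: the length of [start, start + len) rounded
     * out to sector boundaries. Assumes it lives in fs/btrfs and
     * includes "btrfs_inode.h" for btrfs_inode_sectorsize().
     */
    static u64 demo_sector_aligned_len(struct btrfs_inode *inode,
                                       u64 start, u64 len)
    {
            const u32 sectorsize = btrfs_inode_sectorsize(inode);

            return round_up(start + len, sectorsize) -
                   round_down(start, sectorsize);
    }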
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 5 +++++ fs/btrfs/ctree.h | 4 ---- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/file.c | 10 +++++----- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/reflink.c | 2 +- 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 00f7831d0902..6fdb46d58299 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -217,6 +217,11 @@ struct btrfs_inode { struct inode vfs_inode; }; +static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode) +{ + return inode->root->fs_info->sectorsize; +} + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b4d3a4d82053..af2bd059daae 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1197,10 +1197,6 @@ struct btrfs_file_private { void *filldir_buf; }; -static inline u32 btrfs_inode_sectorsize(const struct inode *inode) -{ - return btrfs_sb(inode->i_sb)->sectorsize; -} static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f971e9d689df..94ec2455de99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4555,7 +4555,7 @@ next: static struct extent_map *get_extent_skip_holes(struct inode *inode, u64 offset, u64 last) { - u64 sectorsize = btrfs_inode_sectorsize(inode); + u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); struct extent_map *em; u64 len; @@ -4736,8 +4736,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out_free_ulist; } - start = round_down(start, btrfs_inode_sectorsize(inode)); - len = round_up(max, btrfs_inode_sectorsize(inode)) - start; + start = round_down(start, btrfs_inode_sectorsize(BTRFS_I(inode))); + len = round_up(max, btrfs_inode_sectorsize(BTRFS_I(inode))) - start; /* * lookup the last file extent. 
We're not using i_size here diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7c37a43cc2e3..30c511d1e70b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2901,9 +2901,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset, btrfs_inode_sectorsize(inode)); + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, - btrfs_inode_sectorsize(inode)) - 1; + btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* @@ -3108,7 +3108,7 @@ enum { static int btrfs_zero_range_check_range_boundary(struct inode *inode, u64 offset) { - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); struct extent_map *em; int ret; @@ -3138,7 +3138,7 @@ static int btrfs_zero_range(struct inode *inode, struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -3319,7 +3319,7 @@ static long btrfs_fallocate(struct file *file, int mode, u64 locked_end; u64 actual_end = 0; struct extent_map *em; - int blocksize = btrfs_inode_sectorsize(inode); + int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; alloc_start = round_down(offset, blocksize); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 168a5edd939d..d39a0fe4c463 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -870,7 +870,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; unsigned long i; - u32 sectorsize = btrfs_inode_sectorsize(inode); + u32 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5cd02514cf4d..7126f94cf216 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -52,7 +52,7 @@ static int copy_inline_to_page(struct inode *inode, const u64 datal, const u8 comp_type) { - const u64 block_size = btrfs_inode_sectorsize(inode); + const u64 block_size = btrfs_inode_sectorsize(BTRFS_I(inode)); const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); From 53ac7ead2446947facccbda392bbf499be435a59 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:43 +0300 Subject: [PATCH 078/161] btrfs: make btrfs_invalidatepage work on btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8b50cad7ee8f..301bbc41e963 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8124,15 +8124,15 @@ static int btrfs_migratepage(struct address_space *mapping, static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct inode *inode = page->mapping->host; - struct 
extent_io_tree *tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; - int inode_evicting = inode->i_state & I_FREEING; + int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -8143,7 +8143,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, */ wait_on_page_writeback(page); - tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; @@ -8153,8 +8152,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, lock_extent_bits(tree, page_start, page_end, &cached_state); again: start = page_start; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - page_end - start + 1); + ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); @@ -8175,7 +8173,7 @@ again: struct btrfs_ordered_inode_tree *tree; u64 new_len; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8184,8 +8182,8 @@ again: ordered->truncated_len = new_len; spin_unlock_irq(&tree->lock); - if (btrfs_dec_test_ordered_pending(BTRFS_I(inode), - &ordered, start, + if (btrfs_dec_test_ordered_pending(inode, &ordered, + start, end - start + 1, 1)) btrfs_finish_ordered_io(ordered); } @@ -8214,7 +8212,7 @@ again: * bit of its io_tree, and free the qgroup reserved data space. * Since the IO will never happen for this page. 
*/ - btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | From 3347c48f276754eaf8f41a6e0368ed558a9453be Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:44 +0300 Subject: [PATCH 079/161] btrfs: make btrfs_writepage_endio_finish_ordered btrfs_inode-centric Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 301bbc41e963..6201bbb4ca90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2773,19 +2773,19 @@ static void finish_ordered_fn(struct btrfs_work *work) void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(BTRFS_I(inode), &ordered_extent, - start, end - start + 1, uptodate)) + if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1, uptodate)) return; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; else wq = fs_info->endio_write_workers; From f1bbde8d5f278050e63b7330f6d6b230fe1ea398 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:45 +0300 Subject: [PATCH 080/161] btrfs: make get_extent_skip_holes take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 94ec2455de99..5206ae72594b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4552,10 +4552,10 @@ next: * helper function for fiemap, which doesn't want to see any holes. 
* This maps until we find something past 'last' */ -static struct extent_map *get_extent_skip_holes(struct inode *inode, +static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, u64 offset, u64 last) { - u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); + u64 sectorsize = btrfs_inode_sectorsize(inode); struct extent_map *em; u64 len; @@ -4567,7 +4567,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); + em = btrfs_get_extent_fiemap(inode, offset, len); if (IS_ERR_OR_NULL(em)) return em; @@ -4787,7 +4787,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); - em = get_extent_skip_holes(inode, start, last_for_get_extent); + em = get_extent_skip_holes(BTRFS_I(inode), start, last_for_get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -4876,7 +4876,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent); + em = get_extent_skip_holes(BTRFS_I(inode), off, + last_for_get_extent); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; From 3c5641a83ac4d2e8ef3d542b35794962e51d7135 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:46 +0300 Subject: [PATCH 081/161] btrfs: make btrfs_find_ordered_sum take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 4 ++-- fs/btrfs/ordered-data.c | 19 +++++++++---------- fs/btrfs/ordered-data.h | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7d5ec71615b8..8f4f2bd6d9b9 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; - count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - csum, nblocks); + count = btrfs_find_ordered_sum(BTRFS_I(inode), offset, + disk_bytenr, csum, nblocks); if (count) goto found; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d39a0fe4c463..c8a13d143877 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -861,20 +861,21 @@ out: * try to find a checksum. 
This is used because we allow pages to * be reclaimed before their checksum is actually put into the btree */ -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len) +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; - u32 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); + u32 sectorsize = btrfs_inode_sectorsize(inode); + const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits; const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset); + ordered = btrfs_lookup_ordered_extent(inode, offset); if (!ordered) return 0; @@ -882,10 +883,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr && disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { - i = (disk_bytenr - ordered_sum->bytenr) >> - inode->i_sb->s_blocksize_bits; - num_sectors = ordered_sum->len >> - inode->i_sb->s_blocksize_bits; + i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits; + num_sectors = ordered_sum->len >> blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); memcpy(sum + index, ordered_sum->sums + i * csum_size, num_sectors * csum_size); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 1610195c6097..6a1d5bf5aee3 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -185,8 +185,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( u64 len); void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len); +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, From 998acfe8ffc14812ad427081b156dc286901ed9a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:47 +0300 Subject: [PATCH 082/161] btrfs: make copy_inline_to_page take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reflink.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 7126f94cf216..2461be6ccb6f 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -45,19 +45,20 @@ out: return ret; } -static int copy_inline_to_page(struct inode *inode, +static int copy_inline_to_page(struct btrfs_inode *inode, const u64 file_offset, char *inline_data, const u64 size, const u64 datal, const u8 comp_type) { - const u64 block_size = btrfs_inode_sectorsize(BTRFS_I(inode)); + const u64 block_size = btrfs_inode_sectorsize(inode); const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - 
btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; struct page *page = NULL; + struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; ASSERT(IS_ALIGNED(file_offset, block_size)); @@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode, * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - file_offset, block_size); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, + block_size); if (ret) goto out; - page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(inode->i_mapping)); + page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, + btrfs_alloc_write_mask(mapping)); if (!page) { ret = -ENOMEM; goto out_unlock; } set_page_extent_mapped(page); - clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end, - 0, NULL); + ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -134,9 +134,9 @@ out_unlock: put_page(page); } if (ret) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - file_offset, block_size, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); + btrfs_delalloc_release_space(inode, data_reserved, file_offset, + block_size, true); + btrfs_delalloc_release_extents(inode, block_size); out: extent_changeset_free(data_reserved); @@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. */ ASSERT(key.offset > 0); - ret = copy_inline_to_page(dst, new_key->offset, + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; @@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -231,8 +231,8 @@ copy_inline_extent: * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. 
*/ - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } From 948dfeb86baef1bfa108e93acac5cee23d578e62 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:48 +0300 Subject: [PATCH 083/161] btrfs: make btrfs_zero_range_check_range_boundary take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 30c511d1e70b..af4eab9cbc51 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3105,15 +3105,15 @@ enum { RANGE_BOUNDARY_HOLE, }; -static int btrfs_zero_range_check_range_boundary(struct inode *inode, +static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { - const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); + const u64 sectorsize = btrfs_inode_sectorsize(inode); struct extent_map *em; int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -3228,7 +3228,8 @@ static int btrfs_zero_range(struct inode *inode, * to cover them. */ if (!IS_ALIGNED(offset, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, offset); + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), + offset); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { @@ -3244,7 +3245,7 @@ static int btrfs_zero_range(struct inode *inode, } if (!IS_ALIGNED(offset + len, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset + len); if (ret < 0) goto out; From facee0a09c15805202e37d12e0fecce1d28b4d5f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:49 +0300 Subject: [PATCH 084/161] btrfs: make extent_fiemap take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 28 +++++++++++++--------------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5206ae72594b..a1e070ec7ad8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4696,7 +4696,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; @@ -4707,12 +4707,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 last; u64 last_for_get_extent = 0; u64 disko = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; struct ulist *roots; struct ulist *tmp_ulist; @@ -4736,15 +4736,15 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto 
out_free_ulist; } - start = round_down(start, btrfs_inode_sectorsize(BTRFS_I(inode))); - len = round_up(max, btrfs_inode_sectorsize(BTRFS_I(inode))) - start; + start = round_down(start, btrfs_inode_sectorsize(inode)); + len = round_up(max, btrfs_inode_sectorsize(inode)) - start; /* * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), -1, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); if (ret < 0) { goto out_free_ulist; } else { @@ -4758,7 +4758,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, found_type = found_key.type; /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (found_key.objectid != btrfs_ino(inode) || found_type != BTRFS_EXTENT_DATA_KEY) { /* have to trust i_size as the end */ last = (u64)-1; @@ -4784,10 +4784,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, + lock_extent_bits(&inode->io_tree, start, start + len - 1, &cached_state); - em = get_extent_skip_holes(BTRFS_I(inode), start, last_for_get_extent); + em = get_extent_skip_holes(inode, start, last_for_get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -4853,8 +4853,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * then we're just getting a count and we can skip the * lookup stuff. */ - ret = btrfs_check_shared(root, - btrfs_ino(BTRFS_I(inode)), + ret = btrfs_check_shared(root, btrfs_ino(inode), bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; @@ -4876,8 +4875,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } /* now scan forward to see if this is really the last extent. 
*/ - em = get_extent_skip_holes(BTRFS_I(inode), off, - last_for_get_extent); + em = get_extent_skip_holes(inode, off, last_for_get_extent); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -4899,7 +4897,7 @@ out_free: ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, + unlock_extent_cached(&inode->io_tree, start, start + len - 1, &cached_state); out_free_ulist: diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9e1e22f1586a..06611947a9f7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -203,7 +203,7 @@ int extent_writepages(struct address_space *mapping, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void extent_readahead(struct readahead_control *rac); -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6201bbb4ca90..a66145aac710 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8035,7 +8035,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len); + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) From ca10845a56856fff4de3804c85e6424d0f6d0cde Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Sep 2020 08:09:01 -0400 Subject: [PATCH 085/161] btrfs: sysfs: init devices outside of the chunk_mutex While running btrfs/061, btrfs/073, btrfs/078, or btrfs/178 we hit the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.9.0-rc3+ #4 Not tainted ------------------------------------------------------ kswapd0/100 is trying to acquire lock: ffff96ecc22ef4a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 but task is already holding lock: ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #3 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x65/0x80 slab_pre_alloc_hook.constprop.0+0x20/0x200 kmem_cache_alloc+0x37/0x270 alloc_inode+0x82/0xb0 iget_locked+0x10d/0x2c0 kernfs_get_inode+0x1b/0x130 kernfs_get_tree+0x136/0x240 sysfs_get_tree+0x16/0x40 vfs_get_tree+0x28/0xc0 path_mount+0x434/0xc00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (kernfs_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 kernfs_add_one+0x23/0x150 kernfs_create_link+0x63/0xa0 sysfs_do_create_link_sd+0x5e/0xd0 btrfs_sysfs_add_devices_dir+0x81/0x130 btrfs_init_new_device+0x67f/0x1250 btrfs_ioctl+0x1ef/0x2e20 __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 btrfs_chunk_alloc+0x125/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_insert_empty_items+0x64/0xb0 btrfs_insert_delayed_items+0x90/0x4f0 btrfs_commit_inode_delayed_items+0x93/0x140 btrfs_log_inode+0x5de/0x2020 btrfs_log_inode_parent+0x429/0xc90 btrfs_log_new_name+0x95/0x9b btrfs_rename2+0xbb9/0x1800 vfs_rename+0x64f/0x9f0 do_renameat2+0x320/0x4e0 __x64_sys_rename+0x1f/0x30 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 __mutex_lock+0x7e/0x7e0 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 kthread+0x138/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> kernfs_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(kernfs_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/100: #0: ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffff8dd65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 #2: ffff96ed2ade30e0 (&type->s_umount_key#36){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 stack backtrace: CPU: 0 PID: 100 Comm: kswapd0 Not tainted 5.9.0-rc3+ #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x8b/0xb8 check_noncircular+0x12d/0x150 __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 __mutex_lock+0x7e/0x7e0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? lock_acquire+0xa7/0x3d0 ? find_held_lock+0x2b/0x80 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 ? _raw_spin_unlock_irqrestore+0x41/0x50 ? add_wait_queue_exclusive+0x70/0x70 ? balance_pgdat+0x670/0x670 kthread+0x138/0x160 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 This happens because we are holding the chunk_mutex at the time of adding in a new device. 
However, we only need to hold the device_list_mutex, as we're going to iterate over the fs_devices devices. Move the sysfs init stuff outside of the chunk_mutex to get rid of this lockdep splat. CC: stable@vger.kernel.org # 4.4.x: f3cd2c58110dad14e: btrfs: sysfs, rename device_link add/remove functions CC: stable@vger.kernel.org # 4.4.x Reported-by: David Sterba Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9d169cba8514..c86ffad04641 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2597,9 +2597,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices + 1); - /* add sysfs device entry */ - btrfs_sysfs_add_devices_dir(fs_devices, device); - /* * we've got more storage, clear any full flags on the space * infos */ btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); + + /* Add sysfs device entry */ + btrfs_sysfs_add_devices_dir(fs_devices, device); + mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { From c3e1f96c37d0f863ac6ddcafac6921fd46f1aa37 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 25 Aug 2020 10:02:32 -0500 Subject: [PATCH 086/161] btrfs: enumerate the type of exclusive operation in progress Instead of using a flag bit for exclusive operation, use a variable to store which exclusive operation is being performed. Introduce an API to start and finish an exclusive operation. This also gives tools a way to check which operation is running, or why starting a new exclusive operation failed. The followup patch adds a sysfs_notify() to alert userspace when the state changes, so userspace can perform select() on it to get notified of the change. This would enable us to enqueue a command which will wait for the current exclusive operation to complete before issuing the next exclusive operation. This is done synchronously, as opposed to in a background process, as otherwise error collection (if any) would become difficult. Reviewed-by: Nikolay Borisov Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba [ update comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 24 ++++++++++++++++++----- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/inode.c | 14 +++++++------- fs/btrfs/ioctl.c | 44 ++++++++++++++++++++++++++---------------- fs/btrfs/volumes.c | 20 +++++++++---------- 5 files changed, 65 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af2bd059daae..98c5f6178efc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -541,11 +541,6 @@ enum { BTRFS_FS_QUOTA_OVERRIDE, /* Used to record internally whether fs has been frozen */ BTRFS_FS_FROZEN, - /* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ - BTRFS_FS_EXCL_OP, /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized.
@@ -566,6 +561,19 @@ enum { BTRFS_FS_DISCARD_RUNNING, }; +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -937,6 +945,9 @@ struct btrfs_fs_info { */ int send_in_progress; + /* Type of exclusive operation running */ + unsigned long exclusive_operation; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -3032,6 +3043,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 61d800a149b0..14901ca2508d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -1025,7 +1025,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * should never allow both to start and pause. We don't want to allow * dev-replace to start anyway. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; @@ -1062,7 +1062,7 @@ static int btrfs_dev_replace_kthread(void *data) ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret && ret != -ECANCELED); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a66145aac710..cce6f8789a4e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10015,14 +10015,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, /* * Balance or device remove/replace/resize can move stuff around from - * under us. The EXCL_OP flag makes sure they aren't running/won't run - * concurrently while we are mapping the swap extents, and - * fs_info->swapfile_pins prevents them from running while the swap file - * is active and moving the extents. Note that this also prevents a - * concurrent device add which isn't actually necessary, but it's not + * under us. The exclop protection makes sure they aren't running/won't + * run concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap + * file is active and moving the extents. Note that this also prevents + * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. 
*/ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; @@ -10168,7 +10168,7 @@ out: if (ret) btrfs_swap_deactivate(file); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (ret) return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b91444e810a5..f30231596fe0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -378,6 +378,17 @@ static int check_xflags(unsigned int flags) return 0; } +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); +} + /* * Set the xflags from the internal inode flags. The remaining items of fsxattr * are zeroed. @@ -1639,7 +1650,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1753,7 +1764,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); mnt_drop_write_file(file); return ret; } @@ -3127,7 +3138,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; vol_args = memdup_user(arg, sizeof(*vol_args)); @@ -3144,7 +3155,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -3173,7 +3184,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -3184,7 +3195,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -3215,7 +3226,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -3233,7 +3244,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3737,11 +3748,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, 
BTRFS_EXCLOP_DEV_REPLACE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -3952,7 +3963,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -3998,7 +4009,6 @@ again: } locked: - BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4053,10 +4063,10 @@ locked: do_balance: /* - * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to - * btrfs_balance. bctl is freed in reset_balance_state, or, if - * restriper was paused all the way until unmount, in free_fs_info. - * The flag should be cleared after reset_balance_state. + * Ownership of bctl and exclusive operation goes to btrfs_balance. + * bctl is freed in reset_balance_state, or, if restriper was paused + * all the way until unmount, in free_fs_info. The flag should be + * cleared after reset_balance_state. */ need_unlock = false; @@ -4075,7 +4085,7 @@ out_bargs: out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c86ffad04641..cbebb5d06773 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -291,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * balance_mutex * * - * Exclusive operations, BTRFS_FS_EXCL_OP - * ====================================== + * Exclusive operations + * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. @@ -318,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * The status of exclusive operation is set and cleared atomically. + * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * The exclusive status is cleared when the device operation is canceled or * completed. 
*/ @@ -4033,7 +4033,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, /* * rw_devices will not change at the moment, device add/delete/replace - * are excluded by EXCL_OP + * are exclusive */ num_devices = fs_info->fs_devices->rw_devices; @@ -4169,7 +4169,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); @@ -4180,7 +4180,7 @@ out: reset_balance_state(fs_info); else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -4282,7 +4282,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4364,7 +4364,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) if (fs_info->balance_ctl) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } From 66a2823c54367b4d366cfa5dc12ef99ef560f428 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 25 Aug 2020 10:02:33 -0500 Subject: [PATCH 087/161] btrfs: sysfs: export currently running exclusive operation /sys/fs/btrfs/<UUID>/exclusive_operation contains the currently executing exclusive operation. Add a sysfs_notify() when the operation ends, so userspace can be notified that the exclusive operation has finished. Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 1 + fs/btrfs/sysfs.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f30231596fe0..8091324341a5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -387,6 +387,7 @@ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) { WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } /* diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 438a367371c4..eb8f23578c16 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -809,6 +809,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, BTRFS_ATTR(, checksum, btrfs_checksum_show); +static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + const char *str; + + switch (READ_ONCE(fs_info->exclusive_operation)) { + case BTRFS_EXCLOP_NONE: + str = "none\n"; + break; + case BTRFS_EXCLOP_BALANCE: + str = "balance\n"; + break; + case BTRFS_EXCLOP_DEV_ADD: + str = "device add\n"; + break; + case BTRFS_EXCLOP_DEV_REMOVE: + str = "device remove\n"; + break; + case BTRFS_EXCLOP_DEV_REPLACE: + str = "device replace\n"; + break; + case BTRFS_EXCLOP_RESIZE: + str = "resize\n"; + break; + case BTRFS_EXCLOP_SWAP_ACTIVATE: + str = "swap activate\n"; + break; + default: + str = "UNKNOWN\n"; + break; + } + return scnprintf(buf, PAGE_SIZE, "%s", str); +} +BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); + static const struct attribute
*btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -817,6 +853,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), + BTRFS_ATTR_PTR(, exclusive_operation), NULL, }; From 457f1864b569c4a5d49046fb9a88ab82e108cdfd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Sep 2020 14:29:51 -0400 Subject: [PATCH 088/161] btrfs: pretty print leaked root name I'm an actual human being so am incapable of converting u64 to s64 in my head, so add a helper to get the pretty name of a root objectid and use that helper to spit out the name for any special roots for leaked roots, so I don't have to scratch my head and figure out which root I messed up the refs for. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ++++-- fs/btrfs/print-tree.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/btrfs/print-tree.h | 4 ++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ed887242f348..f44f3d672a1f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1503,10 +1503,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; while (!list_empty(&fs_info->allocated_roots)) { + char buf[BTRFS_ROOT_NAME_BUF_LEN]; + root = list_first_entry(&fs_info->allocated_roots, struct btrfs_root, leak_list); - btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", - root->root_key.objectid, root->root_key.offset, + btrfs_err(fs_info, "leaked root %s refcount %d", + btrfs_root_name(root->root_key.objectid, buf), refcount_read(&root->refs)); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 80567c11ec12..7695c4783d33 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -7,6 +7,44 @@ #include "disk-io.h" #include "print-tree.h" +struct root_name_map { + u64 id; + char name[16]; +}; + +static const struct root_name_map root_map[] = { + { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, + { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, + { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, + { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, + { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, + { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, + { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, + { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, + { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, + { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, +}; + +const char *btrfs_root_name(u64 objectid, char *buf) +{ + int i; + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) { + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, + "TREE_RELOC offset=%llu", objectid); + return buf; + } + + for (i = 0; i < ARRAY_SIZE(root_map); i++) { + if (root_map[i].id == objectid) + return root_map[i].name; + } + + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid); + return buf; +} + static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index e6bb38fd75ad..78b99385a503 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,7 +6,11 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +/* Buffer size to contain tree name and possibly additional data (offset) */ +#define BTRFS_ROOT_NAME_BUF_LEN 48 + void btrfs_print_leaf(struct extent_buffer *l); void
btrfs_print_tree(struct extent_buffer *c, bool follow); +const char *btrfs_root_name(u64 objectid, char *buf); #endif From 79dae17d8d44b2d15779e332180080af45df5352 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 3 Sep 2020 21:30:12 +0800 Subject: [PATCH 089/161] btrfs: improve device scanning messages Systems booting without the initramfs seem to scan an unusual kind of device path (/dev/root). And at a later time, the device is updated to the correct path. We generally print the process name and PID of the process scanning the device but we don't capture the same information if the device path is rescanned with a different pathname. The current message is too long, so drop the unnecessary UUID and add the process name and PID. While at it, also update the duplicate device warning to include the process name and PID so the messages are consistent. CC: stable@vger.kernel.org # 4.19+ Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=89721 Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cbebb5d06773..13b394d401f9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -941,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path, bdput(path_bdev); mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(device->fs_info, - "duplicate device fsid:devid for %pU:%llu old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "duplicate device %s devid %llu generation %llu scanned by %s (%d)", + path, devid, found_transid, + current->comm, + task_pid_nr(current)); return ERR_PTR(-EEXIST); } bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, - "device fsid %pU devid %llu moved old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "devid %llu device path %s changed to %s scanned by %s (%d)", + devid, rcu_str_deref(device->name), + path, current->comm, + task_pid_nr(current)); } name = rcu_string_strdup(path, GFP_NOFS); From c6a5d954950c5031444173ad2195efc163afcac9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:22 +0800 Subject: [PATCH 090/161] btrfs: fix replace of seed device If you replace a seed device in a sprouted fs, it appears to have successfully replaced the seed device, but if you look closely, it didn't. Here is an example. $ mkfs.btrfs /dev/sda $ btrfstune -S1 /dev/sda $ mount /dev/sda /btrfs $ btrfs device add /dev/sdb /btrfs $ umount /btrfs $ btrfs device scan --forget $ mount -o device=/dev/sda /dev/sdb /btrfs $ btrfs replace start -f /dev/sda /dev/sdc /btrfs $ echo $? 0 BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc started BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc finished $ btrfs fi show Label: none uuid: ab2c88b7-be81-4a7e-9849-c3666e7f9f4f Total devices 2 FS bytes used 256.00KiB devid 1 size 3.00GiB used 520.00MiB path /dev/sdc devid 2 size 3.00GiB used 896.00MiB path /dev/sdb Label: none uuid: 10bd3202-0415-43af-96a8-d5409f310a7e Total devices 1 FS bytes used 128.00KiB devid 1 size 3.00GiB used 536.00MiB path /dev/sda So as per the replace start command and the kernel log, the replace was successful. Now let's try a clean mount.
$ umount /btrfs $ btrfs device scan --forget $ mount -o device=/dev/sdc /dev/sdb /btrfs mount: /btrfs: wrong fs type, bad option, bad superblock on /dev/sdb, missing codepage or helper program, or other error. [ 636.157517] BTRFS error (device sdc): failed to read chunk tree: -2 [ 636.180177] BTRFS error (device sdc): open_ctree failed That's because, per the dev items, it is still looking for the original seed device. $ btrfs inspect-internal dump-tree -d /dev/sdb item 0 key (DEV_ITEMS DEV_ITEM 1) itemoff 16185 itemsize 98 devid 1 total_bytes 3221225472 bytes_used 545259520 io_align 4096 io_width 4096 sector_size 4096 type 0 generation 6 start_offset 0 dev_group 0 seek_speed 0 bandwidth 0 uuid 59368f50-9af2-4b17-91da-8a783cc418d4 <--- seed uuid fsid 10bd3202-0415-43af-96a8-d5409f310a7e <--- seed fsid item 1 key (DEV_ITEMS DEV_ITEM 2) itemoff 16087 itemsize 98 devid 2 total_bytes 3221225472 bytes_used 939524096 io_align 4096 io_width 4096 sector_size 4096 type 0 generation 0 start_offset 0 dev_group 0 seek_speed 0 bandwidth 0 uuid 56a0a6bc-4630-4998-8daf-3c3030c4256a <- sprout uuid fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f <- sprout fsid But the replaced target has the following uuid+fsid in its superblock, which doesn't match the expected uuid+fsid in its dev item. $ btrfs in dump-super /dev/sdc | egrep '^generation|dev_item.uuid|dev_item.fsid|devid' generation 20 dev_item.uuid 59368f50-9af2-4b17-91da-8a783cc418d4 dev_item.fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f [match] dev_item.devid 1 So if you provide the original seed device, the mount succeeds, which is what has been happening all along in the test case btrfs/163. $ btrfs device scan --forget $ mount -o device=/dev/sda /dev/sdb /btrfs Fix in this patch: if a seed device has not been sprouted, there is no replacing it, because it is a read-only filesystem on a read-only device. Similarly, in the case of a sprouted filesystem, the seed device is still read-only. So make it explicit that a seed device cannot be replaced; you can only add a new device and then delete the seed device. If a replace is attempted, return -EINVAL. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 14901ca2508d..bbb3d5ec9ca8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -230,7 +230,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) { + if (srcdev->fs_devices->seeding) { btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; } From 178a16c94041fe5ebf86c5620094e185cec27dd2 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:23 +0800 Subject: [PATCH 091/161] btrfs: add btrfs_sysfs_add_device helper btrfs_sysfs_add_devices_dir() adds device link and devid kobject (sysfs entries) for a device or all the devices in the btrfs_fs_devices. In preparation to add these sysfs entries for the seed as well, add a btrfs_sysfs_add_device() helper function and avoid code duplication.
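As an aside on the exclusive operation API introduced in patch 086 above: btrfs_exclop_start() succeeds only if it can switch fs_info->exclusive_operation from BTRFS_EXCLOP_NONE to the requested type, a single compare-and-swap. A minimal standalone model of those semantics, using C11 atomics in place of the kernel's cmpxchg() (illustrative userspace code with made-up names, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum exclop { EXCLOP_NONE, EXCLOP_BALANCE, EXCLOP_DEV_ADD };

static _Atomic int exclusive_operation = EXCLOP_NONE;

/* Models !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type) */
static bool exclop_start(int type)
{
	int expected = EXCLOP_NONE;

	return atomic_compare_exchange_strong(&exclusive_operation,
					      &expected, type);
}

/* Models WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE) */
static void exclop_finish(void)
{
	atomic_store(&exclusive_operation, EXCLOP_NONE);
}

int main(void)
{
	printf("balance start: %d\n", exclop_start(EXCLOP_BALANCE)); /* 1, acquired */
	printf("dev add start: %d\n", exclop_start(EXCLOP_DEV_ADD)); /* 0, busy */
	exclop_finish();
	printf("dev add retry: %d\n", exclop_start(EXCLOP_DEV_ADD)); /* 1, acquired */
	return 0;
}

The key property is that only the transition away from NONE can succeed, so two exclusive operations can never run concurrently, and finish can be a plain store because only the current owner ever calls it.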
Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 82 +++++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index eb8f23578c16..8ffe5df99b3c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1312,44 +1312,70 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +static int btrfs_sysfs_add_device(struct btrfs_device *device) { - int error = 0; - struct btrfs_device *dev; + int ret; unsigned int nofs_flag; + struct kobject *devices_kobj; + struct kobject *devinfo_kobj; + + /* + * Make sure we use the fs_info::fs_devices to fetch the kobjects even + * for the seed fs_devices + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; + ASSERT(devices_kobj); + ASSERT(devinfo_kobj); nofs_flag = memalloc_nofs_save(); - list_for_each_entry(dev, &fs_devices->devices, dev_list) { - if (one_device && one_device != dev) - continue; + if (device->bdev) { + struct hd_struct *disk; + struct kobject *disk_kobj; - if (dev->bdev) { - struct hd_struct *disk; - struct kobject *disk_kobj; + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; - disk = dev->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - - error = sysfs_create_link(fs_devices->devices_kobj, - disk_kobj, disk_kobj->name); - if (error) - break; - } - - init_completion(&dev->kobj_unregister); - error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devinfo_kobj, "%llu", - dev->devid); - if (error) { - kobject_put(&dev->devid_kobj); - break; + ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); + if (ret) { + btrfs_warn(device->fs_info, + "creating sysfs device link for devid %llu failed: %d", + device->devid, ret); + goto out; } } - memalloc_nofs_restore(nofs_flag); - return error; + init_completion(&device->kobj_unregister); + ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, + devinfo_kobj, "%llu", device->devid); + if (ret) { + kobject_put(&device->devid_kobj); + btrfs_warn(device->fs_info, + "devinfo init for devid %llu failed: %d", + device->devid, ret); + } + +out: + memalloc_nofs_restore(nofs_flag); + return ret; +} + +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device) +{ + int ret; + + if (device) + return btrfs_sysfs_add_device(device); + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + return ret; + } + + return 0; } void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) From 985e233e96e56b9980d5f632fc306bfa6e03f70f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:24 +0800 Subject: [PATCH 092/161] btrfs: add btrfs_sysfs_remove_device helper btrfs_sysfs_remove_devices_dir() removes device link and devid kobject (sysfs entries) for a device or all the devices in the btrfs_fs_devices. In preparation to remove these sysfs entries for the seed as well, add a btrfs_sysfs_remove_device() helper function and avoid code duplication. 
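Similarly, for the exclusive_operation attribute exported in patch 087 above, userspace is expected to wait for changes with select()/poll(). A minimal sketch of such a watcher, assuming the usual sysfs poll convention (sysfs_notify() wakes pollers with POLLPRI, and the attribute must be re-read from offset 0 after each wakeup), with <UUID> standing in for a real filesystem UUID:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Replace <UUID> with the UUID of the mounted filesystem. */
	const char *path = "/sys/fs/btrfs/<UUID>/exclusive_operation";
	char buf[64];
	ssize_t n;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* An initial read is needed before poll() reports a change. */
	n = read(fd, buf, sizeof(buf) - 1);

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };

		if (poll(&pfd, 1, -1) < 0) {
			perror("poll");
			return 1;
		}
		/* Seek back and re-read the current value after a wakeup. */
		lseek(fd, 0, SEEK_SET);
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("exclusive operation: %s", buf);
		}
	}
}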
Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 61 +++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8ffe5df99b3c..a3bd4ba906cf 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1186,50 +1186,43 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, return 0; } -/* when one_device is NULL, it removes all device links */ - -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +static void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct hd_struct *disk; struct kobject *disk_kobj; + struct kobject *devices_kobj; - if (!fs_devices->devices_kobj) - return -EINVAL; + /* + * Seed fs_devices devices_kobj aren't used, fetch kobject from the + * fs_info::fs_devices. + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + ASSERT(devices_kobj); - if (one_device) { - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } + if (device->bdev) { + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(devices_kobj, disk_kobj->name); + } - if (one_device->devid_kobj.state_initialized) { - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); - } + if (device->devid_kobj.state_initialized) { + kobject_del(&device->devid_kobj); + kobject_put(&device->devid_kobj); + wait_for_completion(&device->kobj_unregister); + } +} +/* When @device is NULL, remove all devices link */ +int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device) +{ + if (device) { + btrfs_sysfs_remove_device(device); return 0; } - list_for_each_entry(one_device, &fs_devices->devices, dev_list) { - - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - if (one_device->devid_kobj.state_initialized) { - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); - } - } + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); return 0; } From 6a416a018f1a9372a43465c366e91d202da01e11 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:25 +0800 Subject: [PATCH 093/161] btrfs: make btrfs_sysfs_remove_devices_dir return void The return value of btrfs_sysfs_remove_devices_dir() is unused, so declare it as void.
Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 8 +++----- fs/btrfs/sysfs.h | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a3bd4ba906cf..66fbb82b0bd5 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1213,18 +1213,16 @@ static void btrfs_sysfs_remove_device(struct btrfs_device *device) } /* When @device is NULL, remove all devices link */ -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device) +void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device) { if (device) { btrfs_sysfs_remove_device(device); - return 0; + return; } list_for_each_entry(device, &fs_devices->devices, dev_list) btrfs_sysfs_remove_device(device); - - return 0; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 4217823e255c..1431dbc3c2ef 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -16,8 +16,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char *btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); +void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); From cd36da2e7ec67f0da9c1efc8a87ad31bd9242ec3 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:26 +0800 Subject: [PATCH 094/161] btrfs: simplify parameters of btrfs_sysfs_add_devices_dir When we add a device we need to add it to sysfs, so instead of using the btrfs_sysfs_add_devices_dir() fs_devices argument to specify whether to add a device or all of fs_devices, call the helper function btrfs_sysfs_add_device() directly and thus make it non-static.
Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/sysfs.c | 11 ++++------- fs/btrfs/sysfs.h | 3 +-- fs/btrfs/volumes.c | 2 +- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index bbb3d5ec9ca8..c176375c2c02 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -512,7 +512,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_device(tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 66fbb82b0bd5..bc341560dc69 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1303,7 +1303,7 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -static int btrfs_sysfs_add_device(struct btrfs_device *device) +int btrfs_sysfs_add_device(struct btrfs_device *device) { int ret; unsigned int nofs_flag; @@ -1352,13 +1352,10 @@ out: return ret; } -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device) +static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) { int ret; - - if (device) - return btrfs_sysfs_add_device(device); + struct btrfs_device *device; list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); @@ -1456,7 +1453,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); + error = btrfs_sysfs_add_fs_devices(fs_devs); if (error) return error; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 1431dbc3c2ef..6296c4f6b330 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -14,8 +14,7 @@ enum btrfs_feature_set { char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char *btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); +int btrfs_sysfs_add_device(struct btrfs_device *device); void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 13b394d401f9..2aa7fd89efbf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2608,7 +2608,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); /* Add sysfs device entry */ - btrfs_sysfs_add_devices_dir(fs_devices, device); + btrfs_sysfs_add_device(device); mutex_unlock(&fs_devices->device_list_mutex); From 53f8a74cbeffd45a95de5dc6d16584aadb682a31 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:27 +0800 Subject: [PATCH 095/161] btrfs: split and refactor btrfs_sysfs_remove_devices_dir Similar to btrfs_sysfs_add_devices_dir()'s refactoring, split btrfs_sysfs_remove_devices_dir() so that we don't have to use the device argument to indicate whether to free all devices or just one device. 
Export btrfs_sysfs_remove_device() as device operations outside of sysfs.c now call this instead of btrfs_sysfs_remove_devices_dir(). btrfs_sysfs_remove_devices_dir() is renamed to btrfs_sysfs_remove_fs_devices() to suit its new role. Now, no one outside of sysfs.c calls btrfs_sysfs_remove_fs_devices() so it is redeclared as static. And the same function had to be moved before its first caller. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/sysfs.c | 27 +++++++++++---------------- fs/btrfs/sysfs.h | 3 +-- fs/btrfs/volumes.c | 8 ++++---- 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index c176375c2c02..db4d4a53ab73 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -781,7 +781,7 @@ error: mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) btrfs_scratch_superblocks(fs_info, src_device->bdev, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index bc341560dc69..7aed0712f183 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -972,6 +972,14 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } +static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); +} + void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; @@ -999,7 +1007,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -1186,7 +1194,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, return 0; } -static void btrfs_sysfs_remove_device(struct btrfs_device *device) +void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -1212,19 +1220,6 @@ static void btrfs_sysfs_remove_device(struct btrfs_device *device) } } -/* When @device is NULL, remove all devices link */ -void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device) -{ - if (device) { - btrfs_sysfs_remove_device(device); - return; - } - - list_for_each_entry(device, &fs_devices->devices, dev_list) - btrfs_sysfs_remove_device(device); -} - static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -1459,7 +1454,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_remove_devices_dir(fs_devs, NULL); + btrfs_sysfs_remove_fs_devices(fs_devs); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 6296c4f6b330..bacef43f7267 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -15,8 +15,7 @@ enum btrfs_feature_set { char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char
*btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_device(struct btrfs_device *device); -void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device); +void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2aa7fd89efbf..79fe9822196f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2041,7 +2041,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, } int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2145,7 +2145,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2246,7 +2246,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); + btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2682,7 +2682,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); From 30b0e4e0e3f53944755257c7d02792ee7f9c072e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:28 +0800 Subject: [PATCH 096/161] btrfs: initialize sysfs devid and device link for seed device We don't initialize the sysfs devid kobject and device-link yet for the seed devices in a sprouted filesystem. So this patch initializes the seed device devid kobject and the device link in sysfs.
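This patch and the next one revolve around one traversal: visit the devices of the mounted fs_devices first, then the devices of every seed fs_devices hanging off seed_list. A simplified standalone sketch of that shape (illustrative types and names, not the kernel's list API):

#include <stdio.h>

struct device { const char *name; struct device *next; };
struct fs_devices {
	struct device *devices;       /* own device list */
	struct fs_devices *next_seed; /* stands in for seed_list */
};

static int add_one(struct device *dev)
{
	printf("add sysfs entry for %s\n", dev->name);
	return 0; /* a real helper could fail and require unwinding */
}

static int add_all(struct fs_devices *fs_devices)
{
	struct fs_devices *seed;
	struct device *dev;

	/* Own devices first... */
	for (dev = fs_devices->devices; dev; dev = dev->next)
		if (add_one(dev))
			return -1;

	/* ...then every device of every seed fs_devices. */
	for (seed = fs_devices->next_seed; seed; seed = seed->next_seed)
		for (dev = seed->devices; dev; dev = dev->next)
			if (add_one(dev))
				return -1;
	return 0;
}

int main(void)
{
	struct device sdb = { "sdb", NULL };
	struct device sda = { "sda", NULL };   /* seed device */
	struct fs_devices seed = { &sda, NULL };
	struct fs_devices sprout = { &sdb, &seed };

	return add_all(&sprout);
}

Patch 097 below extends exactly this loop with an unwind path, so a failure midway does not leave half the entries registered.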
Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7aed0712f183..f66632fefe0a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -975,9 +975,15 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; + struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) btrfs_sysfs_remove_device(device); + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) + btrfs_sysfs_remove_device(device); + } } void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) @@ -1351,6 +1357,7 @@ static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) { int ret; struct btrfs_device *device; + struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); @@ -1358,6 +1365,14 @@ static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) return ret; } + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + return ret; + } + } + return 0; } From 7ad3912a70a696197d03856749f89dad7a377bc9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:29 +0800 Subject: [PATCH 097/161] btrfs: handle errors in btrfs_sysfs_add_fs_devices btrfs_sysfs_add_fs_devices() is called by btrfs_sysfs_add_mounted(). btrfs_sysfs_add_mounted() assumes that btrfs_sysfs_add_fs_devices() will either add sysfs entries for all the devices or none. So this patch lives up to its caller's expectation and cleans up the created sysfs entries if it has to fail at some device in the list. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index f66632fefe0a..c71db7acb89d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1362,18 +1362,22 @@ static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) - return ret; + goto fail; } list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) - return ret; + goto fail; } } return 0; + +fail: + btrfs_sysfs_remove_fs_devices(fs_devices); + return ret; } void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) From 2fca0db07608a303dbe9db802ecca678e358dca9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:30 +0800 Subject: [PATCH 098/161] btrfs: reada: lock all seed/sprout devices in __reada_start_machine On an fs mounted using a sprout device, the seed fs_devices are maintained in a linked list under fs_info->fs_devices. Each seed's fs_devices also has its device_list_mutex initialized to protect against the potential race with delete threads. But the delete thread (at btrfs_rm_device()) holds fs_info::fs_devices::device_list_mutex, which is the sprout's device_list_mutex, not the seed's.
Moreover, there aren't any significant benefits in using the seed::device_list_mutex instead of the sprout::device_list_mutex. So this patch converts them from using the seed::device_list_mutex to the sprout::device_list_mutex. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/reada.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index e20972230823..9d4f5316a7e8 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -776,13 +776,11 @@ static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) do { enqueued = 0; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(device); } - mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); @@ -795,10 +793,13 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) int i; u64 enqueued = 0; + mutex_lock(&fs_devices->device_list_mutex); + enqueued += reada_start_for_fsdevs(fs_devices); list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) enqueued += reada_start_for_fsdevs(seed_devs); + mutex_unlock(&fs_devices->device_list_mutex); if (enqueued == 0) return; From e17125b52b7ec1075fae408e1a1da8005116e34a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:31 +0800 Subject: [PATCH 099/161] btrfs: use sprout device_list_mutex in btrfs_init_devices_late On a mounted sprout filesystem, all threads are now using the sprout::device_list_mutex, and this is the only code using the seed::device_list_mutex. This patch converts it to use the sprout's fs_info->fs_devices->device_list_mutex. The same reasoning holds true here: device delete is holding the sprout::device_list_mutex. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 79fe9822196f..beacc779dbc2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7195,16 +7195,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; - mutex_unlock(&fs_devices->device_list_mutex); list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - mutex_lock(&seed_devs->device_list_mutex); list_for_each_entry(device, &seed_devs->devices, dev_list) device->fs_info = fs_info; - mutex_unlock(&seed_devs->device_list_mutex); seed_devs->fs_info = fs_info; } + mutex_unlock(&fs_devices->device_list_mutex); } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, From 1888709d71806c0bdf008ea471ce9569b8efa79b Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:32 +0800 Subject: [PATCH 100/161] btrfs: remove tmp variable for list traversal in btrfs_init_dev_replace_tgtdev In the function btrfs_init_dev_replace_tgtdev(), the local variable devices is used only once, so we can remove it.
Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index db4d4a53ab73..500e45f7aa3f 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -224,7 +224,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *device; struct block_device *bdev; - struct list_head *devices; struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -244,8 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); From e493e8f9bcb5f6cf00f94d30eddc43c5be731ff4 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:34 +0800 Subject: [PATCH 101/161] btrfs: remove unnecessary tmp variable in btrfs_assign_next_active_device() We can check the argument value directly, no need for the temporary variable. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index beacc779dbc2..78b67cc6cec1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1962,16 +1962,13 @@ static struct btrfs_device * btrfs_find_next_active_device( * this_dev) which is active. */ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, - struct btrfs_device *this_dev) + struct btrfs_device *next_device) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_device *next_device; - if (this_dev) - next_device = this_dev; - else + if (!next_device) next_device = btrfs_find_next_active_device(fs_info->fs_devices, - device); + device); ASSERT(next_device); if (fs_info->sb->s_bdev && From c83b60c0e4d27363e284123afb50ff34f0ea1f1a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:35 +0800 Subject: [PATCH 102/161] btrfs: simplify gotos in open_seed_device The function does not have a common exit block and returns immediately, so there's no point in having the goto. Remove the two cases.
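For context, the goto being removed is the kernel's common single-exit-label idiom, which pays off only when there is shared cleanup to run; when every path returns immediately, direct returns read straighter. A schematic before/after (illustrative standalone code, not the actual open_seed_devices()):

#include <stdio.h>

/* Before: the goto guards no shared cleanup, it only adds indirection. */
static const char *open_goto(int fail)
{
	const char *ret = "opened";

	if (fail) {
		ret = NULL;
		goto out;
	}
out:
	return ret;
}

/* After: returning directly reads straighter when nothing must unwind. */
static const char *open_direct(int fail)
{
	if (fail)
		return NULL;
	return "opened";
}

int main(void)
{
	printf("%s %s\n", open_goto(0), open_direct(0));
	return 0;
}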
Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 78b67cc6cec1..6c7c8819cb31 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6753,19 +6753,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); - fs_devices = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); } if (!fs_devices->seeding) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); - fs_devices = ERR_PTR(-EINVAL); - goto out; + return ERR_PTR(-EINVAL); } list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); -out: + return fs_devices; } From 0725c0c9351d9854a61bc68f0a59d9f91d75abbd Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:36 +0800 Subject: [PATCH 103/161] btrfs: move btrfs_dev_replace_update_device_in_mapping_tree to drop declaration The function is short and simple, so we can get rid of the declaration, as it's not necessary for a static function. Move it before its first caller. No functional changes. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 56 ++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 500e45f7aa3f..4a0243cb9d97 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -64,10 +64,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev); static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) @@ -628,6 +624,32 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, return ret; } +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = em->map_lookup; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -797,32 +819,6 @@ error: return 0; } -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - u64 start = 0; - int i; - - write_lock(&em_tree->lock); - do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) - break; - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) - if (srcdev == map->stripes[i].dev) - map->stripes[i].dev = tgtdev; - start = em->start + em->len; - free_extent_map(em); - } while (start); -
write_unlock(&em_tree->lock); -} - /* * Read progress of device replace status according to the state and last * stored position. The value format is the same as for From a31a5876fae21c602f14686f1e8985f98b153d2b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 9 Sep 2020 21:51:42 +0800 Subject: [PATCH 104/161] btrfs: remove unused function calc_global_rsv_need_space() It is not used since commit 0096420adb03 ("btrfs: do not account global reserve in can_overcommit"). Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: YueHaibing Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index b733718f45d3..0f16a2ce5401 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -301,11 +301,6 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, return NULL; } -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) From 8fccebfa534c7984864394fa03608305e4929aae Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:20 +0100 Subject: [PATCH 105/161] btrfs: fix metadata reservation for fallocate that leads to transaction aborts When doing an fallocate(), especially a zero range operation, we assume that reserving 3 units of metadata space is enough, i.e. that at most we touch one leaf in the subvolume/fs tree for removing existing file extent items and inserting a new file extent item. This assumption is generally true for most common use cases. However, when we end up needing to remove file extent items from multiple leaves, we can end up failing with -ENOSPC and aborting the current transaction, turning the filesystem to RO mode. When this happens a stack trace like the following is dumped in dmesg/syslog: [ 1500.620934] ------------[ cut here ]------------ [ 1500.620938] BTRFS: Transaction aborted (error -28) [ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs] [ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...) [ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1 [ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs] [ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286 [ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000 [ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff [ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001 [ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000 [ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60 [ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000 [ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0 [ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1500.621049] Call Trace: [ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs] [ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs] [ 1500.621108] vfs_fallocate+0x14d/0x290 [ 1500.621112] ksys_fallocate+0x3a/0x70 [ 1500.621117] __x64_sys_fallocate+0x1a/0x20 [ 1500.621120] do_syscall_64+0x33/0x80 [ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1500.621126] RIP: 0033:0x7fb5b248c477 [ 1500.621128] Code: 89 7c 24 08 (...) [ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d [ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477 [ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003 [ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000 [ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010 [ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003 [ 1500.621151] irq event stamp: 1026217 [ 1500.621154] hardirqs last enabled at (1026223): [] console_unlock+0x500/0x5c0 [ 1500.621156] hardirqs last disabled at (1026228): [] console_unlock+0x457/0x5c0 [ 1500.621159] softirqs last enabled at (1022486): [] __do_softirq+0x3dc/0x606 [ 1500.621161] softirqs last disabled at (1022477): [] asm_call_on_stack+0x12/0x20 [ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]--- [ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left When we use fallocate() internally, for reserving an extent for a space cache, inode cache or relocation, we can't hit this problem since either there aren't any file extent items to remove from the subvolume tree or there is at most one. When using plain fallocate() it's very unlikely, since that would require having many file extent items representing holes for the target range and crossing multiple leafs - we attempt to increase the range (merge) of such file extent items when punching holes, so at most we end up with 2 file extent items for holes at leaf boundaries. However when using the zero range operation of fallocate() for a large range (100+ MiB for example) that's fairly easy to trigger. The following example reproducer triggers the issue: $ cat reproducer.sh #!/bin/bash umount /dev/sdj &> /dev/null mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null mount /dev/sdj /mnt/sdj # Create a 100M file with many file extent items. Punch a hole every 8K # just to speedup the file creation - we could do 4K sequential writes # followed by fsync (or O_SYNC) as well, but that takes a lot of time. 
file_size=$((100 * 1024 * 1024)) xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar for ((i = 0; i < $file_size; i += 8192)); do xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar done # Force a transaction commit, so the zero range operation will be forced # to COW all metadata extents it need to touch. sync xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar umount /mnt/sdj $ ./reproducer.sh wrote 104857600/104857600 bytes at offset 0 100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec) fallocate: No space left on device $ dmesg To fix this use the existing infrastructure that hole punching and extent cloning use for replacing a file range with another extent. This deals with doing the removal of file extent items and inserting the new one using an incremental approach, reserving more space when needed and always ensuring we don't leave an implicit hole in the range in case we need to do multiple iterations and a crash happens between iterations. A test case for fstests will follow up soon. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 16 +++++++++++ fs/btrfs/file.c | 38 +++++++++++++++++++------- fs/btrfs/inode.c | 68 +++++++++++++++++++++++++++++++--------------- fs/btrfs/reflink.c | 1 + 4 files changed, 91 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 98c5f6178efc..1308a851b00f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1202,6 +1202,22 @@ struct btrfs_clone_extent_info { u64 file_offset; char *extent_buf; u32 item_size; + /* + * Set to true when attempting to replace a file range with a new extent + * described by this structure, set to false when attempting to clone an + * existing extent into a file range. + */ + bool is_new_extent; + /* Meaningful only if is_new_extent is true. */ + int qgroup_reserved; + /* + * Meaningful only if is_new_extent is true. + * Used to track how many extent items we have already inserted in a + * subvolume tree that refer to the extent described by this structure, + * so that we know when to create a new delayed ref or update an existing + * one. 
+ */ + int insertions; }; struct btrfs_file_private { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index af4eab9cbc51..73827c9c7a70 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2583,7 +2583,6 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; int slot; struct btrfs_ref ref = { 0 }; - u64 ref_offset; int ret; if (clone_len == 0) @@ -2608,6 +2607,8 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); + if (clone_info->is_new_extent) + btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -2621,13 +2622,29 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, return 0; inode_add_bytes(inode, clone_len); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - clone_info->disk_offset, - clone_info->disk_len, 0); - ref_offset = clone_info->file_offset - clone_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), ref_offset); - ret = btrfs_inc_extent_ref(trans, &ref); + + if (clone_info->is_new_extent && clone_info->insertions == 0) { + key.objectid = clone_info->disk_offset; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = clone_info->disk_len; + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + clone_info->file_offset, + clone_info->qgroup_reserved, + &key); + } else { + u64 ref_offset; + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + clone_info->disk_offset, + clone_info->disk_len, 0); + ref_offset = clone_info->file_offset - clone_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), ref_offset); + ret = btrfs_inc_extent_ref(trans, &ref); + } + + clone_info->insertions++; return ret; } @@ -2705,7 +2722,8 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * returned by __btrfs_drop_extents() without having * changed anything in the file. */ - if (clone_info && ret && ret != -EOPNOTSUPP) + if (clone_info && !clone_info->is_new_extent && + ret && ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); break; } @@ -2800,7 +2818,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * than 16Mb would force the full fsync any way (when * try_release_extent_mapping() is invoked during page cache truncation. 
*/ - if (clone_info) + if (clone_info && !clone_info->is_new_extent) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cce6f8789a4e..53bce9351cfb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9576,11 +9576,15 @@ out_unlock: return err; } -static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, +static struct btrfs_trans_handle *insert_prealloc_file_extent( + struct btrfs_trans_handle *trans_in, struct inode *inode, struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; + struct btrfs_clone_extent_info extent_info; + struct btrfs_trans_handle *trans = trans_in; + struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; int ret; @@ -9597,10 +9601,41 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); if (ret < 0) - return ret; - return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset, - &stack_fi, ret); + return ERR_PTR(ret); + + if (trans) { + ret = insert_reserved_file_extent(trans, BTRFS_I(inode), + file_offset, &stack_fi, ret); + if (ret) + return ERR_PTR(ret); + return trans; + } + + extent_info.disk_offset = start; + extent_info.disk_len = len; + extent_info.data_offset = 0; + extent_info.data_len = len; + extent_info.file_offset = file_offset; + extent_info.extent_buf = (char *)&stack_fi; + extent_info.item_size = sizeof(stack_fi); + extent_info.is_new_extent = true; + extent_info.qgroup_reserved = ret; + extent_info.insertions = 0; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_punch_hole_range(inode, path, file_offset, + file_offset + len - 1, &extent_info, + &trans); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + return trans; } + static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, @@ -9623,14 +9658,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (trans) own_trans = false; while (num_bytes > 0) { - if (own_trans) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - } - cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* @@ -9642,11 +9669,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); - if (ret) { - if (own_trans) - btrfs_end_transaction(trans); + if (ret) break; - } /* * We've reserved this space, and thus converted it from @@ -9659,13 +9683,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); - if (ret) { + trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, ret); - if (own_trans) - btrfs_end_transaction(trans); break; } @@ -9728,8 +9750,10 @@ next: break; } - if (own_trans) + if (own_trans) { btrfs_end_transaction(trans); + trans = NULL; + } } if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, diff --git a/fs/btrfs/reflink.c 
b/fs/btrfs/reflink.c index 2461be6ccb6f..3c03126b52b1 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -463,6 +463,7 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.item_size = size; + clone_info.is_new_extent = false; ret = btrfs_punch_hole_range(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); From fb870f6cdd72a1f143d9d25ab864e474fed65616 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:21 +0100 Subject: [PATCH 106/161] btrfs: remove item_size member of struct btrfs_clone_extent_info The value of item_size of struct btrfs_clone_extent_info is always set to the size of a non-inline file extent item, and in fact the infrastructure that uses this structure (btrfs_punch_hole_range()) does not work with inline file extents at all (and it is not supposed to). So just remove that field from the structure and use directly sizeof(struct btrfs_file_extent_item) instead. Also assert that the file extent type is not inline at btrfs_insert_clone_extent(). Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 5 +++-- fs/btrfs/inode.c | 1 - fs/btrfs/reflink.c | 1 - 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1308a851b00f..6200e430dfad 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1200,8 +1200,8 @@ struct btrfs_clone_extent_info { u64 data_offset; u64 data_len; u64 file_offset; + /* Pointer to a file extent item of type regular or prealloc. */ char *extent_buf; - u32 item_size; /* * Set to true when attempting to replace a file range with a new extent * described by this structure, set to false when attempting to clone an diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 73827c9c7a70..28794a98778a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2596,15 +2596,16 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = clone_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; write_extent_buffer(leaf, clone_info->extent_buf, btrfs_item_ptr_offset(leaf, slot), - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); if (clone_info->is_new_extent) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 53bce9351cfb..a9f4875a4a52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9617,7 +9617,6 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.data_len = len; extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; - extent_info.item_size = sizeof(stack_fi); extent_info.is_new_extent = true; extent_info.qgroup_reserved = ret; extent_info.insertions = 0; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 3c03126b52b1..020da4d65b64 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -462,7 +462,6 @@ process_slot: clone_info.data_len = datal; clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; - clone_info.item_size = size; 
clone_info.is_new_extent = false; ret = btrfs_punch_hole_range(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, From bf385648fa4805acf206254c77916edb58dbfe25 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:22 +0100 Subject: [PATCH 107/161] btrfs: rename struct btrfs_clone_extent_info to a more generic name Now that we can use btrfs_clone_extent_info to convey information for a new prealloc extent as well, and not just for existing extents that are being cloned, rename it to btrfs_replace_extent_info, which reflects the fact that this is now more generic and it is used to replace all existing extents in a file range with the extent described by the structure. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 +++- fs/btrfs/file.c | 92 +++++++++++++++++++++++----------------------- fs/btrfs/inode.c | 2 +- fs/btrfs/reflink.c | 2 +- 4 files changed, 54 insertions(+), 50 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6200e430dfad..0e4034d63111 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1194,7 +1194,11 @@ struct btrfs_root { #endif }; -struct btrfs_clone_extent_info { +/* + * Structure that conveys information about an extent that is going to replace + * all the extents in a file range. + */ +struct btrfs_replace_extent_info { u64 disk_offset; u64 disk_len; u64 data_offset; @@ -3086,7 +3090,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, u64 end, int drop_cache); int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 28794a98778a..7ac0a20119f3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2573,8 +2573,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, - struct btrfs_clone_extent_info *clone_info, - const u64 clone_len) + struct btrfs_replace_extent_info *extent_info, + const u64 replace_len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2585,67 +2585,67 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct btrfs_ref ref = { 0 }; int ret; - if (clone_len == 0) + if (replace_len == 0) return 0; - if (clone_info->disk_offset == 0 && + if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = clone_info->file_offset; + key.offset = extent_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, clone_info->extent_buf, + write_extent_buffer(leaf, extent_info->extent_buf, btrfs_item_ptr_offset(leaf, slot), sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); - btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); - btrfs_set_file_extent_num_bytes(leaf, 
extent, clone_len); - if (clone_info->is_new_extent) + btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); + if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), - clone_info->file_offset, clone_len); + extent_info->file_offset, replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. */ - if (clone_info->disk_offset == 0) + if (extent_info->disk_offset == 0) return 0; - inode_add_bytes(inode, clone_len); + inode_add_bytes(inode, replace_len); - if (clone_info->is_new_extent && clone_info->insertions == 0) { - key.objectid = clone_info->disk_offset; + if (extent_info->is_new_extent && extent_info->insertions == 0) { + key.objectid = extent_info->disk_offset; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = clone_info->disk_len; + key.offset = extent_info->disk_len; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), - clone_info->file_offset, - clone_info->qgroup_reserved, + extent_info->file_offset, + extent_info->qgroup_reserved, &key); } else { u64 ref_offset; btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - clone_info->disk_offset, - clone_info->disk_len, 0); - ref_offset = clone_info->file_offset - clone_info->data_offset; + extent_info->disk_offset, + extent_info->disk_len, 0); + ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, btrfs_ino(BTRFS_I(inode)), ref_offset); ret = btrfs_inc_extent_ref(trans, &ref); } - clone_info->insertions++; + extent_info->insertions++; return ret; } @@ -2653,15 +2653,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, /* * The respective range must have been previously locked, as well as the inode. * The end offset is inclusive (last byte of the range). - * @clone_info is NULL for fallocate's hole punching and non-NULL for extent - * cloning. - * When cloning, we don't want to end up in a state where we dropped extents - * without inserting a new one, so we must abort the transaction to avoid a - * corruption. + * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing + * the file range with an extent. + * When not punching a hole, we don't want to end up in a state where we dropped + * extents without inserting a new one, so we must abort the transaction to avoid + * a corruption. 
*/ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2690,10 +2690,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent if no_holes isn't set or if we are cloning - * an extent + * 1 - adding the hole extent if no_holes isn't set or if we are + * replacing the range with a new extent */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info) + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) rsv_count = 3; else rsv_count = 2; @@ -2723,7 +2723,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * returned by __btrfs_drop_extents() without having * changed anything in the file. */ - if (clone_info && !clone_info->is_new_extent && + if (extent_info && !extent_info->is_new_extent && ret && ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); break; @@ -2731,7 +2731,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, trans->block_rsv = &fs_info->trans_block_rsv; - if (!clone_info && cur_offset < drop_end && + if (!extent_info && cur_offset < drop_end && cur_offset < ino_size) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); @@ -2745,7 +2745,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we @@ -2765,18 +2765,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info && drop_end > clone_info->file_offset) { - u64 clone_len = drop_end - clone_info->file_offset; + if (extent_info && drop_end > extent_info->file_offset) { + u64 replace_len = drop_end - extent_info->file_offset; ret = btrfs_insert_clone_extent(trans, inode, path, - clone_info, clone_len); + extent_info, replace_len); if (ret) { btrfs_abort_transaction(trans, ret); break; } - clone_info->data_len -= clone_len; - clone_info->data_offset += clone_len; - clone_info->file_offset += clone_len; + extent_info->data_len -= replace_len; + extent_info->data_offset += replace_len; + extent_info->file_offset += replace_len; } cur_offset = drop_end; @@ -2800,7 +2800,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; - if (!clone_info) { + if (!extent_info) { ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; @@ -2819,7 +2819,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * than 16Mb would force the full fsync any way (when * try_release_extent_mapping() is invoked during page cache truncation. */ - if (clone_info && !clone_info->is_new_extent) + if (extent_info && !extent_info->is_new_extent) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -2845,7 +2845,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). 
*/ - if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) { + if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); if (ret) { @@ -2853,7 +2853,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); goto out_trans; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), cur_offset, drop_end - cur_offset); @@ -2863,9 +2863,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, - clone_info->data_len); + if (extent_info) { + ret = btrfs_insert_clone_extent(trans, inode, path, extent_info, + extent_info->data_len); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9f4875a4a52..77ab9eada11d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9582,7 +9582,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( u64 file_offset) { struct btrfs_file_extent_item stack_fi; - struct btrfs_clone_extent_info extent_info; + struct btrfs_replace_extent_info extent_info; struct btrfs_trans_handle *trans = trans_in; struct btrfs_path *path; u64 start = ins->objectid; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 020da4d65b64..dc8b5397e198 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -439,7 +439,7 @@ process_slot: if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { - struct btrfs_clone_extent_info clone_info; + struct btrfs_replace_extent_info clone_info; /* * a | --- range to clone ---| b From 306bfec02b1054d24bbbeec49ece6db396d451e0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:23 +0100 Subject: [PATCH 108/161] btrfs: rename btrfs_punch_hole_range() to a more generic name The function btrfs_punch_hole_range() is now used to replace all the file extents in a given file range with an extent described in the given struct btrfs_replace_extent_info argument. This extent can either be an existing extent that is being cloned or it can be a new extent (namely a prealloc extent). When that argument is NULL it only punches a hole (drops all the existing extents) in the file range. So rename the function to btrfs_replace_file_extents(). 
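For illustration, the two call patterns then look like this (a sketch matching the call sites in the diff below):

    /* extent_info == NULL: only punch a hole, drop all extents. */
    ret = btrfs_replace_file_extents(inode, path, lockstart, lockend,
                                     NULL, &trans);

    /* extent_info != NULL: drop the extents in the range and insert the
     * extent it describes, either a cloned or a new prealloc extent. */
    ret = btrfs_replace_file_extents(inode, path, start, end,
                                     &extent_info, &trans);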
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 4 ++-- fs/btrfs/inode.c | 2 +- fs/btrfs/reflink.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0e4034d63111..5ab55f678465 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3088,7 +3088,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7ac0a20119f3..241b34e44a6c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2659,7 +2659,7 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, * extents without inserting a new one, so we must abort the transaction to avoid * a corruption. */ -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) @@ -3007,7 +3007,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL, + ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL, &trans); btrfs_free_path(path); if (ret) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 77ab9eada11d..ff9b22b71171 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9625,7 +9625,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( if (!path) return ERR_PTR(-ENOMEM); - ret = btrfs_punch_hole_range(inode, path, file_offset, + ret = btrfs_replace_file_extents(inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index dc8b5397e198..39b3269e5760 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -463,7 +463,7 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; - ret = btrfs_punch_hole_range(inode, path, drop_start, + ret = btrfs_replace_file_extents(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) @@ -533,7 +533,7 @@ process_slot: btrfs_release_path(path); path->leave_spinning = 0; - ret = btrfs_punch_hole_range(inode, path, last_dest_end, + ret = btrfs_replace_file_extents(inode, path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; From 0cbb5bdfea2675f4b9749f99120a79f9e0a28600 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:24 +0100 Subject: [PATCH 109/161] btrfs: rename btrfs_insert_clone_extent() to a more generic name Now that we use the same mechanism to replace all the extents in a file range with either a hole, an existing extent (when cloning) or a new extent (when using fallocate), the name of btrfs_insert_clone_extent() no longer reflects its genericity. So rename it to btrfs_insert_replace_extent(), since what it does is to either insert an existing extent or a new extent into a file range. 
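For context, the dispatch inside the renamed helper boils down to the following (simplified sketch, most arguments elided):

    if (extent_info->is_new_extent && extent_info->insertions == 0)
            /* First insertion of a new extent: add the extent item. */
            ret = btrfs_alloc_reserved_file_extent(trans, root, ...);
    else
            /* Cloned extent, or later insertions: only add a backref. */
            ret = btrfs_inc_extent_ref(trans, &ref);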
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 241b34e44a6c..32ceda264b7d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2570,7 +2570,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, return 0; } -static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, +static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, @@ -2768,7 +2768,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, if (extent_info && drop_end > extent_info->file_offset) { u64 replace_len = drop_end - extent_info->file_offset; - ret = btrfs_insert_clone_extent(trans, inode, path, + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2864,7 +2864,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } if (extent_info) { - ret = btrfs_insert_clone_extent(trans, inode, path, extent_info, + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len); if (ret) { btrfs_abort_transaction(trans, ret); From a9b2e0de92cbed84b9bca1464babaf1d67a5542b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Aug 2020 00:39:51 -0700 Subject: [PATCH 110/161] btrfs: send: get rid of i_size logic in send_write() send_write()/fill_read_buf() have some logic for avoiding reading past i_size. However, everywhere that we call send_write()/send_extent_data(), we've already clamped the length down to i_size. Get rid of the i_size handling, which simplifies the next change. 
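The clamping the callers already do looks roughly like this (illustrative sketch; extent_end stands for the end of the extent being processed):

    u64 end = min_t(u64, extent_end, sctx->cur_inode_size);

    if (offset >= end)
            return 0;
    ret = send_extent_data(sctx, offset, end - offset);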
Reviewed-by: Filipe Manana Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7c7c09fc65e8..8af5e867e4ca 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4794,7 +4794,7 @@ out: return ret; } -static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +static int fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4804,21 +4804,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); - ssize_t ret = 0; + int ret = 0; + size_t read = 0; inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); - if (offset + len > i_size_read(inode)) { - if (offset > i_size_read(inode)) - len = 0; - else - len = offset - i_size_read(inode); - } - if (len == 0) - goto out; - last_index = (offset + len - 1) >> PAGE_SHIFT; /* initial readahead */ @@ -4859,16 +4851,15 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) } addr = kmap(page); - memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + memcpy(sctx->read_buf + read, addr + pg_offset, cur_len); kunmap(page); unlock_page(page); put_page(page); index++; pg_offset = 0; len -= cur_len; - ret += cur_len; + read += cur_len; } -out: iput(inode); return ret; } @@ -4882,7 +4873,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - ssize_t num_read = 0; p = fs_path_alloc(); if (!p) @@ -4890,12 +4880,9 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - num_read = fill_read_buf(sctx, offset, len); - if (num_read <= 0) { - if (num_read < 0) - ret = num_read; + ret = fill_read_buf(sctx, offset, len); + if (ret < 0) goto out; - } ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) @@ -4907,16 +4894,14 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); ret = send_cmd(sctx); tlv_put_failure: out: fs_path_free(p); - if (ret < 0) - return ret; - return num_read; + return ret; } /* @@ -5095,9 +5080,7 @@ static int send_extent_data(struct send_ctx *sctx, ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; - if (!ret) - break; - sent += ret; + sent += size; } return 0; } From 8c7d9fe06f5bb87dce356dcc9ea7c003b529e6ba Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Aug 2020 00:39:52 -0700 Subject: [PATCH 111/161] btrfs: send: avoid copying file data send_write() currently copies from the page cache to sctx->read_buf, and then from sctx->read_buf to sctx->send_buf. Similarly, send_hole() zeroes sctx->read_buf and then copies from sctx->read_buf to sctx->send_buf. However, if we write the TLV header manually, we can copy to sctx->send_buf directly and get rid of sctx->read_buf. 
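For reference, every attribute in the send stream is encoded as a TLV: a 16-bit type and a 16-bit length followed by the payload. Writing the data header by hand thus amounts to the following (a condensed sketch of the put_data_header() helper added below):

    struct btrfs_tlv_header *hdr;

    hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
    put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
    put_unaligned_le16(len, &hdr->tlv_len);
    sctx->send_size += sizeof(*hdr);
    /* File data (or zeroes for holes) is then copied right after it. */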
Reviewed-by: Filipe Manana Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 65 +++++++++++++++++++++++++++++-------------------- fs/btrfs/send.h | 1 - 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8af5e867e4ca..491c486e3960 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -122,8 +122,6 @@ struct send_ctx { struct file_ra_state ra; - char *read_buf; - /* * We process inodes by their increasing order, so if before an * incremental send we reverse the parent/child relationship of @@ -4794,7 +4792,25 @@ out: return ret; } -static int fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +static inline u64 max_send_read_size(const struct send_ctx *sctx) +{ + return sctx->send_max_size - SZ_16K; +} + +static int put_data_header(struct send_ctx *sctx, u32 len) +{ + struct btrfs_tlv_header *hdr; + + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + return 0; +} + +static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4804,8 +4820,11 @@ static int fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); - int ret = 0; - size_t read = 0; + int ret; + + ret = put_data_header(sctx, len); + if (ret) + return ret; inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) @@ -4851,14 +4870,15 @@ static int fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) } addr = kmap(page); - memcpy(sctx->read_buf + read, addr + pg_offset, cur_len); + memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset, + cur_len); kunmap(page); unlock_page(page); put_page(page); index++; pg_offset = 0; len -= cur_len; - read += cur_len; + sctx->send_size += cur_len; } iput(inode); return ret; @@ -4880,10 +4900,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - ret = fill_read_buf(sctx, offset, len); - if (ret < 0) - goto out; - ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) goto out; @@ -4894,7 +4910,9 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = put_file_data(sctx, offset, len); + if (ret < 0) + goto out; ret = send_cmd(sctx); @@ -5013,8 +5031,8 @@ out: static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; + u64 read_size = max_send_read_size(sctx); u64 offset = sctx->cur_inode_last_extent; - u64 len; int ret = 0; /* @@ -5041,16 +5059,19 @@ static int send_hole(struct send_ctx *sctx, u64 end) ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); if (ret < 0) goto tlv_put_failure; - memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { - len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + u64 len = min(end - offset, read_size); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, 
BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = put_data_header(sctx, len); + if (ret < 0) + break; + memset(sctx->send_buf + sctx->send_size, 0, len); + sctx->send_size += len; ret = send_cmd(sctx); if (ret < 0) break; @@ -5066,17 +5087,16 @@ static int send_extent_data(struct send_ctx *sctx, const u64 offset, const u64 len) { + u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); while (sent < len) { - u64 size = len - sent; + u64 size = min(len - sent, read_size); int ret; - if (size > BTRFS_SEND_READ_SIZE) - size = BTRFS_SEND_READ_SIZE; ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; @@ -7145,12 +7165,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } - sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; @@ -7354,7 +7368,6 @@ out: kvfree(sctx->clone_roots); kvfree(sctx->send_buf); - kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index ead397f7034f..de91488b7cd0 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -13,7 +13,6 @@ #define BTRFS_SEND_STREAM_VERSION 1 #define BTRFS_SEND_BUF_SIZE SZ_64K -#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, From c9a949af13d6d8983de086d9ecbf34c4c94da73e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Aug 2020 00:39:53 -0700 Subject: [PATCH 112/161] btrfs: send: use btrfs_file_extent_end() in send_write_or_clone() send_write_or_clone() basically has an open-coded copy of btrfs_file_extent_end() except that it (incorrectly) aligns to PAGE_SIZE instead of sectorsize. Fix and simplify the code by using btrfs_file_extent_end(). Reviewed-by: Filipe Manana Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 44 +++++++++++--------------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 491c486e3960..ac0e942e8d7c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5400,51 +5400,29 @@ static int send_write_or_clone(struct send_ctx *sctx, struct clone_root *clone_root) { int ret = 0; - struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 len; - u8 type; + u64 end; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], ei); - if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - /* - * it is possible the inline item won't cover the whole page, - * but there may be items after this page. 
Make - * sure to send the whole thing - */ - len = PAGE_ALIGN(len); - } else { - len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - } + end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); + if (offset >= end) + return 0; - if (offset >= sctx->cur_inode_size) { - ret = 0; - goto out; - } - if (offset + len > sctx->cur_inode_size) - len = sctx->cur_inode_size - offset; - if (len == 0) { - ret = 0; - goto out; - } - - if (clone_root && IS_ALIGNED(offset + len, bs)) { + if (clone_root && IS_ALIGNED(end, bs)) { + struct btrfs_file_extent_item *ei; u64 disk_byte; u64 data_offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, len); + offset, end - offset); } else { - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, offset, end - offset); } - sctx->cur_inode_next_write_offset = offset + len; -out: + sctx->cur_inode_next_write_offset = end; return ret; } From 7573df5547c00f067e4c0b2fd7cc99f3d3a2bda7 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Aug 2020 00:39:54 -0700 Subject: [PATCH 113/161] btrfs: sysfs: export supported send stream version This reports the latest send stream version supported by the kernel as the feature in /sys/fs/btrfs/features/send_stream_version . Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c71db7acb89d..13354de56201 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -14,6 +14,7 @@ #include "ctree.h" #include "discard.h" #include "disk-io.h" +#include "send.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" @@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION); +} +BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), + BTRFS_ATTR_PTR(static_feature, send_stream_version), NULL }; From fc0716c2f6afed06a9306db3fb62cc1fcf389757 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:39:57 +0300 Subject: [PATCH 114/161] btrfs: re-arrange statements in setup_items_for_insert Rearrange statements calculating the offset of the newly added items so that the calculation has to be done only once. No functional change. 
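That is, instead of computing the new offset and then updating data_end:

    btrfs_set_token_item_offset(&token, item, data_end - data_size[i]);
    data_end -= data_size[i];

the update happens first and its result is reused:

    data_end -= data_size[i];
    btrfs_set_token_item_offset(&token, item, data_end);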
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7dbfa365eb18..36eee3aef6ff 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4832,8 +4832,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - btrfs_set_token_item_offset(&token, item, data_end - data_size[i]); data_end -= data_size[i]; + btrfs_set_token_item_offset(&token, item, data_end); btrfs_set_token_item_size(&token, item, data_size[i]); } From 3dc9dc8969dc738285cb7d43e852c98459c8f0a9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:39:58 +0300 Subject: [PATCH 115/161] btrfs: eliminate total_size parameter from setup_items_for_insert The value of this argument can be derived from the total_data as it's simply the value of the data size + size of btrfs_items being touched. Move the parameter calculation inside the function. This results in a simpler interface and also a minor size reduction: ./scripts/bloat-o-meter ctree.original fs/btrfs/ctree.o add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-34 (-34) Function old new delta btrfs_duplicate_item 260 259 -1 setup_items_for_insert 1200 1190 -10 btrfs_insert_empty_items 177 154 -23 Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 ++++------ fs/btrfs/ctree.h | 2 +- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/file.c | 5 +---- fs/btrfs/tests/extent-buffer-tests.c | 3 +-- fs/btrfs/tests/inode-tests.c | 6 ++---- 6 files changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 36eee3aef6ff..a855ce0e4cb1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4580,9 +4580,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, new_key, &item_size, item_size, 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4762,7 +4760,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) + u32 total_data, int nr) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -4773,6 +4771,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *leaf; int slot; struct btrfs_map_token token; + const u32 total_size = total_data + (nr * sizeof(struct btrfs_item)); if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); @@ -4875,8 +4874,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, - total_data, total_size, nr); + setup_items_for_insert(root, path, cpu_key, data_size, total_data, nr); return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5ab55f678465..c4f9c1985ad1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2744,7 
+2744,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); + u32 total_data, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0727b10a9a89..0360e07d1b0f 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -768,8 +768,8 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, } /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, - total_data_size, total_size, nitems); + setup_items_for_insert(root, path, keys, data_size, total_data_size, + nitems); /* insert the dir index items */ slot = path->slots[0]; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 32ceda264b7d..fefe152884fa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1057,10 +1057,7 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &extent_item_size, - extent_item_size, - sizeof(struct btrfs_item) + + setup_items_for_insert(root, path, &key, &extent_item_size, extent_item_size, 1); *key_inserted = 1; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index a1b9f9b5978e..a792a1dfd6dd 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, &key, &value_len, value_len, 1); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 894a63a92236..068b6ef02cec 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, value_len, 1); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, value_len, 1); } /* From fc0d82e103c78faa98cf61e681b45618e879b63d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:39:59 +0300 Subject: [PATCH 116/161] btrfs: sink total_data parameter in setup_items_for_insert That parameter can easily be derived based on the "data_size" and "nr" parameters. Exploit this fact to simplify the function's signature. No functional changes.
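The derivation that moves inside the function is simply (as in the ctree.c hunk below):

    u32 total_data = 0;

    for (i = 0; i < nr; i++)
            total_data += data_size[i];
    total_size = total_data + (nr * sizeof(struct btrfs_item));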
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 13 +++++++++---- fs/btrfs/ctree.h | 2 +- fs/btrfs/delayed-inode.c | 3 +-- fs/btrfs/file.c | 3 +-- fs/btrfs/tests/extent-buffer-tests.c | 2 +- fs/btrfs/tests/inode-tests.c | 4 ++-- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a855ce0e4cb1..a0a4a885664e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4580,7 +4580,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, item_size, 1); + setup_items_for_insert(root, path, new_key, &item_size, 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4760,7 +4760,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, int nr) + int nr) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -4771,7 +4771,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *leaf; int slot; struct btrfs_map_token token; - const u32 total_size = total_data + (nr * sizeof(struct btrfs_item)); + u32 total_size; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + total_size = total_data + (nr * sizeof(struct btrfs_item)); if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); @@ -4874,7 +4879,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, total_data, nr); + setup_items_for_insert(root, path, cpu_key, data_size, nr); return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c4f9c1985ad1..cd644c755142 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2744,7 +2744,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, int nr); + int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0360e07d1b0f..5aba81e16113 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -768,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, } /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, total_data_size, - nitems); + setup_items_for_insert(root, path, keys, data_size, nitems); /* insert the dir index items */ slot = path->slots[0]; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fefe152884fa..2969d715c588 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1057,8 +1057,7 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, &extent_item_size, - extent_item_size, 1); + setup_items_for_insert(root, path, &key, &extent_item_size, 1); *key_inserted = 1; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index a792a1dfd6dd..df54cdfdc250 
100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, value_len, 1); + setup_items_for_insert(root, path, &key, &value_len, 1); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 068b6ef02cec..cc54d4973a74 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, value_len, 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, value_len, 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); } /* From da9ffb242c448af20f70b431d3f04df50c6cb0d2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:40:00 +0300 Subject: [PATCH 117/161] btrfs: add kerneldoc for setup_items_for_insert Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a0a4a885664e..4fb2b0673950 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4753,10 +4753,16 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot +/** + * setup_items_for_insert - Helper called before inserting one or more items + * to a leaf. Main purpose is to save stack depth by doing the bulk of the work + * in a function that doesn't call btrfs_search_slot + * + * @root: root we are inserting items to + * @path: points to the leaf/slot where we are going to insert new items + * @cpu_key: array of keys for items to be inserted + * @data_size: size of the body of each item we are going to insert + * @nr: size of @cpu_key/@data_size arrays */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, From 7269ddd2f60286af1f5d1d47d283d2f9a26818c6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:40:01 +0300 Subject: [PATCH 118/161] btrfs: improve error message in setup_items_for_insert Reword and update formats to match variable types. 
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ update formats ] Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4fb2b0673950..a165093739c4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4809,7 +4809,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(leaf); - btrfs_crit(fs_info, "slot %d old_data %d data_end %d", + btrfs_crit(fs_info, + "item at slot %d with data offset %u beyond data end of leaf %u", slot, old_data, data_end); BUG(); } From 72804905001267a93e7a470430c2bcc122932ca6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Sep 2020 17:40:37 -0400 Subject: [PATCH 119/161] btrfs: kill the RCU protection for fs_info->space_info We have this thing wrapped in an RCU lock, but it's really not needed. We create all the space_info's on mount, and we destroy them on unmount. The list never changes and we're protected from messing with it by the normal mount/umount path, so kill the RCU stuff around it. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 16 ++-------------- fs/btrfs/ioctl.c | 10 ++-------- fs/btrfs/space-info.c | 14 ++++---------- fs/btrfs/super.c | 5 +---- 4 files changed, 9 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 01e8ba1da1d3..a3b27204371c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2031,8 +2031,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_release_path(path); } - rcu_read_lock(); - list_for_each_entry_rcu(space_info, &info->space_info, list) { + list_for_each_entry(space_info, &info->space_info, list) { if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | @@ -2052,7 +2051,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list) inc_block_group_ro(cache, 1); } - rcu_read_unlock(); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); @@ -3007,12 +3005,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { + list_for_each_entry(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) found->force_alloc = CHUNK_ALLOC_FORCE; } - rcu_read_unlock(); } static int should_alloc_chunk(struct btrfs_fs_info *fs_info, @@ -3343,14 +3339,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->block_group_cache_lock); - /* - * Now that all the block groups are freed, go through and free all the - * space_info structs. This is only called during the final stages of - * unmount, and so we know nobody is using them. We call - * synchronize_rcu() once before we start, just to be on the safe side. 
- */ - synchronize_rcu(); - btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8091324341a5..de65cf6105e4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3475,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *tmp; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3531,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, break; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 0f16a2ce5401..64099565ab8f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) + list_for_each_entry(found, head, list) found->full = 0; - rcu_read_unlock(); } static int create_space_info(struct btrfs_fs_info *info, u64 flags) @@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (ret) return ret; - list_add_rcu(&space_info->list, &info->space_info); + list_add(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = space_info; @@ -290,14 +288,10 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); + list_for_each_entry(found, head, list) { + if (found->flags & flags) return found; - } } - rcu_read_unlock(); return NULL; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3ebe7240c63d..8840a4fa81eb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2164,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 thresh = 0; int mixed = 0; - rcu_read_lock(); - list_for_each_entry_rcu(found, &fs_info->space_info, list) { + list_for_each_entry(found, &fs_info->space_info, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { int i; @@ -2194,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) total_used += found->disk_used; } - rcu_read_unlock(); - buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); buf->f_blocks >>= bits; buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); From 49ea112da0e6e323bb923315ec54ee920759bf19 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Sep 2020 17:40:38 -0400 Subject: [PATCH 120/161] btrfs: do not create raid sysfs entries under any locks While running xfstests btrfs/177 I got the following lockdep splat ====================================================== WARNING: possible circular locking dependency detected 5.9.0-rc3+ #5 Not tainted ------------------------------------------------------ kswapd0/100 is trying to acquire lock: ffff97066aa56760 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 but task is already holding lock: 
ffffffff9fd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x65/0x80 slab_pre_alloc_hook.constprop.0+0x20/0x200 kmem_cache_alloc+0x37/0x270 alloc_inode+0x82/0xb0 iget_locked+0x10d/0x2c0 kernfs_get_inode+0x1b/0x130 kernfs_get_tree+0x136/0x240 sysfs_get_tree+0x16/0x40 vfs_get_tree+0x28/0xc0 path_mount+0x434/0xc00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (kernfs_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 kernfs_add_one+0x23/0x150 kernfs_create_dir_ns+0x7a/0xb0 sysfs_create_dir_ns+0x60/0xb0 kobject_add_internal+0xc0/0x2c0 kobject_add+0x6e/0x90 btrfs_sysfs_add_block_group_type+0x102/0x160 btrfs_make_block_group+0x167/0x230 btrfs_alloc_chunk+0x54f/0xb80 btrfs_chunk_alloc+0x18e/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_insert_empty_items+0x64/0xb0 btrfs_new_inode+0x225/0x730 btrfs_create+0xab/0x1f0 lookup_open.isra.0+0x52d/0x690 path_openat+0x2a7/0x9e0 do_filp_open+0x75/0x100 do_sys_openat2+0x7b/0x130 __x64_sys_openat+0x46/0x70 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 btrfs_chunk_alloc+0x125/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_lookup_inode+0x2a/0x8f __btrfs_update_delayed_inode+0x80/0x240 btrfs_commit_inode_delayed_inode+0x119/0x120 btrfs_evict_inode+0x357/0x500 evict+0xcf/0x1f0 do_unlinkat+0x1a9/0x2b0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 __mutex_lock+0x7e/0x7e0 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 kthread+0x138/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> kernfs_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(kernfs_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/100: #0: ffffffff9fd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffff9fd65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 #2: ffff9706629780e0 (&type->s_umount_key#36){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 stack backtrace: CPU: 1 PID: 100 Comm: kswapd0 Not tainted 5.9.0-rc3+ #5 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x8b/0xb8 check_noncircular+0x12d/0x150 __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 __mutex_lock+0x7e/0x7e0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? lock_acquire+0xa7/0x3d0 ? 
find_held_lock+0x2b/0x80 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 ? _raw_spin_unlock_irqrestore+0x41/0x50 ? add_wait_queue_exclusive+0x70/0x70 ? balance_pgdat+0x670/0x670 kthread+0x138/0x160 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 This happens because when we link in a block group with a new raid index type we'll create the corresponding sysfs entries for it. This is problematic because while restriping we're holding the chunk_mutex, and while mounting we're holding the tree locks. Fixing this isn't pretty: we move the call to the sysfs stuff into the btrfs_create_pending_block_groups() work, where we're not holding any locks. This creates a slight race where other threads could see that there's no sysfs kobj for that raid type, and race to create the sysfs dir. Fix this by wrapping the creation in space_info->lock, so we only get one thread calling kobject_add() for the new directory. We don't worry about the lock on cleanup as it only gets deleted on unmount. On mount it's more straightforward: we already loop through the space_infos, so just check every raid index in each space_info and add the sysfs entries for the corresponding block groups. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 31 +++++++++++++++++++++++++------ fs/btrfs/sysfs.c | 25 +++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a3b27204371c..c0f1d6818df7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache) { struct btrfs_space_info *space_info = cache->space_info; int index = btrfs_bg_flags_to_raid_index(cache->flags); - bool first = false; down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) - first = true; list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); - - if (first) - btrfs_sysfs_add_block_group_type(cache); } static struct btrfs_block_group *btrfs_create_block_group_cache( @@ -2032,6 +2026,17 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } list_for_each_entry(space_info, &info->space_info, list) { + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (list_empty(&space_info->block_groups[i])) + continue; + cache = list_first_entry(&space_info->block_groups[i], + struct btrfs_block_group, + list); + btrfs_sysfs_add_block_group_type(cache); + } + if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | @@ -2091,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) return; while (!list_empty(&trans->new_bgs)) { + int index; + block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group, bg_list); if (ret) goto next; + index = btrfs_bg_flags_to_raid_index(block_group->flags); + ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); @@ -2105,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); + + /* + * If we restriped during balance, we may have
added a new raid + * type, so now add the sysfs entries when it is safe to do so. + * We don't have to worry about locking here as it's handled in + * btrfs_sysfs_add_block_group_type. + */ + if (block_group->space_info->block_group_kobjs[index] == NULL) + btrfs_sysfs_add_block_group_type(block_group); + /* Already aborted the transaction if it failed. */ next: btrfs_delayed_refs_rsv_release(fs_info, 1); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 13354de56201..279d9262b676 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1137,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + /* + * We call this either on mount, or if we've created a block group for a + * new index type while running (i.e. when restriping). The running + * case is tricky because we could race with other threads, so we need + * to have this check to make sure we didn't already init the kobject. + * + * We don't have to protect on the free side because it only happens on + * unmount. + */ + spin_lock(&space_info->lock); + if (space_info->block_group_kobjs[index]) { + spin_unlock(&space_info->lock); + kobject_put(&rkobj->kobj); + return; + } else { + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + spin_unlock(&space_info->lock); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); memalloc_nofs_restore(nofs_flag); if (ret) { + spin_lock(&space_info->lock); + space_info->block_group_kobjs[index] = NULL; + spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); btrfs_warn(fs_info, "failed to add kobject for block cache, ignoring"); return; } - - space_info->block_group_kobjs[index] = &rkobj->kobj; } /* From bb56f02f26fe23798edb1b2175707419b28c752a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 14 Sep 2020 15:27:50 +0100 Subject: [PATCH 121/161] btrfs: reschedule if necessary when logging directory items Logging directories with many entries can take a significant amount of time, and in some cases monopolize a cpu/core for a long time if the logging task doesn't happen to block often enough. Johannes and Lu Fengqi reported test case generic/041 triggering a soft lockup when the kernel has CONFIG_SOFTLOCKUP_DETECTOR=y. For this test case we log an inode with 3002 hard links, and because the test removed one hard link before fsyncing the file, the inode logging causes the parent directory to be logged as well, which has 6004 directory items to log (3002 BTRFS_DIR_ITEM_KEY items plus 3002 BTRFS_DIR_INDEX_KEY items), so it can take a significant amount of time and trigger the soft lockup. So just make tree-log.c:log_dir_items() reschedule when necessary, releasing the current search path before doing so and then resuming from where it was before the reschedule. The stack trace produced when the soft lockup happens is the following: [10480.277653] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [xfs_io:28172] [10480.279418] Modules linked in: dm_thin_pool dm_persistent_data (...)
[10480.284915] irq event stamp: 29646366 [10480.285987] hardirqs last enabled at (29646365): [] __slab_alloc.constprop.0+0x56/0x60 [10480.288482] hardirqs last disabled at (29646366): [] irqentry_enter+0x1d/0x50 [10480.290856] softirqs last enabled at (4612): [] __do_softirq+0x323/0x56c [10480.293615] softirqs last disabled at (4483): [] asm_call_on_stack+0xf/0x20 [10480.296428] CPU: 2 PID: 28172 Comm: xfs_io Not tainted 5.9.0-rc4-default+ #1248 [10480.298948] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 [10480.302455] RIP: 0010:__slab_alloc.constprop.0+0x19/0x60 [10480.304151] Code: 86 e8 31 75 21 00 66 66 2e 0f 1f 84 00 00 00 (...) [10480.309558] RSP: 0018:ffffadbe09397a58 EFLAGS: 00000282 [10480.311179] RAX: ffff8a495ab92840 RBX: 0000000000000282 RCX: 0000000000000006 [10480.313242] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff85249b66 [10480.315260] RBP: ffff8a497d04b740 R08: 0000000000000001 R09: 0000000000000001 [10480.317229] R10: ffff8a497d044800 R11: ffff8a495ab93c40 R12: 0000000000000000 [10480.319169] R13: 0000000000000000 R14: 0000000000000c40 R15: ffffffffc01daf70 [10480.321104] FS: 00007fa1dc5c0e40(0000) GS:ffff8a497da00000(0000) knlGS:0000000000000000 [10480.323559] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [10480.325235] CR2: 00007fa1dc5befb8 CR3: 0000000004f8a006 CR4: 0000000000170ea0 [10480.327259] Call Trace: [10480.328286] ? overwrite_item+0x1f0/0x5a0 [btrfs] [10480.329784] __kmalloc+0x831/0xa20 [10480.331009] ? btrfs_get_32+0xb0/0x1d0 [btrfs] [10480.332464] overwrite_item+0x1f0/0x5a0 [btrfs] [10480.333948] log_dir_items+0x2ee/0x570 [btrfs] [10480.335413] log_directory_changes+0x82/0xd0 [btrfs] [10480.336926] btrfs_log_inode+0xc9b/0xda0 [btrfs] [10480.338374] ? init_once+0x20/0x20 [btrfs] [10480.339711] btrfs_log_inode_parent+0x8d3/0xd10 [btrfs] [10480.341257] ? dget_parent+0x97/0x2e0 [10480.342480] btrfs_log_dentry_safe+0x3a/0x50 [btrfs] [10480.343977] btrfs_sync_file+0x24b/0x5e0 [btrfs] [10480.345381] do_fsync+0x38/0x70 [10480.346483] __x64_sys_fsync+0x10/0x20 [10480.347703] do_syscall_64+0x2d/0x70 [10480.348891] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [10480.350444] RIP: 0033:0x7fa1dc80970b [10480.351642] Code: 0f 05 48 3d 00 f0 ff ff 77 45 c3 0f 1f 40 00 48 (...) [10480.356952] RSP: 002b:00007fffb3d081d0 EFLAGS: 00000293 ORIG_RAX: 000000000000004a [10480.359458] RAX: ffffffffffffffda RBX: 0000562d93d45e40 RCX: 00007fa1dc80970b [10480.361426] RDX: 0000562d93d44ab0 RSI: 0000562d93d45e60 RDI: 0000000000000003 [10480.363367] RBP: 0000000000000001 R08: 0000000000000000 R09: 00007fa1dc7b2a40 [10480.365317] R10: 0000562d93d0e366 R11: 0000000000000293 R12: 0000000000000001 [10480.367299] R13: 0000562d93d45290 R14: 0000562d93d45e40 R15: 0000562d93d45e60 Link: https://lore.kernel.org/linux-btrfs/20180713090216.GC575@fnst.localdomain/ Reported-by: Johannes Thumshirn CC: stable@vger.kernel.org # 4.4+ Tested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e35cfe8cf68b..56cbc1706b6f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3611,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * search and this search we'll not find the key again and can just * bail. 
*/ +search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret != 0) goto done; @@ -3630,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (min_key.objectid != ino || min_key.type != key_type) goto done; + + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } + ret = overwrite_item(trans, log, dst_path, src, i, &min_key); if (ret) { From 2f1d3e4b930dcb83ac7ec9b106d7adb4d36c4c9d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:03 +0300 Subject: [PATCH 122/161] btrfs: remove btree_readpage There is no way for this function to be called as ->readpage() since it's called from generic_file_buffered_read/filemap_fault/do_read_cache_page/readahead code. BTRFS doesn't utilize the first 3 for the btree inode and implements its own readahead mechanism. So simply remove the function. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f44f3d672a1f..0fae2caf31ef 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -948,11 +948,6 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_readpage(struct file *file, struct page *page) -{ - return extent_read_full_page(page, btree_get_extent, 0); -} - static int btree_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) @@ -992,7 +987,6 @@ static int btree_set_page_dirty(struct page *page) } static const struct address_space_operations btree_aops = { - .readpage = btree_readpage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, From 0420177c08b2d86034a98d45b7d3609bdc986fd2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:04 +0300 Subject: [PATCH 123/161] btrfs: simplify metadata pages reading Metadata pages are currently read with __do_readpage; unfortunately this function is also used to deal with ordinary data pages. This makes the metadata page reading code go through multiple hoops in order to adhere to __do_readpage invariants. Most of these are necessary only for data pages, which could be compressed. For metadata it's enough to simply build a bio and submit it. To this effect, simply call submit_extent_page directly from read_extent_buffer_pages, which is the only callpath used to populate extent_buffers with data. This in turn enables further cleanups.
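[ Illustration, not part of the commit: a userspace C sketch of the batching pattern described above, with hypothetical names. Pages are queued into one shared batch and the accumulated I/O is submitted once at the end; a page that fails to queue must be cleaned up by the caller, mirroring the unlock/error handling visible in the diff that follows. ]

#include <stdio.h>

#define BATCH_MAX 16

struct batch {
        int pages[BATCH_MAX];
        int nr;
};

/* Queue one page; on failure the caller owns the cleanup. */
static int queue_page(struct batch *b, int page)
{
        if (b->nr == BATCH_MAX)
                return -1;
        b->pages[b->nr++] = page;
        return 0;
}

/* Submit everything accumulated so far in a single pass. */
static void submit_batch(struct batch *b)
{
        for (int i = 0; i < b->nr; i++)
                printf("read page %d\n", b->pages[i]);
        b->nr = 0;
}

int main(void)
{
        struct batch b = { .nr = 0 };

        for (int page = 0; page < 4; page++)
                if (queue_page(&b, page))
                        fprintf(stderr, "page %d: unlock and mark error here\n", page);
        submit_batch(&b);
        return 0;
}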
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a1e070ec7ad8..0a6cda4c30ed 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5573,20 +5573,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = __extent_read_full_page(page, - btree_get_extent, &bio, - mirror_num, &bio_flags, - REQ_META); + err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + page, page_offset(page), PAGE_SIZE, 0, + &bio, end_bio_extent_readpage, + mirror_num, 0, 0, false); if (err) { - ret = err; /* - * We use &bio in above __extent_read_full_page, - * so we ensure that if it returns error, the - * current page fails to add itself to bio and - * it's been unlocked. - * - * We must dec io_pages by ourselves. + * We failed to submit the bio so it's the + * caller's responsibility to perform cleanup + * i.e unlock page/set error bit. */ + ret = err; + SetPageError(page); + unlock_page(page); atomic_dec(&eb->io_pages); } } else { From 208d6341e85b4f2628a90438fce5d8a2dd97d6ba Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:05 +0300 Subject: [PATCH 124/161] btrfs: remove btree_get_extent The sole purpose of this function was to satisfy the requirements of __do_readpage. Since that function is no longer used to read metadata pages the need to keep btree_get_extent around has also disappeared. Simply remove it. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 47 ---------------------------------------------- fs/btrfs/disk-io.h | 3 --- 2 files changed, 50 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0fae2caf31ef..53b588a7e150 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -204,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, #endif -/* - * extents on the btree inode are pretty simple, there's one extent - * that covers the entire device - */ -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - int ret; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (em) { - read_unlock(&em_tree->lock); - goto out; - } - read_unlock(&em_tree->lock); - - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - em->start = 0; - em->len = (u64)-1; - em->block_len = (u64)-1; - em->block_start = 0; - - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - if (ret == -EEXIST) { - free_extent_map(em); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) - em = ERR_PTR(-EIO); - } else if (ret) { - free_extent_map(em); - em = ERR_PTR(ret); - } - write_unlock(&em_tree->lock); - -out: - return em; -} - /* * Compute the csum of a btree block and store the result to provided buffer. 
*/ diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 00dc39d47ed3..89b6a709a184 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -123,9 +123,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); From 1a5ee1e6260368da8347e66321e157a7de288ef3 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:06 +0300 Subject: [PATCH 125/161] btrfs: remove btrfs_get_extent indirection from __do_readpage Now that this function is only responsible for reading data pages it's no longer necessary to pass get_extent_t parameter across several layers of functions. This patch removes this parameter from multiple functions: __get_extent_map/__do_readpage/__extent_read_full_page/ extent_read_full_page and simply calls btrfs_get_extent directly in __get_extent_map. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 31 ++++++++++++------------------- fs/btrfs/extent_io.h | 3 +-- fs/btrfs/inode.c | 2 +- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0a6cda4c30ed..4a0675ec90fa 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page) static struct extent_map * __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 len, get_extent_t *get_extent, - struct extent_map **em_cached) + u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3142,9 +3141,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct page *page, - get_extent_t *get_extent, - struct extent_map **em_cached, +static int __do_readpage(struct page *page, struct extent_map **em_cached, struct bio **bio, int mirror_num, unsigned long *bio_flags, unsigned int read_flags, u64 *prev_em_start) @@ -3209,7 +3206,7 @@ static int __do_readpage(struct page *page, break; } em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, get_extent, em_cached); + end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); unlock_extent(tree, cur, end); @@ -3362,16 +3359,14 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], btrfs_get_extent, em_cached, - bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); + __do_readpage(pages[index], em_cached, bio, 0, bio_flags, + REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -static int __extent_read_full_page(struct page *page, - get_extent_t 
*get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, +static int __extent_read_full_page(struct page *page, struct bio **bio, + int mirror_num, unsigned long *bio_flags, unsigned int read_flags) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); @@ -3381,20 +3376,18 @@ static int __extent_read_full_page(struct page *page, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(page, get_extent, NULL, bio, mirror_num, - bio_flags, read_flags, NULL); + ret = __do_readpage(page, NULL, bio, mirror_num, bio_flags, read_flags, + NULL); return ret; } -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num) +int extent_read_full_page(struct page *page, int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(page, get_extent, &bio, mirror_num, - &bio_flags, 0); + ret = __extent_read_full_page(page, &bio, mirror_num, &bio_flags, 0); if (bio) ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 06611947a9f7..272d5281bd4d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -193,8 +193,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num); +int extent_read_full_page(struct page *page, int mirror_num); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ff9b22b71171..7db2cbb469e1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8040,7 +8040,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page, btrfs_get_extent, 0); + return extent_read_full_page(page, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) From 72cffee4634058b993685669cab62c5d1c1fee78 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:07 +0300 Subject: [PATCH 126/161] btrfs: remove mirror_num argument from extent_read_full_page It's called only from btrfs_readpage which always passes 0 so just sink the argument into extent_read_full_page. 
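[ Illustration, not part of the commit: this and the next few patches repeat one mechanical transformation, sketched here as standalone userspace C with hypothetical names. Every caller passed the same constant, so the parameter disappears and the constant sinks into the callee. ]

#include <stdio.h>

/* Before: static int read_full_page(int page, int mirror_num); */
static int read_full_page(int page)
{
        const int mirror_num = 0;       /* formerly a parameter, always 0 */

        printf("page %d, mirror %d\n", page, mirror_num);
        return 0;
}

int main(void)
{
        return read_full_page(1);       /* was read_full_page(1, 0) */
}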
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4a0675ec90fa..355db40a1cb5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3381,15 +3381,15 @@ static int __extent_read_full_page(struct page *page, struct bio **bio, return ret; } -int extent_read_full_page(struct page *page, int mirror_num) +int extent_read_full_page(struct page *page) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(page, &bio, mirror_num, &bio_flags, 0); + ret = __extent_read_full_page(page, &bio, 0, &bio_flags, 0); if (bio) - ret = submit_one_bio(bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, 0, bio_flags); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 272d5281bd4d..0ccb2dabc291 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -193,8 +193,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page, int mirror_num); +int extent_read_full_page(struct page *page); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7db2cbb469e1..aaf998f3133c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8040,7 +8040,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page, 0); + return extent_read_full_page(page); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) From c1be9c1ad5cca645ffd31284674ee4e28b9472de Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:08 +0300 Subject: [PATCH 127/161] btrfs: promote extent_read_full_page to btrfs_readpage Now that btrfs_readpage is the only caller of extent_read_full_page the latter can be open coded in the former. Use the occasion to rename __extent_read_full_page to extent_read_full_page. To facilitate this change submit_one_bio has to be exported as well.
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 21 ++++----------------- fs/btrfs/extent_io.h | 5 ++++- fs/btrfs/inode.c | 9 ++++++++- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 355db40a1cb5..402b88ddcbca 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -160,8 +160,8 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, return ret; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; @@ -3365,9 +3365,8 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -static int __extent_read_full_page(struct page *page, struct bio **bio, - int mirror_num, unsigned long *bio_flags, - unsigned int read_flags) +int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, + unsigned long *bio_flags, unsigned int read_flags) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); @@ -3381,18 +3380,6 @@ static int __extent_read_full_page(struct page *page, struct bio **bio, return ret; } -int extent_read_full_page(struct page *page) -{ - struct bio *bio = NULL; - unsigned long bio_flags = 0; - int ret; - - ret = __extent_read_full_page(page, &bio, 0, &bio_flags, 0); - if (bio) - ret = submit_one_bio(bio, 0, bio_flags); - return ret; -} - static void update_nr_written(struct writeback_control *wbc, unsigned long nr_written) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0ccb2dabc291..8fec00c50846 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -193,7 +193,10 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page); +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags); +int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, + unsigned long *bio_flags, unsigned int read_flags); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aaf998f3133c..55c5617c180b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8040,7 +8040,14 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page); + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = extent_read_full_page(page, &bio, 0, &bio_flags, 0); + if (bio) + ret = submit_one_bio(bio, 0, bio_flags); + return ret; } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) From 003c286aef3f73e8e7b5fce6040761966ba65b5b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:09 +0300 Subject: [PATCH 128/161] btrfs: sink mirror_num argument in extent_read_full_page It's always set to 0 from the sole caller - btrfs_readpage. 
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 402b88ddcbca..f43827bee7e6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3365,7 +3365,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, +int extent_read_full_page(struct page *page, struct bio **bio, unsigned long *bio_flags, unsigned int read_flags) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); @@ -3375,8 +3375,7 @@ int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(page, NULL, bio, mirror_num, bio_flags, read_flags, - NULL); + ret = __do_readpage(page, NULL, bio, 0, bio_flags, read_flags, NULL); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8fec00c50846..3562c9203de3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -195,7 +195,7 @@ int try_release_extent_buffer(struct page *page); int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags); -int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, +int extent_read_full_page(struct page *page, struct bio **bio, unsigned long *bio_flags, unsigned int read_flags); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 55c5617c180b..a08917e72f05 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8044,7 +8044,7 @@ int btrfs_readpage(struct file *file, struct page *page) unsigned long bio_flags = 0; int ret; - ret = extent_read_full_page(page, &bio, 0, &bio_flags, 0); + ret = extent_read_full_page(page, &bio, &bio_flags, 0); if (bio) ret = submit_one_bio(bio, 0, bio_flags); return ret; From 6f15af6060057babf03d3b0dce2b0d7a9258620c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:10 +0300 Subject: [PATCH 129/161] btrfs: sink read_flags argument into extent_read_full_page It's always set to 0 by its sole caller - btrfs_readpage. Simply remove it. 
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f43827bee7e6..1c959d66195c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3366,7 +3366,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } int extent_read_full_page(struct page *page, struct bio **bio, - unsigned long *bio_flags, unsigned int read_flags) + unsigned long *bio_flags) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); @@ -3375,7 +3375,7 @@ int extent_read_full_page(struct page *page, struct bio **bio, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(page, NULL, bio, 0, bio_flags, read_flags, NULL); + ret = __do_readpage(page, NULL, bio, 0, bio_flags, 0, NULL); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3562c9203de3..5fa248570145 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -196,7 +196,7 @@ int try_release_extent_buffer(struct page *page); int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags); int extent_read_full_page(struct page *page, struct bio **bio, - unsigned long *bio_flags, unsigned int read_flags); + unsigned long *bio_flags); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a08917e72f05..4b40d9703be6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8044,7 +8044,7 @@ int btrfs_readpage(struct file *file, struct page *page) unsigned long bio_flags = 0; int ret; - ret = extent_read_full_page(page, &bio, &bio_flags, 0); + ret = extent_read_full_page(page, &bio, &bio_flags); if (bio) ret = submit_one_bio(bio, 0, bio_flags); return ret; From fd513000eb279347df1f5a7ce720ae66b4ac3b1b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:11 +0300 Subject: [PATCH 130/161] btrfs: sink mirror_num argument in __do_readpage It's always set to 0 by the 2 callers so move it inside __do_readpage. 
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1c959d66195c..1ef857c0d784 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3142,9 +3142,8 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * return 0 on success, otherwise return error */ static int __do_readpage(struct page *page, struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, unsigned int read_flags, - u64 *prev_em_start) + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -3322,7 +3321,7 @@ static int __do_readpage(struct page *page, struct extent_map **em_cached, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, pg_offset, bio, - end_bio_extent_readpage, mirror_num, + end_bio_extent_readpage, 0, *bio_flags, this_bio_flag, force_bio_submit); @@ -3359,7 +3358,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], em_cached, bio, 0, bio_flags, + __do_readpage(pages[index], em_cached, bio, bio_flags, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } @@ -3375,7 +3374,7 @@ int extent_read_full_page(struct page *page, struct bio **bio, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(page, NULL, bio, 0, bio_flags, 0, NULL); + ret = __do_readpage(page, NULL, bio, bio_flags, 0, NULL); return ret; } From 0f208812493f4ce17910633ad3f7f3b7c8c35cd3 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 14:39:16 +0300 Subject: [PATCH 131/161] btrfs: open code extent_read_full_page to its sole caller This makes reading the code a tad easier by decreasing the level of indirection by one. 
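[ Illustration, not part of the commit: a userspace C sketch, with hypothetical names, of the indirection this stretch of the series peels away. A callback parameter that only ever received one function becomes a plain direct call, which is easier to read and to inline. ]

#include <stdio.h>

static int get_extent(int start)
{
        return start + 4096;    /* stand-in for the real lookup */
}

/* Before: static int do_readpage(int start, int (*get)(int));
 * every caller passed get_extent, so call it directly instead. */
static int do_readpage(int start)
{
        return get_extent(start);
}

int main(void)
{
        printf("%d\n", do_readpage(0));
        return 0;
}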
Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 24 +++++------------------- fs/btrfs/extent_io.h | 5 +++-- fs/btrfs/inode.c | 9 +++++++-- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1ef857c0d784..afac70ef0cc5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3141,9 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct page *page, struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, - unsigned int read_flags, u64 *prev_em_start) +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -3358,26 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], em_cached, bio, bio_flags, - REQ_RAHEAD, prev_em_start); + btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -int extent_read_full_page(struct page *page, struct bio **bio, - unsigned long *bio_flags) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = __do_readpage(page, NULL, bio, bio_flags, 0, NULL); - return ret; -} - static void update_nr_written(struct writeback_control *wbc, unsigned long nr_written) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5fa248570145..3bbc25b816ea 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -195,8 +195,9 @@ int try_release_extent_buffer(struct page *page); int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags); -int extent_read_full_page(struct page *page, struct bio **bio, - unsigned long *bio_flags); +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4b40d9703be6..c5b24a029eb3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8040,11 +8040,16 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - struct bio *bio = NULL; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; unsigned long bio_flags = 0; + struct bio *bio = NULL; int ret; - ret = extent_read_full_page(page, &bio, &bio_flags); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); if (bio) ret = submit_one_bio(bio, 0, bio_flags); return ret; From 633cc816f742ef34b0dac76581d732a28ca8d62d Mon Sep 17 00:00:00 2001 From: Nikolay
Borisov Date: Fri, 18 Sep 2020 12:15:49 +0300 Subject: [PATCH 132/161] btrfs: clean BTRFS_I usage in btrfs_destroy_inode Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c5b24a029eb3..d8cdc9829f84 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8657,21 +8657,21 @@ void btrfs_free_inode(struct inode *inode) kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -void btrfs_destroy_inode(struct inode *inode) +void btrfs_destroy_inode(struct inode *vfs_inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; - WARN_ON(!hlist_empty(&inode->i_dentry)); - WARN_ON(inode->i_data.nrpages); - WARN_ON(BTRFS_I(inode)->block_rsv.reserved); - WARN_ON(BTRFS_I(inode)->block_rsv.size); - WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->delalloc_bytes); - WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); - WARN_ON(BTRFS_I(inode)->csum_bytes); - WARN_ON(BTRFS_I(inode)->defrag_bytes); + WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); + WARN_ON(vfs_inode->i_data.nrpages); + WARN_ON(inode->block_rsv.reserved); + WARN_ON(inode->block_rsv.size); + WARN_ON(inode->outstanding_extents); + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8682,24 +8682,23 @@ void btrfs_destroy_inode(struct inode *inode) return; while (1) { - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - (u64)-1); + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) break; else { - btrfs_err(fs_info, + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); - btrfs_remove_ordered_extent(inode, ordered); + btrfs_remove_ordered_extent(vfs_inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } - btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); - inode_tree_del(BTRFS_I(inode)); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); - btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); - btrfs_put_root(BTRFS_I(inode)->root); + btrfs_qgroup_check_reserved_leak(inode); + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); + btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) From 71fe0a55dae7e5ef7762c5627891b37bf1b367bf Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 12:15:50 +0300 Subject: [PATCH 133/161] btrfs: switch btrfs_remove_ordered_extent to btrfs_inode Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- fs/btrfs/ordered-data.c | 7 +++---- fs/btrfs/ordered-data.h | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d8cdc9829f84..42d5e1c0a5d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2753,7 +2753,7 @@ out: * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. 
*/ - btrfs_remove_ordered_extent(inode, ordered_extent); + btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -8689,7 +8689,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); - btrfs_remove_ordered_extent(vfs_inode, ordered); + btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c8a13d143877..c93b4f2c778b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -463,13 +463,12 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * remove an ordered extent from the tree. No references are dropped * and waiters are woken up. */ -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; bool pending; @@ -527,7 +526,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; - trace_btrfs_ordered_extent_remove(BTRFS_I(inode), entry); + trace_btrfs_ordered_extent_remove(btrfs_inode, entry); if (!root->nr_ordered_extents) { spin_lock(&fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 6a1d5bf5aee3..8fe720da0b31 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -151,7 +151,7 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) } void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, From 3c38c877fcb9504b1db12cb24de9f514cedfae74 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 12:15:51 +0300 Subject: [PATCH 134/161] btrfs: sink inode argument in insert_ordered_extent_file_extent Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 42d5e1c0a5d1..eca42f95b0a8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2549,7 +2549,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; @@ -2569,8 +2568,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ - return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset, - &stack_fi, oe->qgroup_rsv); + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + oe->file_offset, &stack_fi, + oe->qgroup_rsv); } /* @@ -2667,8 +2667,7 @@ static int btrfs_finish_ordered_io(struct 
btrfs_ordered_extent *ordered_extent) logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_ordered_extent_file_extent(trans, inode, - ordered_extent); + ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, From 510f85edf1cdf1dbc937eed377442b2826bbb120 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 12:15:52 +0300 Subject: [PATCH 135/161] btrfs: remove inode argument from add_pending_csums It's used to reference the csum root which can be done from the trans handle as well. Simplify the signature and while at it also remove the noinline attribute as the function uses only at most 16 bytes of stack space. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eca42f95b0a8..0280eb31dd63 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2246,16 +2246,15 @@ out: * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ -static noinline int add_pending_csums(struct btrfs_trans_handle *trans, - struct inode *inode, struct list_head *list) +static int add_pending_csums(struct btrfs_trans_handle *trans, + struct list_head *list) { struct btrfs_ordered_sum *sum; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, - BTRFS_I(inode)->root->fs_info->csum_root, sum); + ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -2683,7 +2682,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; From c0a43603056cb79be8a0dffeedc544ce9b5d115d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 12:15:53 +0300 Subject: [PATCH 136/161] btrfs: remove inode argument from btrfs_start_ordered_extent The passed in ordered_extent struct is always well-formed and contains the inode making the explicit argument redundant. 
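[ Illustration, not part of the commit: a userspace C sketch of the signature change, with hypothetical names. The entry already carries a back-pointer to its inode, so the callee derives the inode from the entry instead of taking it as a redundant second argument. ]

#include <stdio.h>

struct inode {
        long ino;
};

struct ordered_extent {
        struct inode *inode;    /* always set when the entry is created */
        long file_offset;
};

/* Before: static void start_ordered_extent(struct inode *, struct ordered_extent *); */
static void start_ordered_extent(struct ordered_extent *entry)
{
        struct inode *inode = entry->inode;     /* was the first argument */

        printf("flush ino=%ld offset=%ld\n", inode->ino, entry->file_offset);
}

int main(void)
{
        struct inode i = { .ino = 257 };
        struct ordered_extent oe = { .inode = &i, .file_offset = 4096 };

        start_ordered_extent(&oe);
        return 0;
}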
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 3 +-- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ordered-data.c | 15 +++++++-------- fs/btrfs/ordered-data.h | 3 +-- 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2969d715c588..038e0afaf3d0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1491,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(&inode->vfs_inode, - ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); return -EAGAIN; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0280eb31dd63..b8481632dfc7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2357,7 +2357,7 @@ again: unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -4581,7 +4581,7 @@ again: &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -7149,7 +7149,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); else ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -8324,7 +8324,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index de65cf6105e4..ab408a23ba32 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1319,7 +1319,7 @@ again: break; unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); /* diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c93b4f2c778b..87bac9ecdf4c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -543,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered->inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); complete(&ordered->completion); } @@ -649,14 +649,13 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, * in the extent, and it waits on the io completion code to insert * metadata into the btree corresponding to the extent */ -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, - int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; + struct btrfs_inode *inode = BTRFS_I(entry->inode); - trace_btrfs_ordered_extent_start(BTRFS_I(inode), entry); + trace_btrfs_ordered_extent_start(inode, entry); /* * pages in the range can be dirty, clean or writeback. 
We @@ -664,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * for the flusher thread to find them */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->i_mapping, start, end); + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -719,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -939,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent_cached(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8fe720da0b31..c3a2325e64a4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -174,8 +174,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); From 8eb2fd00153a3a96a19c62ac9c6d48c2efebe5e8 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 21 Sep 2020 20:03:35 +0300 Subject: [PATCH 137/161] btrfs: use kvzalloc() to allocate clone_roots in btrfs_ioctl_send() btrfs_ioctl_send() used an open-coded kvzalloc() implementation earlier. The code was accidentally replaced with a kzalloc() call [1]. Restore the original code by using kvzalloc() to allocate sctx->clone_roots. [1] https://patchwork.kernel.org/patch/9757891/#20529627 Fixes: 818e010bf9d0 ("btrfs: replace opencoded kvzalloc with the helper") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Denis Efremov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ac0e942e8d7c..79b7d15ca50f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7149,7 +7149,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); + sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; From bae12df966f0e1a9b40a2c46d01a0ad79b2c865c Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 21 Sep 2020 20:03:36 +0300 Subject: [PATCH 138/161] btrfs: use kvcalloc for allocation in btrfs_ioctl_send() Replace the kvzalloc() call with kvcalloc(), which also checks the allocation size for overflow internally. There's a standalone overflow check in the function so we can return early on an invalid parameter combination. Use the array_size() helper to compute the memory size for clone_sources_tmp.
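[ Illustration, not part of the commit: a userspace C analogue of why calloc-style allocators matter. A hand-rolled count * size multiplication silently wraps, while calloc() on mainstream allocators detects the overflow and fails cleanly, which is the same guarantee kvcalloc() and array_size() give in the kernel. ]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        size_t count = SIZE_MAX / 2 + 2;        /* hostile element count */
        size_t size = 2;

        size_t wrapped = count * size;          /* silently wraps to 2 */
        void *p = calloc(count, size);          /* overflow checked: NULL */

        printf("wrapped size: %zu, calloc result: %p\n", wrapped, p);
        free(p);
        return 0;
}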
Cc: Kees Cook Signed-off-by: Denis Efremov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 79b7d15ca50f..b84f921ed6c0 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7061,7 +7061,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; - unsigned alloc_size; + size_t alloc_size; int sort_clone_roots = 0; if (!capable(CAP_SYS_ADMIN)) @@ -7147,15 +7147,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - - sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL); + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } - alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + alloc_size = array_size(sizeof(*arg->clone_sources), + arg->clone_sources_count); if (arg->clone_sources_count) { clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); From 6b613cc97f0ace77f92f7bc112b8f6ad3f52baf8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 22 Sep 2020 17:27:29 +0900 Subject: [PATCH 139/161] btrfs: reschedule when cloning lots of extents We have several occurrences of a soft lockup from fstest's generic/175 testcase, which look more or less like this one: watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [xfs_io:10030] Kernel panic - not syncing: softlockup: hung tasks CPU: 0 PID: 10030 Comm: xfs_io Tainted: G L 5.9.0-rc5+ #768 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4-rebuilt.opensuse.org 04/01/2014 Call Trace: dump_stack+0x77/0xa0 panic+0xfa/0x2cb watchdog_timer_fn.cold+0x85/0xa5 ? lockup_detector_update_enable+0x50/0x50 __hrtimer_run_queues+0x99/0x4c0 ? recalibrate_cpu_khz+0x10/0x10 hrtimer_run_queues+0x9f/0xb0 update_process_times+0x28/0x80 tick_handle_periodic+0x1b/0x60 __sysvec_apic_timer_interrupt+0x76/0x210 asm_call_on_stack+0x12/0x20 sysvec_apic_timer_interrupt+0x7f/0x90 asm_sysvec_apic_timer_interrupt+0x12/0x20 RIP: 0010:btrfs_tree_unlock+0x91/0x1a0 [btrfs] RSP: 0018:ffffc90007123a58 EFLAGS: 00000282 RAX: ffff8881cea2fbe0 RBX: ffff8881cea2fbe0 RCX: 0000000000000000 RDX: ffff8881d23fd200 RSI: ffffffff82045220 RDI: ffff8881cea2fba0 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000032 R10: 0000160000000000 R11: 0000000000001000 R12: 0000000000001000 R13: ffff8882357fd5b0 R14: ffff88816fa76e70 R15: ffff8881cea2fad0 ? btrfs_tree_unlock+0x15b/0x1a0 [btrfs] btrfs_release_path+0x67/0x80 [btrfs] btrfs_insert_replace_extent+0x177/0x2c0 [btrfs] btrfs_replace_file_extents+0x472/0x7c0 [btrfs] btrfs_clone+0x9ba/0xbd0 [btrfs] btrfs_clone_files.isra.0+0xeb/0x140 [btrfs] ? 
file_update_time+0xcd/0x120 btrfs_remap_file_range+0x322/0x3b0 [btrfs] do_clone_file_range+0xb7/0x1e0 vfs_clone_file_range+0x30/0xa0 ioctl_file_clone+0x8a/0xc0 do_vfs_ioctl+0x5b2/0x6f0 __x64_sys_ioctl+0x37/0xa0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f87977fc247 RSP: 002b:00007ffd51a2f6d8 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f87977fc247 RDX: 00007ffd51a2f710 RSI: 000000004020940d RDI: 0000000000000003 RBP: 0000000000000004 R08: 00007ffd51a79080 R09: 0000000000000000 R10: 00005621f11352f2 R11: 0000000000000206 R12: 0000000000000000 R13: 0000000000000000 R14: 00005621f128b958 R15: 0000000080000000 Kernel Offset: disabled ---[ end Kernel panic - not syncing: softlockup: hung tasks ]--- All of these lockup reports have the call chain btrfs_clone_files() -> btrfs_clone() in common. btrfs_clone_files() calls btrfs_clone() with both source and destination extents locked and loops over the source extent to create the clones. Conditionally reschedule in the btrfs_clone() loop, to give some time back to other processes. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reflink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 39b3269e5760..99aa87c08912 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -520,6 +520,8 @@ process_slot: ret = -EINTR; goto out; } + + cond_resched(); } ret = 0; From 2c53a14dd30124de9468316933715deaf7443096 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 15 Sep 2020 13:35:27 +0800 Subject: [PATCH 140/161] btrfs: use own btree inode io_tree owner id The btree inode is special compared to all other inode extent io_trees: although it has a btrfs inode, it doesn't have the track_uptodate bit at all. This means a lot of things, like extent locking, don't even need to be applied to the btree io tree. Since it's so special, add a new owner value for it to make debugging a little easier.
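As a sketch of what the dedicated owner id enables, a debugging helper could now identify the btree tree from the tree alone. The function below is hypothetical; only IO_TREE_BTREE_INODE_IO comes from the patch:

static inline bool example_is_btree_io_tree(const struct extent_io_tree *tree)
{
        /* no inode number comparison needed anymore */
        return tree->owner == IO_TREE_BTREE_INODE_IO;
}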
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-io-tree.h | 1 + include/trace/events/btrfs.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 53b588a7e150..ebc110231f30 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2062,7 +2062,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_INODE_IO, inode); + IO_TREE_BTREE_INODE_IO, inode); BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 250b8cbaaf97..53e7b5b5d6ad 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -40,6 +40,7 @@ struct io_failure_record; enum { IO_TREE_FS_PINNED_EXTENTS, IO_TREE_FS_EXCLUDED_EXTENTS, + IO_TREE_BTREE_INODE_IO, IO_TREE_INODE_IO, IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8d311062d376..ecd24c719de4 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -79,6 +79,7 @@ struct btrfs_space_info; #define IO_TREE_OWNER \ EM( IO_TREE_FS_PINNED_EXTENTS, "PINNED_EXTENTS") \ EM( IO_TREE_FS_EXCLUDED_EXTENTS, "EXCLUDED_EXTENTS") \ + EM( IO_TREE_BTREE_INODE_IO, "BTREE_INODE_IO") \ EM( IO_TREE_INODE_IO, "INODE_IO") \ EM( IO_TREE_INODE_IO_FAILURE, "INODE_IO_FAILURE") \ EM( IO_TREE_RELOC_BLOCKS, "RELOC_BLOCKS") \ From e2f896b3180e3b5d34b1d174b931205a43072d17 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 10:54:23 +0200 Subject: [PATCH 141/161] btrfs: send: use helpers for unaligned access to header members The header is mapped onto the send buffer and thus its members may be potentially unaligned so use the helpers instead of directly assigning the pointers. This has worked so far but let's use the helpers to make that clear. 
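As a minimal sketch of the pattern (the packed header below is a simplified stand-in for btrfs_tlv_header, not the real definition):

#include <asm/unaligned.h>

struct example_tlv_header {
        __le16 tlv_type;
        __le16 tlv_len;
} __attribute__ ((__packed__));

static void example_put_tlv(char *send_buf, size_t off, u16 attr, u16 len)
{
        struct example_tlv_header *hdr = (void *)(send_buf + off);

        /*
         * send_buf + off can land on any byte boundary, so write the
         * members through the unaligned helpers instead of assigning
         * cpu_to_le16() values directly.
         */
        put_unaligned_le16(attr, &hdr->tlv_type);
        put_unaligned_le16(len, &hdr->tlv_len);
}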
Signed-off-by: David Sterba --- fs/btrfs/send.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b84f921ed6c0..9f1ee52482c9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -577,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); - hdr->tlv_type = cpu_to_le16(attr); - hdr->tlv_len = cpu_to_le16(len); + put_unaligned_le16(attr, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); memcpy(hdr + 1, data, len); sctx->send_size += total_len; @@ -688,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->cmd = cpu_to_le16(cmd); + put_unaligned_le16(cmd, &hdr->cmd); return 0; } @@ -700,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx) u32 crc; hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); - hdr->crc = 0; + put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); + put_unaligned_le32(0, &hdr->crc); crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); - hdr->crc = cpu_to_le32(crc); + put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; return ret; From 6994ca367ce5160dea2718e0993542d148fa68f8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 13:32:34 +0200 Subject: [PATCH 142/161] btrfs: free-space-cache: use unaligned helpers to access data The free space inode stores the tracking data, checksums etc, using the io_ctl structure and moving the pointers. The data are generally aligned to at least 4 bytes (u32 for CRC) so it's not completely unaligned but for clarity we should use the proper helpers whenever a struct is initialized from io_ctl->cur pointer. Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8759f5a1d6a0..af0013d3df63 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *val; - io_ctl_map_page(io_ctl, 1); /* @@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - val = io_ctl->cur; - *val = cpu_to_le64(generation); + put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *gen; + u64 cache_gen; /* * Skip the crc area. 
If we don't check crcs then we just have a 64bit @@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - gen = io_ctl->cur; - if (le64_to_cpu(*gen) != generation) { + cache_gen = get_unaligned_le64(io_ctl->cur); + if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", - *gen, generation); + cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, return -ENOSPC; entry = io_ctl->cur; - entry->offset = cpu_to_le64(offset); - entry->bytes = cpu_to_le64(bytes); + put_unaligned_le64(offset, &entry->offset); + put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); @@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, } e = io_ctl->cur; - entry->offset = le64_to_cpu(e->offset); - entry->bytes = le64_to_cpu(e->bytes); + entry->offset = get_unaligned_le64(&e->offset); + entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); From e97659cefe1e9dc0bb4284195ba4ab380d0cf1fe Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 14:58:42 +0200 Subject: [PATCH 143/161] btrfs: use unaligned helpers for stack and header set/get helpers In the definitions generated by BTRFS_SETGET_HEADER_FUNCS there's direct pointer assignment but we should use the helpers for unaligned access for clarity. It hasn't been a problem so far because of the natural alignment. Similarly for BTRFS_SETGET_STACK_FUNCS, that usually get a structure from stack that has an aligned start but some members may not be aligned due to packing. This as well hasn't caused problems so far. Move the put/get_unaligned_le8 stubs to ctree.h so we can use them. 
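Conceptually, each generated accessor pair now goes through the unaligned helpers. Below is a simplified sketch of the stack variant with example names modeled on the ctree.h macros, not the macros themselves:

#define EXAMPLE_SETGET_STACK_FUNCS(name, type, member, bits)            \
static inline u##bits example_##name(const type *s)                     \
{                                                                       \
        return get_unaligned_le##bits(&s->member);                      \
}                                                                       \
static inline void example_set_##name(type *s, u##bits val)             \
{                                                                       \
        put_unaligned_le##bits(val, &s->member);                        \
}

/* packed, so members may sit at unaligned offsets once embedded */
struct example_timespec {
        __le64 sec;
        __le32 nsec;
} __attribute__ ((__packed__));

EXAMPLE_SETGET_STACK_FUNCS(timespec_sec, struct example_timespec, sec, 64);
EXAMPLE_SETGET_STACK_FUNCS(timespec_nsec, struct example_timespec, nsec, 32);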
Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 20 ++++++++++++++------ fs/btrfs/struct-funcs.c | 10 ---------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cd644c755142..5cb8af6af25d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1420,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token, #define cpu_to_le8(v) (v) #define __le8 u8 +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ @@ -1478,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = page_address(eb->pages[0]); \ - u##bits res = le##bits##_to_cpu(p->member); \ - return res; \ + return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = page_address(eb->pages[0]); \ - p->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &p->member); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ - return le##bits##_to_cpu(s->member); \ + return get_unaligned_le##bits(&s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ - s->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &s->member); \ } - static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 079b059818e9..c46be27be700 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -7,16 +7,6 @@ #include "ctree.h" -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) { From 1465af12e254a68706e110846f59cf0f09683184 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 22 Sep 2020 10:37:01 +0800 Subject: [PATCH 144/161] btrfs: tree-checker: fix false alert caused by legacy btrfs root item Commit 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check") introduced btrfs root item size check, however btrfs root item has two versions, the legacy one which just ends before generation_v2 member, is smaller than current btrfs root item size. This caused btrfs kernel to reject valid but old tree root leaves. Fix this problem by also allowing legacy root item, since kernel can already handle them pretty well and upgrade to newer root item format when needed. 
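The two accepted sizes can be expressed compactly. The helper below is an illustrative condensation of the check done in check_root_item(), not the literal kernel code:

#include <linux/stddef.h>       /* offsetof() */

static inline bool example_root_item_size_ok(u32 item_size)
{
        /* the legacy layout ends right before generation_v2 */
        return item_size == sizeof(struct btrfs_root_item) ||
               item_size == offsetof(struct btrfs_root_item,
                                     generation_v2);
}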
Reported-by: Martin Steigerwald Fixes: 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check") CC: stable@vger.kernel.org # 5.4+ Tested-By: Martin Steigerwald Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 17 ++++++++++++----- include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b1fee630f97..f0ffd5ee77bd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_root_item ri; + struct btrfs_root_item ri = { 0 }; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; int ret; @@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, if (ret < 0) return ret; - if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { + if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) && + btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) { generic_err(leaf, slot, - "invalid root item size, have %u expect %zu", - btrfs_item_size_nr(leaf, slot), sizeof(ri)); + "invalid root item size, have %u expect %zu or %u", + btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); } + /* + * For legacy root item, the members starting at generation_v2 will be + * all filled with 0. + * And since we allow geneartion_v2 as 0, it will still pass the check. + */ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), - sizeof(ri)); + btrfs_item_size_nr(leaf, slot)); /* Generation related */ if (btrfs_root_generation(&ri) > diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 9ba64ca6b4ac..6b885982ece6 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -4,6 +4,11 @@ #include #include +#ifdef __KERNEL__ +#include +#else +#include +#endif /* * This header contains the structure definitions and constants used @@ -644,6 +649,15 @@ struct btrfs_root_item { __le64 reserved[8]; /* for future */ } __attribute__ ((__packed__)); +/* + * Btrfs root item used to be smaller than current size. The old format ends + * at where member generation_v2 is. + */ +static inline __u32 btrfs_legacy_root_item_size(void) +{ + return offsetof(struct btrfs_root_item, generation_v2); +} + /* * this is used for both forward and backward root refs */ From 98272bb77bf4cc20ed1ffca89832d713e70ebf09 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Sep 2020 14:13:29 +0100 Subject: [PATCH 145/161] btrfs: send, orphanize first all conflicting inodes when processing references When doing an incremental send it is possible that when processing the new references for an inode we end up issuing rename or link operations that have an invalid path, which contains the orphanized name of a directory before we actually orphanized it, causing the receiver to fail. The following reproducer triggers such scenario: $ cat reproducer.sh #!/bin/bash mkfs.btrfs -f /dev/sdi >/dev/null mount /dev/sdi /mnt/sdi touch /mnt/sdi/a touch /mnt/sdi/b mkdir /mnt/sdi/testdir # We want "a" to have a lower inode number then "testdir" (257 vs 259). mv /mnt/sdi/a /mnt/sdi/testdir/a # Filesystem looks like: # # . 
(ino 256) # |----- testdir/ (ino 259) # | |----- a (ino 257) # | # |----- b (ino 258) btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1 btrfs send -f /tmp/snap1.send /mnt/sdi/snap1 # Now rename 259 to "testdir_2", then change the name of 257 to # "testdir" and make it a direct descendant of the root inode (256). # Also create a new link for inode 257 with the old name of inode 258. # By swapping the names and locations of several inodes we create a # nasty dependency chain of rename and link operations. mv /mnt/sdi/testdir/a /mnt/sdi/a2 touch /mnt/sdi/testdir/a mv /mnt/sdi/b /mnt/sdi/b2 ln /mnt/sdi/a2 /mnt/sdi/b mv /mnt/sdi/testdir /mnt/sdi/testdir_2 mv /mnt/sdi/a2 /mnt/sdi/testdir # Filesystem now looks like: # # . (ino 256) # |----- testdir_2/ (ino 259) # | |----- a (ino 260) # | # |----- testdir (ino 257) # |----- b (ino 257) # |----- b2 (ino 258) btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2 btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2 mkfs.btrfs -f /dev/sdj >/dev/null mount /dev/sdj /mnt/sdj btrfs receive -f /tmp/snap1.send /mnt/sdj btrfs receive -f /tmp/snap2.send /mnt/sdj umount /mnt/sdi umount /mnt/sdj When running the reproducer, the receive of the incremental send stream fails: $ ./reproducer.sh Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1' At subvol /mnt/sdi/snap1 Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2' At subvol /mnt/sdi/snap2 At subvol snap1 At snapshot snap2 ERROR: link b -> o259-6-0/a failed: No such file or directory The problem happens because of the following: 1) Before we start iterating the list of new references for inode 257, we generate its current path and store it at @valid_path, done at the very beginning of process_recorded_refs(). The generated path is "o259-6-0/a", containing the orphanized name for inode 259; 2) Then we iterate over the list of new references, which has the references "b" and "testdir" in that specific order; 3) We process reference "b" first, because it is in the list before reference "testdir". We then issue a link operation to create the new reference "b" using a target path corresponding to the content at @valid_path, which corresponds to "o259-6-0/a". However, we haven't yet orphanized inode 259; its name is still "testdir", and not "o259-6-0". The orphanization of 259 did not happen yet because we will process the reference named "testdir" for inode 257 only in the next iteration of the loop that goes over the list of new references. Fix the issue by having a preliminary iteration over all the new references at process_recorded_refs(). This iteration is responsible only for doing the orphanization of other inodes that have an old reference that conflicts with one of the new references of the inode we are currently processing. The emission of rename and link operations now happens in the next iteration of the new references. A test case for fstests will follow soon.
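Reduced to a sketch, the resulting control flow looks like this; the two helpers are hypothetical names standing in for logic that process_recorded_refs() performs inline:

static int example_process_recorded_refs(struct send_ctx *sctx)
{
        struct recorded_ref *cur;
        int ret;

        /*
         * Pass 1: orphanize every unprocessed inode whose old name
         * conflicts with one of our new references, before any path
         * is used in a command.
         */
        list_for_each_entry(cur, &sctx->new_refs, list) {
                ret = example_orphanize_conflicts(sctx, cur);
                if (ret < 0)
                        return ret;
        }

        /*
         * Pass 2: emit the link/rename commands. Any ancestor that
         * needed orphanizing already carries its o<ino>-<gen>-<seq>
         * name, so the paths generated here are valid.
         */
        list_for_each_entry(cur, &sctx->new_refs, list) {
                ret = example_emit_ref_commands(sctx, cur);
                if (ret < 0)
                        return ret;
        }
        return 0;
}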
CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 127 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 87 insertions(+), 40 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9f1ee52482c9..f9c14c33e753 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3873,52 +3873,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) goto out; } + /* + * Before doing any rename and link operations, do a first pass on the + * new references to orphanize any unprocessed inodes that may have a + * reference that conflicts with one of the new references of the current + * inode. This needs to happen first because a new reference may conflict + * with the old reference of a parent directory, so we must make sure + * that the path used for link and rename commands don't use an + * orphanized name when an ancestor was not yet orphanized. + * + * Example: + * + * Parent snapshot: + * + * . (ino 256) + * |----- testdir/ (ino 259) + * | |----- a (ino 257) + * | + * |----- b (ino 258) + * + * Send snapshot: + * + * . (ino 256) + * |----- testdir_2/ (ino 259) + * | |----- a (ino 260) + * | + * |----- testdir (ino 257) + * |----- b (ino 257) + * |----- b2 (ino 258) + * + * Processing the new reference for inode 257 with name "b" may happen + * before processing the new reference with name "testdir". If so, we + * must make sure that by the time we send a link command to create the + * hard link "b", inode 259 was already orphanized, since the generated + * path in "valid_path" already contains the orphanized name for 259. + * We are processing inode 257, so only later when processing 259 we do + * the rename operation to change its temporary (orphanized) name to + * "testdir_2". + */ list_for_each_entry(cur, &sctx->new_refs, list) { - /* - * We may have refs where the parent directory does not exist - * yet. This happens if the parent directories inum is higher - * than the current inum. To handle this case, we create the - * parent directory out of order. But we need to check if this - * did already happen before due to other refs in the same dir. - */ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - if (ret == inode_state_will_create) { - ret = 0; - /* - * First check if any of the current inodes refs did - * already create the dir. - */ - list_for_each_entry(cur2, &sctx->new_refs, list) { - if (cur == cur2) - break; - if (cur2->dir == cur->dir) { - ret = 1; - break; - } - } - - /* - * If that did not happen, check if a previous inode - * did already create the dir. - */ - if (!ret) - ret = did_create_dir(sctx, cur->dir); - if (ret < 0) - goto out; - if (!ret) { - ret = send_create_inode(sctx, cur->dir); - if (ret < 0) - goto out; - } - } + if (ret == inode_state_will_create) + continue; /* - * Check if this new ref would overwrite the first ref of - * another unprocessed inode. If yes, orphanize the - * overwritten inode. If we find an overwritten ref that is - * not the first ref, simply unlink it. + * Check if this new ref would overwrite the first ref of another + * unprocessed inode. If yes, orphanize the overwritten inode. + * If we find an overwritten ref that is not the first ref, + * simply unlink it. 
*/ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, cur->name, cur->name_len, @@ -3997,6 +4001,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) } } + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * than the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { ret = wait_for_dest_dir_move(sctx, cur, is_orphan); if (ret < 0) From 9c2b4e0347067396ceb3ae929d6888c81d610259 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Sep 2020 14:13:30 +0100 Subject: [PATCH 146/161] btrfs: send, recompute reference path after orphanization of a directory During an incremental send, when an inode has multiple new references we might end up emitting rename operations for orphanizations that have a source path that is no longer valid due to a previous orphanization of some directory inode. This causes the receiver to fail since it tries to rename a path that does not exists. Example reproducer: $ cat reproducer.sh #!/bin/bash mkfs.btrfs -f /dev/sdi >/dev/null mount /dev/sdi /mnt/sdi touch /mnt/sdi/f1 touch /mnt/sdi/f2 mkdir /mnt/sdi/d1 mkdir /mnt/sdi/d1/d2 # Filesystem looks like: # # . (ino 256) # |----- f1 (ino 257) # |----- f2 (ino 258) # |----- d1/ (ino 259) # |----- d2/ (ino 260) btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1 btrfs send -f /tmp/snap1.send /mnt/sdi/snap1 # Now do a series of changes such that: # # *) inode 258 has one new hardlink and the previous name changed # # *) both names conflict with the old names of two other inodes: # # 1) the new name "d1" conflicts with the old name of inode 259, # under directory inode 256 (root) # # 2) the new name "d2" conflicts with the old name of inode 260 # under directory inode 259 # # *) inodes 259 and 260 now have the old names of inode 258 # # *) inode 257 is now located under inode 260 - an inode with a number # smaller than the inode (258) for which we created a second hard # link and swapped its names with inodes 259 and 260 # ln /mnt/sdi/f2 /mnt/sdi/d1/f2_link mv /mnt/sdi/f1 /mnt/sdi/d1/d2/f1 # Swap d1 and f2. mv /mnt/sdi/d1 /mnt/sdi/tmp mv /mnt/sdi/f2 /mnt/sdi/d1 mv /mnt/sdi/tmp /mnt/sdi/f2 # Swap d2 and f2_link mv /mnt/sdi/f2/d2 /mnt/sdi/tmp mv /mnt/sdi/f2/f2_link /mnt/sdi/f2/d2 mv /mnt/sdi/tmp /mnt/sdi/f2/f2_link # Filesystem now looks like: # # . 
(ino 256) # |----- d1 (ino 258) # |----- f2/ (ino 259) # |----- f2_link/ (ino 260) # | |----- f1 (ino 257) # | # |----- d2 (ino 258) btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2 btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2 mkfs.btrfs -f /dev/sdj >/dev/null mount /dev/sdj /mnt/sdj btrfs receive -f /tmp/snap1.send /mnt/sdj btrfs receive -f /tmp/snap2.send /mnt/sdj umount /mnt/sdi umount /mnt/sdj When executed the receive of the incremental stream fails: $ ./reproducer.sh Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1' At subvol /mnt/sdi/snap1 Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2' At subvol /mnt/sdi/snap2 At subvol snap1 At snapshot snap2 ERROR: rename d1/d2 -> o260-6-0 failed: No such file or directory This happens because: 1) When processing inode 257 we end up computing the name for inode 259 because it is an ancestor in the send snapshot, and at that point it still has its old name, "d1", from the parent snapshot because inode 259 was not yet processed. We then cache that name, which is valid until we start processing inode 259 (or set the progress to 260 after processing its references); 2) Later we start processing inode 258 and collecting all its new references into the list sctx->new_refs. The first reference in the list happens to be the reference for name "d1" while the reference for name "d2" is next (the last element of the list). We compute the full path "d1/d2" for this second reference and store it in the reference (its ->full_path member). The path used for the new parent directory was "d1" and not "f2" because inode 259, the new parent, was not yet processed; 3) When we start processing the new references at process_recorded_refs() we start with the first reference in the list, for the new name "d1". Because there is a conflicting inode that was not yet processed, which is directory inode 259, we orphanize it, renaming it from "d1" to "o259-6-0"; 4) Then we start processing the new reference for name "d2", and we realize it conflicts with the reference of inode 260 in the parent snapshot. So we issue an orphanization operation for inode 260 by emitting a rename operation with a destination path of "o260-6-0" and a source path of "d1/d2" - this source path is the value we stored in the reference earlier at step 2), corresponding to the ->full_path member of the reference, however that path is no longer valid due to the orphanization of the directory inode 259 in step 3). This makes the receiver fail since the path does not exists, it should have been "o259-6-0/d2". Fix this by recomputing the full path of a reference before emitting an orphanization if we previously orphanized any directory, since that directory could be a parent in the new path. This is a rare scenario so keeping it simple and not checking if that previously orphanized directory is in fact an ancestor of the inode we are trying to orphanize. A test case for fstests follows soon. 
CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f9c14c33e753..340c76a12ce1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3805,6 +3805,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) return 0; } +/* + * When processing the new references for an inode we may orphanize an existing + * directory inode because its old name conflicts with one of the new references + * of the current inode. Later, when processing another new reference of our + * inode, we might need to orphanize another inode, but the path we have in the + * reference reflects the pre-orphanization name of the directory we previously + * orphanized. For example: + * + * parent snapshot looks like: + * + * . (ino 256) + * |----- f1 (ino 257) + * |----- f2 (ino 258) + * |----- d1/ (ino 259) + * |----- d2/ (ino 260) + * + * send snapshot looks like: + * + * . (ino 256) + * |----- d1 (ino 258) + * |----- f2/ (ino 259) + * |----- f2_link/ (ino 260) + * | |----- f1 (ino 257) + * | + * |----- d2 (ino 258) + * + * When processing inode 257 we compute the name for inode 259 as "d1", and we + * cache it in the name cache. Later when we start processing inode 258, when + * collecting all its new references we set a full path of "d1/d2" for its new + * reference with name "d2". When we start processing the new references we + * start by processing the new reference with name "d1", and this results in + * orphanizing inode 259, since its old reference causes a conflict. Then we + * move on the next new reference, with name "d2", and we find out we must + * orphanize inode 260, as its old reference conflicts with ours - but for the + * orphanization we use a source path corresponding to the path we stored in the + * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the + * receiver fail since the path component "d1/" no longer exists, it was renamed + * to "o259-6-0/" when processing the previous new reference. So in this case we + * must recompute the path in the new reference and use it for the new + * orphanization operation. + */ +static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) +{ + char *name; + int ret; + + name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + fs_path_reset(ref->full_path); + ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); + if (ret < 0) + goto out; + + ret = fs_path_add(ref->full_path, name, ref->name_len); + if (ret < 0) + goto out; + + /* Update the reference's base name pointer. */ + set_ref_path(ref, ref->full_path); +out: + kfree(name); + return ret; +} + /* * This does all the move/link/unlink/rmdir magic. */ @@ -3939,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) struct name_cache_entry *nce; struct waiting_dir_move *wdm; + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) From 9a446d6a9fc7df48e53c54b161dfdfc6f5031d4b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:33 +0300 Subject: [PATCH 147/161] btrfs: replace readpage_end_io_hook with direct calls Don't call readpage_end_io_hook for the btree inode. 
Instead of relying on indirect calls to implement metadata buffer validation, simply check if the inode whose page we are processing equals the btree inode. If it does, call the necessary function. This is an improvement in 2 directions: 1. We aren't paying the penalty of indirect calls in a post-speculation-attacks world. 2. The function is now named more explicitly so it's obvious what's going on. This is in preparation for removing struct extent_io_ops altogether. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 4 +++- fs/btrfs/extent_io.c | 9 ++++++--- fs/btrfs/inode.c | 7 +++---- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5cb8af6af25d..391c2b25cca1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2969,6 +2969,8 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ebc110231f30..1c84a7ea9894 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -524,9 +524,9 @@ static int check_tree_block_fsid(struct extent_buffer *eb) return 1; } -static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror) { u64 found_start; int found_level; @@ -4638,5 +4638,5 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, - .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_end_io_hook = NULL }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 89b6a709a184..bc2e49246199 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,7 +76,9 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); - +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index afac70ef0cc5..6cee6b611d0c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2851,9 +2851,12 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - ret = tree->ops->readpage_end_io_hook(io_bio, offset, - page, start, end, - mirror); + if (data_inode) + ret = btrfs_verify_data_csum(io_bio, offset, page, + start, end, mirror); + else + ret = btrfs_validate_metadata_buffer(io_bio, + offset, page, start, end, mirror); if (ret) uptodate = 0; else diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b8481632dfc7..982187b0cd59 100644 --- a/fs/btrfs/inode.c +++
b/fs/btrfs/inode.c @@ -2832,9 +2832,8 @@ zeroit: * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. */ -static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; @@ -10261,7 +10260,7 @@ static const struct file_operations btrfs_dir_file_operations = { static const struct extent_io_ops btrfs_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, - .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .readpage_end_io_hook = NULL }; /* From 1f03d9cfda53d86e7822bf0dd5f0ef5d8fa6d1ad Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:34 +0300 Subject: [PATCH 148/161] btrfs: remove extent_io_ops::readpage_end_io_hook It's no longer used so let's remove it. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 - fs/btrfs/extent_io.h | 5 +---- fs/btrfs/inode.c | 1 - 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1c84a7ea9894..4c1f63924498 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4638,5 +4638,4 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, - .readpage_end_io_hook = NULL }; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3bbc25b816ea..68c40902e571 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -76,13 +76,10 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct extent_io_ops { /* - * The following callbacks must be always defined, the function + * The following callback must be always defined, the function * pointer will be called unconditionally. */ submit_bio_hook_t *submit_bio_hook; - int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror); }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 982187b0cd59..204cd9d39894 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10260,7 +10260,6 @@ static const struct file_operations btrfs_dir_file_operations = { static const struct extent_io_ops btrfs_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, - .readpage_end_io_hook = NULL }; /* From cd0537449c275ca7da6f6e3ea2389ae5225c6b4e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:35 +0300 Subject: [PATCH 149/161] btrfs: call submit_bio_hook directly in submit_one_bio BTRFS has 2 inode types (for the purposes of the code in submit_one_bio) - ordinary data inodes (including the freespace inode) and the btree inode. Both of these implement submit_bio_hook so btrfsic_submit_bio can never be called from submit_one_bio so just remove it. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6cee6b611d0c..1a96251ade1f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -168,11 +168,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio->bi_private = NULL; - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags); - else - btrfsic_submit_bio(bio); + ret = tree->ops->submit_bio_hook(tree->private_data, bio, mirror_num, + bio_flags); return blk_status_to_errno(ret); } From be17b3afc4a6255008d4bbf7b1b45a36ac54bece Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:36 +0300 Subject: [PATCH 150/161] btrfs: don't opencode is_data_inode in end_bio_extent_readpage Use the is_data_inode helper. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1a96251ade1f..eb6094c87a55 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2816,8 +2816,6 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - bool data_inode = btrfs_ino(BTRFS_I(inode)) - != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2848,7 +2846,7 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - if (data_inode) + if (is_data_inode(inode)) ret = btrfs_verify_data_csum(io_bio, offset, page, start, end, mirror); else @@ -2866,7 +2864,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (data_inode) { + if (is_data_inode(inode)) { /* * The generic bio_readpage_error handles errors the From 908930f3edad79f7e5e35b7fac09da8c54ca96b5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:37 +0300 Subject: [PATCH 151/161] btrfs: stop calling submit_bio_hook for data inodes Instead export and rename the function to btrfs_submit_data_bio and call it directly in submit_one_bio. This avoids paying the cost for speculative attacks mitigations and improves code readability. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/extent_io.c | 10 +++++++--- fs/btrfs/inode.c | 8 +++----- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 391c2b25cca1..75fe6b34e6a2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2969,6 +2969,8 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb6094c87a55..bf1cbc171bec 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -168,8 +168,12 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio->bi_private = NULL; - ret = tree->ops->submit_bio_hook(tree->private_data, bio, mirror_num, - bio_flags); + if (is_data_inode(tree->private_data)) + ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + bio_flags); + else + ret = tree->ops->submit_bio_hook(tree->private_data, bio, + mirror_num, bio_flags); return blk_status_to_errno(ret); } @@ -2879,7 +2883,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (!btrfs_submit_read_repair(inode, bio, offset, page, start - page_offset(page), start, end, mirror, - tree->ops->submit_bio_hook)) { + btrfs_submit_data_bio)) { uptodate = !bio->bi_status; offset += len; continue; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 204cd9d39894..dddff27774a1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2184,9 +2184,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, * * c-3) otherwise: async submit */ -static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -10258,8 +10257,7 @@ static const struct file_operations btrfs_dir_file_operations = { }; static const struct extent_io_ops btrfs_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btrfs_submit_bio_hook, + .submit_bio_hook = NULL }; /* From 1b36294a6cd50ee5dcfee71ba5241cbb60f0295e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:38 +0300 Subject: [PATCH 152/161] btrfs: call submit_bio_hook directly for metadata pages No need to go through a function pointer indirection; simply call submit_bio_hook directly by exporting and renaming the helper to btrfs_submit_metadata_bio. This makes the code more readable and should result in somewhat faster code due to no longer paying the price for speculative attack mitigations that come with indirect function calls.
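Taken together with the previous patches in this series, bio submission now dispatches on the inode type instead of going through extent_io_ops. A condensed sketch of what submit_one_bio() looks like once both direct calls are in place (simplified, not the full function body):

static blk_status_t example_submit_one_bio(struct extent_io_tree *tree,
                                           struct bio *bio, int mirror_num,
                                           unsigned long bio_flags)
{
        /* data inodes and the btree inode are the only two cases */
        if (is_data_inode(tree->private_data))
                return btrfs_submit_data_bio(tree->private_data, bio,
                                             mirror_num, bio_flags);
        return btrfs_submit_metadata_bio(tree->private_data, bio,
                                         mirror_num, bio_flags);
}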
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 +++----- fs/btrfs/disk-io.h | 2 ++ fs/btrfs/extent_io.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4c1f63924498..d07ec69eb525 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -814,9 +814,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info, return 1; } -static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(fs_info, BTRFS_I(inode)); @@ -4636,6 +4635,5 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } static const struct extent_io_ops btree_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btree_submit_bio_hook, + .submit_bio_hook = NULL }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bc2e49246199..fee69ced58b4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -79,6 +79,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bf1cbc171bec..199dcc7d2db6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -172,8 +172,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, bio_flags); else - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags); + ret = btrfs_submit_metadata_bio(tree->private_data, bio, + mirror_num, bio_flags); return blk_status_to_errno(ret); } From 905eb88bceb253aa0f475eb04eec149d6059126e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:39 +0300 Subject: [PATCH 153/161] btrfs: remove struct extent_io_ops It's no longer used, so just remove the struct and any related code which was initialising it for inodes. No functional changes. Removing 8 bytes from extent_io_tree in turn reduces the size of other structures where it is embedded, notably btrfs_inode, where it reduces the size by 24 bytes.
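The arithmetic behind those numbers can be illustrated with a small user space sketch. The structs are toys, and the factor of three rests on the assumption that btrfs_inode embeds three extent_io_trees at this point (io_tree, io_failure_tree, file_extent_tree):

#include <stdio.h>

struct example_ops { int (*hook)(void); };

/* toy stand-ins: the only difference is the dropped ops pointer */
struct example_tree_before {
        void *root;
        unsigned long dirty_bytes;
        const struct example_ops *ops;
};

struct example_tree_after {
        void *root;
        unsigned long dirty_bytes;
};

int main(void)
{
        /*
         * On a 64-bit build the pointer costs 8 bytes per tree, and
         * three embedded trees account for the 24 byte reduction in
         * btrfs_inode quoted above.
         */
        printf("before=%zu after=%zu\n",
               sizeof(struct example_tree_before),
               sizeof(struct example_tree_after));
        return 0;
}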
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- fs/btrfs/disk-io.c | 7 ------- fs/btrfs/extent-io-tree.h | 1 - fs/btrfs/extent_io.c | 2 -- fs/btrfs/extent_io.h | 9 --------- fs/btrfs/inode.c | 16 ---------------- fs/btrfs/tests/inode-tests.c | 1 - 7 files changed, 38 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 75fe6b34e6a2..aac3d6f4e35b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3583,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); - static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d07ec69eb525..3d39f5d47ad3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,7 +50,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -2065,8 +2064,6 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); - BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -4633,7 +4630,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } - -static const struct extent_io_ops btree_extent_io_ops = { - .submit_bio_hook = NULL -}; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 53e7b5b5d6ad..9800a8306368 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -63,7 +63,6 @@ struct extent_io_tree { u8 owner; spinlock_t lock; - const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 199dcc7d2db6..60f5f68d892d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -281,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, { tree->fs_info = fs_info; tree->state = RB_ROOT; - tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); tree->private_data = private_data; @@ -3055,7 +3054,6 @@ static int submit_extent_page(unsigned int opf, else contig = bio_end_sector(bio) == sector; - ASSERT(tree->ops); if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 68c40902e571..f39d02e7f7ef 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -74,15 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); -struct extent_io_ops { - /* - * The following callback must be always defined, the function - * pointer will be called unconditionally. 
- */ - submit_bio_hook_t *submit_bio_hook; -}; - - #define INLINE_EXTENT_BUFFER_PAGES 16 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) struct extent_buffer { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dddff27774a1..d526833b5ec0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; -static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; @@ -141,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, static int btrfs_dirty_inode(struct inode *inode); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode) -{ - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -} -#endif - static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -3391,7 +3383,6 @@ cache_acl: switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; @@ -6307,7 +6298,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; d_instantiate_new(dentry, inode); out_unlock: @@ -9518,7 +9508,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9838,7 +9827,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) @@ -10256,10 +10244,6 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static const struct extent_io_ops btrfs_extent_io_ops = { - .submit_bio_hook = NULL -}; - /* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. They assume diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index cc54d4973a74..e6719f7db386 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -949,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } BTRFS_I(inode)->root = root; - btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0, From 124604eb50f88e6c1ff6587bdd2fc09bea810b32 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Sep 2020 16:44:32 -0400 Subject: [PATCH 154/161] btrfs: init device stats for seed devices We recently started recording device stats across the fleet, and noticed a large increase in messages such as this BTRFS warning (device dm-0): get dev_stats failed, not yet valid on our tiers that use seed devices for their root devices. 
This is because we do not initialize the device stats for any seed devices if we have a sprout device and mount using that sprout device. The basic steps for reproducing are: $ mkfs seed device $ mount seed device # fill seed device $ umount seed device $ btrfstune -S 1 seed device $ mount seed device $ btrfs device add -f sprout device /mnt/wherever $ umount /mnt/wherever $ mount sprout device /mnt/wherever $ btrfs device stats /mnt/wherever This will fail with the above message in dmesg. Fix this by iterating over the fs_devices->seed if they exist in btrfs_init_dev_stats. This fixed the problem and properly reports the stats for both devices. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ rename to btrfs_device_init_dev_stats ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 87 ++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6c7c8819cb31..d25650600179 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7223,61 +7223,66 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } +static void btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) +{ + struct btrfs_dev_stats_item *ptr; + struct extent_buffer *eb; + struct btrfs_key key; + int item_size; + int i, ret, slot; + + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); + if (ret) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); + device->dev_stats_valid = 1; + btrfs_release_path(path); + return; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_set(device, i, 0); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); +} + int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { - struct btrfs_key key; - struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct extent_buffer *eb; - int slot; - int ret = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; - int i; path = btrfs_alloc_path(); if (!path) return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - int item_size; - struct btrfs_dev_stats_item *ptr; - - key.objectid = BTRFS_DEV_STATS_OBJECTID; - key.type = BTRFS_PERSISTENT_ITEM_KEY; - key.offset = device->devid; - ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); - if (ret) { - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_set(device, i, 0); - device->dev_stats_valid = 1; - btrfs_release_path(path); - continue; - } - slot = path->slots[0]; - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); - - ptr = btrfs_item_ptr(eb, slot, - struct btrfs_dev_stats_item); - - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { - if (item_size >= (1 + i) * sizeof(__le64)) - btrfs_dev_stat_set(device, i, - btrfs_dev_stats_value(eb, ptr, i)); - else - 
btrfs_dev_stat_set(device, i, 0); - } - - device->dev_stats_valid = 1; - btrfs_dev_stat_print_on_load(device); - btrfs_release_path(path); + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_device_init_dev_stats(device, path); + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) + btrfs_device_init_dev_stats(device, path); } mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); - return ret < 0 ? ret : 0; + return 0; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, From 92e26df43b1a9725c205a7e4416240ce72eac4a3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Sep 2020 16:44:33 -0400 Subject: [PATCH 155/161] btrfs: return error if we're unable to read device stats I noticed when fixing device stats for seed devices that we simply threw away the return value from btrfs_search_slot(). That was done because we may legitimately not have stat items, but we could also get a real error, and thus miss reporting it up the chain. Fix this by returning ret if it's an actual error, then stop trying to init the rest of the devices' stats and return the error up the chain. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d25650600179..46f4efd58652 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7223,8 +7223,8 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } -static void btrfs_device_init_dev_stats(struct btrfs_device *device, - struct btrfs_path *path) +static int btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) { struct btrfs_dev_stats_item *ptr; struct extent_buffer *eb; @@ -7241,7 +7241,7 @@ static void btrfs_device_init_dev_stats(struct btrfs_device *device, btrfs_dev_stat_set(device, i, 0); device->dev_stats_valid = 1; btrfs_release_path(path); - return; + return ret < 0 ?
ret : 0; } slot = path->slots[0]; eb = path->nodes[0]; @@ -7260,6 +7260,8 @@ static void btrfs_device_init_dev_stats(struct btrfs_device *device, device->dev_stats_valid = 1; btrfs_dev_stat_print_on_load(device); btrfs_release_path(path); + + return 0; } int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) @@ -7267,22 +7269,30 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; + int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - btrfs_device_init_dev_stats(device, path); - list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - list_for_each_entry(device, &seed_devs->devices, dev_list) - btrfs_device_init_dev_stats(device, path); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; + } + } +out: mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); - return 0; + return ret; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, From c33fe275b530f1ce1ab99923d50eb399f53ed545 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:08 -0500 Subject: [PATCH 156/161] fs: remove no longer used dio_end_io() Since we removed the last user of dio_end_io() when btrfs got converted to iomap infrastructure ("btrfs: switch to iomap for direct IO"), remove the helper function dio_end_io(). Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/direct-io.c | 19 ------------------- include/linux/fs.h | 2 -- 2 files changed, 21 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 183299892465..abf535b036ab 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); } -/** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. 
- */ -void dio_end_io(struct bio *bio) -{ - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); -} -EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, diff --git a/include/linux/fs.h b/include/linux/fs.h index 7519ae003a08..8e2842a9c0b3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3079,8 +3079,6 @@ enum { DIO_SKIP_HOLES = 0x02, }; -void dio_end_io(struct bio *bio); - ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, From e3c57805f8f2215a1586741009111b92930cc6ac Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:09 -0500 Subject: [PATCH 157/161] btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK Since we now perform direct reads using i_rwsem, we can remove this inode flag used to co-ordinate unlocked reads. The truncate call takes i_rwsem. This means it is correctly synchronized with concurrent direct reads. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Christoph Hellwig Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 18 ------------------ fs/btrfs/inode.c | 3 --- 2 files changed, 21 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6fdb46d58299..738009a22320 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -33,7 +33,6 @@ enum { BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, - BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, }; @@ -334,23 +333,6 @@ struct btrfs_dio_private { u8 csums[]; }; -/* - * Disable DIO read nolock optimization, so new dio readers will be forced - * to grab i_mutex. It is used to avoid the endless truncate due to - * nonlocked dio read. - */ -static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) -{ - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); - smp_mb(); -} - -static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) -{ - smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d526833b5ec0..36efed0a24de 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4844,10 +4844,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the endless truncate */ - btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { From 572c83acdcdafeb04e70aa46be1fa539310be20c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 Sep 2020 08:53:54 -0400 Subject: [PATCH 158/161] btrfs: cleanup cow block on error In fstest btrfs/064 a transaction abort in __btrfs_cow_block could lead to a system lockup. 
It gets stuck trying to write back inodes, and the write back thread was trying to lock an extent buffer: $ cat /proc/2143497/stack [<0>] __btrfs_tree_lock+0x108/0x250 [<0>] lock_extent_buffer_for_io+0x35e/0x3a0 [<0>] btree_write_cache_pages+0x15a/0x3b0 [<0>] do_writepages+0x28/0xb0 [<0>] __writeback_single_inode+0x54/0x5c0 [<0>] writeback_sb_inodes+0x1e8/0x510 [<0>] wb_writeback+0xcc/0x440 [<0>] wb_workfn+0xd7/0x650 [<0>] process_one_work+0x236/0x560 [<0>] worker_thread+0x55/0x3c0 [<0>] kthread+0x13a/0x150 [<0>] ret_from_fork+0x1f/0x30 This is because we got an error while COWing a block, specifically here if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } } [16402.241552] BTRFS: Transaction aborted (error -2) [16402.242362] WARNING: CPU: 1 PID: 2563188 at fs/btrfs/ctree.c:1074 __btrfs_cow_block+0x376/0x540 [16402.249469] CPU: 1 PID: 2563188 Comm: fsstress Not tainted 5.9.0-rc6+ #8 [16402.249936] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 [16402.250525] RIP: 0010:__btrfs_cow_block+0x376/0x540 [16402.252417] RSP: 0018:ffff9cca40e578b0 EFLAGS: 00010282 [16402.252787] RAX: 0000000000000025 RBX: 0000000000000002 RCX: ffff9132bbd19388 [16402.253278] RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9132bbd19380 [16402.254063] RBP: ffff9132b41a49c0 R08: 0000000000000000 R09: 0000000000000000 [16402.254887] R10: 0000000000000000 R11: ffff91324758b080 R12: ffff91326ef17ce0 [16402.255694] R13: ffff91325fc0f000 R14: ffff91326ef176b0 R15: ffff9132815e2000 [16402.256321] FS: 00007f542c6d7b80(0000) GS:ffff9132bbd00000(0000) knlGS:0000000000000000 [16402.256973] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [16402.257374] CR2: 00007f127b83f250 CR3: 0000000133480002 CR4: 0000000000370ee0 [16402.257867] Call Trace: [16402.258072] btrfs_cow_block+0x109/0x230 [16402.258356] btrfs_search_slot+0x530/0x9d0 [16402.258655] btrfs_lookup_file_extent+0x37/0x40 [16402.259155] __btrfs_drop_extents+0x13c/0xd60 [16402.259628] ? 
btrfs_block_rsv_migrate+0x4f/0xb0 [16402.259949] btrfs_replace_file_extents+0x190/0x820 [16402.260873] btrfs_clone+0x9ae/0xc00 [16402.261139] btrfs_extent_same_range+0x66/0x90 [16402.261771] btrfs_remap_file_range+0x353/0x3b1 [16402.262333] vfs_dedupe_file_range_one.part.0+0xd5/0x140 [16402.262821] vfs_dedupe_file_range+0x189/0x220 [16402.263150] do_vfs_ioctl+0x552/0x700 [16402.263662] __x64_sys_ioctl+0x62/0xb0 [16402.264023] do_syscall_64+0x33/0x40 [16402.264364] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [16402.264862] RIP: 0033:0x7f542c7d15cb [16402.266901] RSP: 002b:00007ffd35944ea8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [16402.267627] RAX: ffffffffffffffda RBX: 00000000009d1968 RCX: 00007f542c7d15cb [16402.268298] RDX: 00000000009d2490 RSI: 00000000c0189436 RDI: 0000000000000003 [16402.268958] RBP: 00000000009d2520 R08: 0000000000000036 R09: 00000000009d2e64 [16402.269726] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 [16402.270659] R13: 000000000001f000 R14: 00000000009d1970 R15: 00000000009d2e80 [16402.271498] irq event stamp: 0 [16402.271846] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [16402.272497] hardirqs last disabled at (0): [] copy_process+0x6b9/0x1ba0 [16402.273343] softirqs last enabled at (0): [] copy_process+0x6b9/0x1ba0 [16402.273905] softirqs last disabled at (0): [<0000000000000000>] 0x0 [16402.274338] ---[ end trace 737874a5a41a8236 ]--- [16402.274669] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry [16402.276179] BTRFS info (device dm-9): forced readonly [16402.277046] BTRFS: error (device dm-9) in btrfs_replace_file_extents:2723: errno=-2 No such entry [16402.278744] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry [16402.279968] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry [16402.280582] BTRFS info (device dm-9): balance: ended with status: -30 The problem here is that as soon as we allocate the new block it is locked and marked dirty in the btree inode. This means that we could attempt to writeback this block and need to lock the extent buffer. However we're not unlocking it here and thus we deadlock. Fix this by unlocking the cow block if we have any errors inside of __btrfs_cow_block, and also free it so we do not leak it. 
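The fix is the usual unlock-and-release-on-error pattern. As a condensed sketch of what each error path must now do (illustrative only; the surrounding function is reduced to a single call here, while the real change below touches three error paths):

  static int cow_error_path_sketch(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct extent_buffer *buf,
                                   struct extent_buffer *cow)
  {
          int ret;

          /* 'cow' arrives locked and already dirty in the btree inode */
          ret = btrfs_reloc_cow_block(trans, root, buf, cow);
          if (ret) {
                  /* unlock first so writeback can take the tree lock */
                  btrfs_tree_unlock(cow);
                  /* drop our reference so the buffer is not leaked */
                  free_extent_buffer(cow);
                  btrfs_abort_transaction(trans, ret);
                  return ret;
          }
          return 0;
  }
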
CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a165093739c4..113da62dc17f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1064,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1071,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1103,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(buf); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } From 96c2e067ed3e3e004580a643c76f58729206b829 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 30 Sep 2020 21:09:52 +0800 Subject: [PATCH 159/161] btrfs: skip devices without magic signature when mounting Many things can happen after the device is scanned and before the device is mounted. One such thing is losing the BTRFS_MAGIC on the device. If that happens, we still won't free that device from memory, which causes confusion in userland. For example: as BTRFS_IOC_DEV_INFO still carries the path of the device that no longer has the BTRFS_MAGIC, 'btrfs fi show' still lists a device which no longer belongs to the filesystem: $ mkfs.btrfs -fq -draid1 -mraid1 /dev/sda /dev/sdb $ wipefs -a /dev/sdb # /dev/sdb does not contain magic signature $ mount -o degraded /dev/sda /btrfs $ btrfs fi show -m Label: none uuid: 470ec6fb-646b-4464-b3cb-df1b26c527bd Total devices 2 FS bytes used 128.00KiB devid 1 size 3.00GiB used 571.19MiB path /dev/sda devid 2 size 3.00GiB used 571.19MiB path /dev/sdb We need to distinguish between a missing signature and an invalid superblock, so add a specific error code, ENODATA, for that. This also fixes the failure of fstest btrfs/198.
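The superblock check is what produces the new error code; a minimal sketch of the convention (condensed from the disk-io.c hunk below, with the function reduced to the two checks):

  /* Return the superblock, or an ERR_PTR() saying why it is unusable. */
  static struct btrfs_super_block *read_one_super_sketch(struct page *page,
                                                         u64 bytenr)
  {
          struct btrfs_super_block *super = page_address(page);

          if (btrfs_super_magic(super) != BTRFS_MAGIC)
                  return ERR_PTR(-ENODATA); /* no signature: not btrfs */

          if (btrfs_super_bytenr(super) != bytenr)
                  return ERR_PTR(-EINVAL);  /* signature ok, contents bad */

          return super;
  }

Callers can then branch on PTR_ERR(): open_fs_devices() treats -ENODATA as "forget this device" and removes it from the device list, while other errors keep the current behavior, as the volumes.c hunk shows.
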
CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++++-- fs/btrfs/volumes.c | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3d39f5d47ad3..764001609a15 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3424,8 +3424,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, return ERR_CAST(page); super = page_address(page); - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { + if (btrfs_super_magic(super) != BTRFS_MAGIC) { + btrfs_release_disk_super(super); + return ERR_PTR(-ENODATA); + } + + if (btrfs_super_bytenr(super) != bytenr) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 46f4efd58652..58b9c419a2b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1198,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; + struct btrfs_device *tmp_device; flags |= FMODE_EXCL; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - /* Just open everything we can; ignore failures here */ - if (btrfs_open_one_device(fs_devices, device, flags, holder)) - continue; + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, + dev_list) { + int ret; - if (!latest_dev || - device->generation > latest_dev->generation) + ret = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret == 0 && + (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; + } else if (ret == -ENODATA) { + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + } } if (fs_devices->open_devices == 0) return -EINVAL; From 8d1a7aae89dc0c41ffb76fe1007dbba59d13881b Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Thu, 5 Dec 2019 01:49:01 +0530 Subject: [PATCH 160/161] btrfs: annotate device name rcu_string with __rcu This patch fixes the following sparse errors in fs/btrfs/super.c in function btrfs_show_devname() fs/btrfs/super.c: error: incompatible types in comparison expression (different address spaces): fs/btrfs/super.c: struct rcu_string [noderef] * fs/btrfs/super.c: struct rcu_string * The error was because of the following line in function btrfs_show_devname(): if (first_dev) seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\"); Annotating the btrfs_device::name member with __rcu fixes the sparse error. 
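For context, __rcu is a sparse-only annotation with no runtime effect: it puts the pointer in a separate address space so sparse complains unless the pointer is accessed through the RCU accessors. A minimal sketch of the contract (struct simplified from btrfs's rcu-string.h):

  #include <linux/printk.h>
  #include <linux/rcupdate.h>

  struct rcu_string {
          struct rcu_head rcu;
          char str[];
  };

  struct device_sketch {
          /* __rcu: readers must use rcu_dereference() and friends */
          struct rcu_string __rcu *name;
  };

  static void show_name(struct device_sketch *dev)
  {
          rcu_read_lock();
          /* a bare dev->name->str here would trigger the sparse warning */
          pr_info("%s\n", rcu_dereference(dev->name)->str);
          rcu_read_unlock();
  }

btrfs wraps the rcu_dereference() in the rcu_str_deref() helper used by btrfs_show_devname().
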
Acked-by: Joel Fernandes (Google) Signed-off-by: Madhuparna Bhowmik Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 48bdca01e237..bf27ac07d315 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,7 @@ struct btrfs_device { struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; - struct rcu_string *name; + struct rcu_string __rcu *name; u64 generation; From 1fd4033dd011a3525bacddf37ab9eac425d25c4f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 1 Oct 2020 09:40:39 +0300 Subject: [PATCH 161/161] btrfs: rename BTRFS_INODE_ORDERED_DATA_CLOSE flag Commit 8d875f95da43 ("btrfs: disable strict file flushes for renames and truncates") eliminated the notion of ordered operations; BTRFS_INODE_ORDERED_DATA_CLOSE remained only as a flag indicating that a file's content should be synced to disk in case a file is truncated and any writes happen to it concurrently. In fact this intended behavior was broken until it was fixed in f6dc45c7a93a ("Btrfs: fix filemap_flush call in btrfs_file_release"). All things considered, let's give the flag a more descriptive name. Also slightly reword comments. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/file.c | 10 +++++----- fs/btrfs/inode.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 738009a22320..92dd86bceae3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -21,7 +21,7 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE, + BTRFS_INODE_FLUSH_ON_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 038e0afaf3d0..0ff659455b1e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2091,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by setattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. + * Set by setattr when we are about to truncate a file from a non-zero + * size to a zero size. This tries to flush down new bytes that may + * have been written if the application were using truncate to replace + * a file in place. */ - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); return 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 36efed0a24de..936c3137c646 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4835,11 +4835,11 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. + * zero. Make sure any new writes to the file get on disk + * on close. */ if (newsize == 0) - set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize);