diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 302f37c56546..d71915e04e92 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1279,6 +1279,7 @@ struct btrfs_block_group_cache { unsigned int dirty:1; unsigned int iref:1; unsigned int has_caching_ctl:1; + unsigned int removed:1; int disk_cache_state; @@ -1311,6 +1312,8 @@ struct btrfs_block_group_cache { /* For read-only block groups */ struct list_head ro_list; + + atomic_t trimming; }; /* delayed seq elem */ @@ -1740,6 +1743,12 @@ struct btrfs_fs_info { /* For btrfs to record security options */ struct security_mnt_opts security_opts; + + /* + * Chunks that can't be freed yet (under a trim/discard operation) + * and will be freed later. Protected by fs_info->chunk_mutex. + */ + struct list_head pinned_chunks; }; struct btrfs_subvolume_writers { @@ -3405,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start); + struct btrfs_root *root, u64 group_start, + struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1e3e414c8501..30965120772b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + INIT_LIST_HEAD(&fs_info->pinned_chunks); + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -3715,6 +3717,17 @@ void close_ctree(struct btrfs_root *root) btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; + + lock_chunks(root); + while (!list_empty(&fs_info->pinned_chunks)) { + struct extent_map *em; + + em = 
list_first_entry(&fs_info->pinned_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } + unlock_chunks(root); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5e64dda2db9..dbc115a25798 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9005,6 +9005,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); return cache; } @@ -9306,7 +9307,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start) + struct btrfs_root *root, u64 group_start, + struct extent_map *em) { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; @@ -9319,6 +9321,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int index; int factor; struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; root = root->fs_info->extent_root; @@ -9464,6 +9467,61 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + lock_chunks(root); + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). 
+ * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + /* + * Make sure a trimmer task always sees the em in the pinned_chunks list + * if it sees block_group->removed == 1 (needs to lock block_group->lock + * before checking block_group->removed). + */ + if (!remove_em) { + /* + * Our em might be in trans->transaction->pending_chunks which + * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), + * and so is the fs_info->pinned_chunks list. + * + * So at this point we must be holding the chunk_mutex to avoid + * any races with chunk allocation (more specifically at + * volumes.c:contains_pending_extent()), to ensure it always + * sees the em, either in the pending_chunks list or in the + * pinned_chunks list. 
+ */ + list_move_tail(&em->list, &root->fs_info->pinned_chunks); + } + spin_unlock(&block_group->lock); + unlock_chunks(root); + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 33848196550e..0ddc114e2aed 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -27,6 +27,7 @@ #include "disk-io.h" #include "extent_io.h" #include "inode-map.h" +#include "volumes.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) @@ -3101,11 +3102,46 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, *trimmed = 0; + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + atomic_inc(&block_group->trimming); + spin_unlock(&block_group->lock); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); if (ret) - return ret; + goto out; ret = trim_bitmaps(block_group, trimmed, start, end, minlen); +out: + spin_lock(&block_group->lock); + if (atomic_dec_and_test(&block_group->trimming) && + block_group->removed) { + struct extent_map_tree *em_tree; + struct extent_map *em; + + spin_unlock(&block_group->lock); + + em_tree = &block_group->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, block_group->key.objectid, + 1); + BUG_ON(!em); /* logic error, can't happen */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + lock_chunks(block_group->fs_info->chunk_root); + list_del_init(&em->list); + unlock_chunks(block_group->fs_info->chunk_root); + + /* once for us and once for the tree */ + free_extent_map(em); + 
free_extent_map(em); + } else { + spin_unlock(&block_group->lock); + } return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 01920515f90d..588f37e0a564 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); -static void lock_chunks(struct btrfs_root *root) -{ - mutex_lock(&root->fs_info->chunk_mutex); -} - -static void unlock_chunks(struct btrfs_root *root) -{ - mutex_unlock(&root->fs_info->chunk_mutex); -} - static struct btrfs_fs_devices *__alloc_fs_devices(void) { struct btrfs_fs_devices *fs_devs; @@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, u64 *start, u64 len) { struct extent_map *em; + struct list_head *search_list = &trans->transaction->pending_chunks; int ret = 0; - list_for_each_entry(em, &trans->transaction->pending_chunks, list) { +again: + list_for_each_entry(em, search_list, list) { struct map_lookup *map; int i; @@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, ret = 1; } } + if (search_list == &trans->transaction->pending_chunks) { + search_list = &trans->root->fs_info->pinned_chunks; + goto again; + } return ret; } @@ -2653,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } - ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; } - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for the tree */ - free_extent_map(em); out: /* once for us */ free_extent_map(em); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4cc00e64427e..637bcfadadb2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -515,4 +515,16 @@ static inline void 
btrfs_dev_stat_reset(struct btrfs_device *dev, void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, struct btrfs_transaction *transaction); + +static inline void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static inline void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + + #endif