1
0
Fork 0

for-5.6-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAl4vDYkACgkQxWXV+ddt
 WDsNJQ//WJEcYoRpN5Y7oOIk/vo5ulF68P3kUh3hl206A13xpaHorvTvZKAD5s2o
 C6xACJk839sGEhMdDRWvdeBDCHTedMk7EXjiZ6kJD+7EPpWmDllI5O6DTolT7SR2
 b9zId4KCO+m8LiLZccRsxCJbdkJ7nJnz2c5+063TjsS3uq1BFudctRUjW/XnFCCZ
 JIE5iOkdXrA+bFqc+l2zKTwgByQyJg+hVKRTZEJBT0QZsyNQvHKzXAmXxGopW8bO
 SeuzFkiFTA0raK8xBz6mUwaZbk40Qlzm9v9AitFZx0x2nvQnMu447N3xyaiuyDWd
 Li1aMN0uFZNgSz+AemuLfG0Wj70x1HrQisEj958XKzn4cPpUuMcc3lr1PZ2NIX+C
 p6pSgaLOEq8Rc0U78/euZX6oyiLJPAmQO1TdkVMHrcMi36esBI6uG11rds+U+xeK
 XoP20qXLFVYLLrl3wH9F4yIzydfMYu66Us1AeRPRB14NSSa7tbCOG//aCafOoLM6
 518sJCazSWlv1kDewK8dtLiXc8eM6XJN+KI4NygFZrUj2Rq376q5oovUUKKkn3iN
 pdHtF/7gAxIx6bZ+jY/gyt/Xe5AdPi7sKggahvrSOL3X+LLINwC4r+vAnnpd6yh4
 NfJj5fobvc/mO9PEVMwgJ8PmHw5uNqeMlORGjk7stQs7Oez3tCw=
 =4OkE
 -----END PGP SIGNATURE-----

Merge tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "Features, highlights:

   - async discard
       - "mount -o discard=async" to enable it
       - freed extents are not discarded immediately, but grouped
         together and trimmed later, with IO rate limiting
       - the "sync" mode submits short extents that could have been
         ignored completely by the device, for SATA prior to 3.1 the
         requests are unqueued and have a big impact on performance
       - the actual discard IO requests have been moved out of
         transaction commit to a worker thread, improving commit latency
       - IO rate and request size can be tuned by sysfs files, for now
         enabled only with CONFIG_BTRFS_DEBUG as we might need to
         add/delete the files and don't have a stable-ish ABI for
         general use, defaults are conservative

   - export device state info in sysfs, eg. missing, writeable

   - no discard of extents known to be untouched on disk (eg. after
     reservation)

   - device stats reset is logged with process name and PID that called
     the ioctl

  Fixes:

   - fix missing hole after hole punching and fsync when using NO_HOLES

   - writeback: range cyclic mode could miss some dirty pages and lead
     to OOM

   - two more corner cases for metadata_uuid change after power loss
     during the change

   - fix infinite loop during fsync after mix of rename operations

  Core changes:

   - qgroup assign returns ENOTCONN when quotas not enabled, used to
     return EINVAL that was confusing

   - device closing does not need to allocate memory anymore

   - snapshot aware code got removed, disabled for years due to
     performance problems, reimplementation will allow selecting whether
     defrag breaks or does not break COW on shared extents

   - tree-checker:
       - check leaf chunk item size, cross check against number of
         stripes
       - verify location keys for DIR_ITEM, DIR_INDEX and XATTR items

   - new self test for physical -> logical mapping code, used for super
     block range exclusion

   - assertion helpers/macros updated to avoid objtool "unreachable
     code" reports on older compilers or config option combinations"

* tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (84 commits)
  btrfs: free block groups after free'ing fs trees
  btrfs: Fix split-brain handling when changing FSID to metadata uuid
  btrfs: Handle another split brain scenario with metadata uuid feature
  btrfs: Factor out metadata_uuid code from find_fsid.
  btrfs: Call find_fsid from find_fsid_inprogress
  Btrfs: fix infinite loop during fsync after rename operations
  btrfs: set trans->drity in btrfs_commit_transaction
  btrfs: drop log root for dropped roots
  btrfs: sysfs, add devid/dev_state kobject and device attributes
  btrfs: Refactor btrfs_rmap_block to improve readability
  btrfs: Add self-tests for btrfs_rmap_block
  btrfs: selftests: Add support for dummy devices
  btrfs: Move and unexport btrfs_rmap_block
  btrfs: separate definition of assertion failure handlers
  btrfs: device stats, log when stats are zeroed
  btrfs: fix improper setting of scanned for range cyclic write cache pages
  btrfs: safely advance counter when looking up bio csums
  btrfs: remove unused member btrfs_device::work
  btrfs: remove unnecessary wrapper get_alloc_profile
  btrfs: add correction to handle -1 edge case in async discard
  ...
This commit is contained in:
Linus Torvalds 2020-01-28 14:53:31 -08:00
commit 81a046b18b
43 changed files with 3050 additions and 1771 deletions

View file

@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o
block-rsv.o delalloc-space.o block-group.o discard.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o

View file

@ -14,6 +14,8 @@
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@ -95,7 +97,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
return extended_to_chunk(flags | allowed);
}
static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
unsigned seq;
u64 flags;
@ -115,11 +117,6 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
return btrfs_reduce_alloc_profile(fs_info, flags);
}
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
return get_alloc_profile(fs_info, orig_flags);
}
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
atomic_inc(&cache->count);
@ -131,6 +128,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
/*
* A block_group shouldn't be on the discard_list anymore.
* Remove the block_group from the discard_list to prevent us
* from causing a panic due to NULL pointer dereference.
*/
if (WARN_ON(!list_empty(&cache->discard_list)))
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
cache);
/*
* If not empty, someone is still holding mutex of
* full_stripe_lock, which can only be released by caller.
@ -466,8 +472,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
total_added += size;
ret = btrfs_add_free_space(block_group, start,
size);
ret = btrfs_add_free_space_async_trimmed(block_group,
start, size);
BUG_ON(ret); /* -ENOMEM or logic error */
start = extent_end + 1;
} else {
@ -478,7 +484,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
if (start < end) {
size = end - start;
total_added += size;
ret = btrfs_add_free_space(block_group, start, size);
ret = btrfs_add_free_space_async_trimmed(block_group, start,
size);
BUG_ON(ret); /* -ENOMEM or logic error */
}
@ -1185,21 +1192,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 sinfo_used;
u64 min_allocable_bytes;
int ret = -ENOSPC;
/*
* We need some metadata space and system metadata space for
* allocating chunks in some corner cases until we force to set
* it to be readonly.
*/
if ((sinfo->flags &
(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
!force)
min_allocable_bytes = SZ_1M;
else
min_allocable_bytes = 0;
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
@ -1217,10 +1211,9 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
* sinfo_used + num_bytes should always <= sinfo->total_bytes.
*
* Here we make sure if we mark this bg RO, we still have enough
* free space as buffer (if min_allocable_bytes is not 0).
* free space as buffer.
*/
if (sinfo_used + num_bytes + min_allocable_bytes <=
sinfo->total_bytes) {
if (sinfo_used + num_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@ -1233,8 +1226,8 @@ out:
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
btrfs_info(cache->fs_info,
"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
sinfo_used, num_bytes, min_allocable_bytes);
"sinfo_used=%llu bg_num_bytes=%llu",
sinfo_used, num_bytes);
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
}
return ret;
@ -1249,6 +1242,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
int ret = 0;
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
@ -1272,10 +1266,28 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->unused_bgs_lock);
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
/*
* Async discard moves the final block group discard to be prior
* to the unused_bgs code path. Therefore, if it's not fully
* trimmed, punt it back to the async discard lists.
*/
if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
!btrfs_is_free_space_trimmed(block_group)) {
trace_btrfs_skip_unused_block_group(block_group);
up_write(&space_info->groups_sem);
/* Requeue if we failed because of async discard */
btrfs_discard_queue_work(&fs_info->discard_ctl,
block_group);
goto next;
}
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->pinned ||
block_group->used || block_group->ro ||
@ -1347,6 +1359,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/*
* At this point, the block_group is read only and should fail
* new allocations. However, btrfs_finish_extent_commit() can
* cause this block_group to be placed back on the discard
* lists because now the block_group isn't fully discarded.
* Bail here and try again later after discarding everything.
*/
spin_lock(&fs_info->discard_ctl.lock);
if (!list_empty(&block_group->discard_list)) {
spin_unlock(&fs_info->discard_ctl.lock);
btrfs_dec_block_group_ro(block_group);
btrfs_discard_queue_work(&fs_info->discard_ctl,
block_group);
goto end_trans;
}
spin_unlock(&fs_info->discard_ctl.lock);
/* Reset pinned so btrfs_put_block_group doesn't complain */
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
@ -1362,8 +1391,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
/*
* The normal path here is an unused block group is passed here,
* then trimming is handled in the transaction commit path.
* Async discard interposes before this to do the trimming
* before coming down the unused block group path as trimming
* will no longer be done later in the transaction commit path.
*/
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
goto flip_async;
/* DISCARD can flip during remount */
trimming = btrfs_test_opt(fs_info, DISCARD);
trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
/* Implicit trim during transaction commit. */
if (trimming)
@ -1406,6 +1445,13 @@ next:
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
}
void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
@ -1516,6 +1562,102 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
write_sequnlock(&fs_info->profiles_lock);
}
/**
* btrfs_rmap_block - Map a physical disk address to a list of logical addresses
* @fs_info: filesystem whose chunk mapping is consulted via
*           btrfs_get_chunk_map()
* @chunk_start: logical address of block group
* @physical: physical address to map to logical addresses
* @logical: return array of logical addresses which map to @physical
* @naddrs: length of @logical
* @stripe_len: size of IO stripe for the given block group
*
* Maps a particular @physical disk address to a list of @logical addresses.
* Used primarily to exclude those portions of a block group that contain super
* block copies.
*
* The output parameters are only written on success. The array stored in
* @logical is allocated here with kcalloc() and is not freed by this
* function, so ownership presumably passes to the caller (NOTE(review):
* confirm callers kfree() it).
*
* Return: 0 on success, -EIO if the chunk map lookup fails, -ENOMEM if the
* result array cannot be allocated.
*/
EXPORT_FOR_TESTS
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
u64 *buf;
u64 bytenr;
u64 data_stripe_length;
u64 io_stripe_size;
int i, nr = 0;
int ret = 0;
/* Any lookup failure is reported as -EIO, whatever error em encodes. */
em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
if (IS_ERR(em))
return -EIO;
map = em->map_lookup;
/*
* Start from the full chunk length and the plain stripe length, then
* adjust both according to the RAID profile of the chunk.
*/
data_stripe_length = em->len;
io_stripe_size = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
data_stripe_length = div_u64(data_stripe_length,
map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
data_stripe_length = div_u64(data_stripe_length,
nr_data_stripes(map));
io_stripe_size = map->stripe_len * nr_data_stripes(map);
}
/* One slot per stripe: the loop below adds at most one entry per stripe. */
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < map->num_stripes; i++) {
bool already_inserted = false;
u64 stripe_nr;
int j;
/*
* Skip stripes for which in_range() reports that @physical is not
* within data_stripe_length bytes of the stripe's physical start.
*/
if (!in_range(physical, map->stripes[i].physical,
data_stripe_length))
continue;
/* Offset into this stripe, expressed in units of map->stripe_len. */
stripe_nr = physical - map->stripes[i].physical;
stripe_nr = div64_u64(stripe_nr, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
stripe_nr = div_u64(stripe_nr, map->sub_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
}
/*
* The remaining case is RAID56, which needs a multiplication by
* nr_data_stripes(). That factor is already folded into
* io_stripe_size above, so io_stripe_size is used below instead of
* map->stripe_len.
*/
bytenr = chunk_start + stripe_nr * io_stripe_size;
/* Ensure we don't add duplicate addresses */
for (j = 0; j < nr; j++) {
if (buf[j] == bytenr) {
already_inserted = true;
break;
}
}
if (!already_inserted)
buf[nr++] = bytenr;
}
/* Success: hand the results back through the output parameters. */
*logical = buf;
*naddrs = nr;
*stripe_len = io_stripe_size;
out:
/* Drop the reference obtained from btrfs_get_chunk_map(). */
free_extent_map(em);
return ret;
}
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@ -1610,6 +1752,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
set_free_space_tree_thresholds(cache);
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
@ -1617,6 +1761,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
@ -1775,7 +1920,10 @@ static int read_one_block_group(struct btrfs_fs_info *info,
inc_block_group_ro(cache, 1);
} else if (cache->used == 0) {
ASSERT(list_empty(&cache->bg_list));
btrfs_mark_bg_unused(cache);
if (btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_discard_queue_work(&info->discard_ctl, cache);
else
btrfs_mark_bg_unused(cache);
}
return 0;
error:
@ -2738,8 +2886,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* dirty list to avoid races between cleaner kthread and space
* cache writeout.
*/
if (!alloc && old_val == 0)
btrfs_mark_bg_unused(cache);
if (!alloc && old_val == 0) {
if (!btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_mark_bg_unused(cache);
}
btrfs_put_block_group(cache);
total -= num_bytes;

View file

@ -12,6 +12,19 @@ enum btrfs_disk_cache_state {
BTRFS_DC_SETUP,
};
/*
* This describes the state of the block_group for async discard. This is due
* to the two pass nature of it where extent discarding is prioritized over
* bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting
* between lists to prevent contention for discard state variables
* (eg. discard_cursor).
*/
enum btrfs_discard_state {
/* Extent pass; peek_discard_list() enters it after resetting the cursor. */
BTRFS_DISCARD_EXTENTS,
/* Bitmap pass, the second of the two passes described above. */
BTRFS_DISCARD_BITMAPS,
/*
* Set when a block group is (re)queued on a discard list; tells
* peek_discard_list() to rewind discard_cursor to the block group start.
*/
BTRFS_DISCARD_RESET_CURSOR,
};
/*
* Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to
* only allocate a chunk if we really need one.
@ -116,7 +129,13 @@ struct btrfs_block_group {
/* For read-only block groups */
struct list_head ro_list;
/* For discard operations */
atomic_t trimming;
struct list_head discard_list;
int discard_index;
u64 discard_eligible_time;
u64 discard_cursor;
enum btrfs_discard_state discard_state;
/* For dirty block groups */
struct list_head dirty_list;
@ -158,6 +177,22 @@ struct btrfs_block_group {
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
};
/* Return the sum of the block group's start and length. */
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
{
	const u64 first_byte = block_group->start;

	return first_byte + block_group->length;
}
static inline bool btrfs_is_block_group_data_only(
struct btrfs_block_group *block_group)
{
/*
* In mixed mode the fragmentation is expected to be high, lowering the
* efficiency, so only proper data block groups are considered.
*/
return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
!(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
}
#ifdef CONFIG_BTRFS_DEBUG
static inline int btrfs_should_fragment_free_space(
struct btrfs_block_group *block_group)
@ -248,4 +283,9 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
cache->cached == BTRFS_CACHE_ERROR;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len);
#endif
#endif /* BTRFS_BLOCK_GROUP_H */

View file

@ -629,7 +629,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
static int btrfsic_process_superblock(struct btrfsic_state *state,
struct btrfs_fs_devices *fs_devices)
{
struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *selected_super;
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
@ -637,7 +636,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
int ret = 0;
int pass;
BUG_ON(NULL == state);
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
pr_info("btrfsic: error, kmalloc failed!\n");
@ -700,7 +698,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
break;
}
num_copies = btrfs_num_copies(fs_info, next_bytenr,
num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
pr_info("num_copies(log_bytenr=%llu) = %d\n",

View file

@ -763,7 +763,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(inode, comp_bio,
sums);
(u64)-1, sums);
BUG_ON(ret); /* -ENOMEM */
}
@ -791,7 +791,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
BUG_ON(ret); /* -ENOMEM */
}

View file

@ -101,6 +101,14 @@ struct btrfs_ref;
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
/*
* Deltas are an effective way to populate global statistics. Give macro names
* to make it clear what we're doing. An example is discard_extents in
* btrfs_free_space_ctl.
*/
#define BTRFS_STAT_NR_ENTRIES 2
#define BTRFS_STAT_CURR 0
#define BTRFS_STAT_PREV 1
/*
* Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
@ -440,6 +448,36 @@ struct btrfs_full_stripe_locks_tree {
struct mutex lock;
};
/* Discard control. */
/*
* Async discard uses multiple lists to differentiate the discard filter
* parameters. Index 0 is for completely free block groups where we need to
* ensure the entire block group is trimmed without being lossy. Indices
* afterwards represent monotonically decreasing discard filter sizes to
* prioritize what should be discarded next.
*/
#define BTRFS_NR_DISCARD_LISTS 3
#define BTRFS_DISCARD_INDEX_UNUSED 0
#define BTRFS_DISCARD_INDEX_START 1
/*
* Per-filesystem state for async discard (embedded in struct btrfs_fs_info as
* discard_ctl).
*/
struct btrfs_discard_ctl {
/* Workqueue for discard work (NOTE(review): queuing code not shown here). */
struct workqueue_struct *discard_workers;
/* Delayed work item used to drive discarding. */
struct delayed_work work;
/* Taken around discard_list manipulation and updates of block_group. */
spinlock_t lock;
/*
* Block group picked by peek_discard_list(); cleared by
* remove_from_discard_list() when that block group is removed.
*/
struct btrfs_block_group *block_group;
/* One list per discard filter, indexed by block_group->discard_index. */
struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
/*
* The fields below are not exercised by the code visible alongside this
* definition; judging by their names they hold rate-limit tunables and
* statistics (NOTE(review): confirm against discard.c and sysfs.c).
*/
u64 prev_discard;
atomic_t discardable_extents;
atomic64_t discardable_bytes;
u64 max_discard_size;
unsigned long delay;
u32 iops_limit;
u32 kbps_limit;
u64 discard_extent_bytes;
u64 discard_bitmap_bytes;
atomic64_t discard_bytes_saved;
};
/* delayed seq elem */
struct seq_list {
struct list_head list;
@ -526,6 +564,9 @@ enum {
* so we don't need to offload checksums to workqueues.
*/
BTRFS_FS_CSUM_IMPL_FAST,
/* Indicate that the discard workqueue can service discards. */
BTRFS_FS_DISCARD_RUNNING,
};
struct btrfs_fs_info {
@ -816,6 +857,8 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
struct btrfs_discard_ctl discard_ctl;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
#endif
@ -902,6 +945,11 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
#endif
#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
struct kobject *discard_debug_kobj;
#endif
};
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
@ -1170,7 +1218,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
#define BTRFS_MOUNT_NOSSD (1 << 9)
#define BTRFS_MOUNT_DISCARD (1 << 10)
#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10)
#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
@ -1189,6 +1237,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@ -2449,8 +2498,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len);
int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start,
u64 len);
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@ -2789,9 +2838,7 @@ struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u8 *dst);
blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
u64 logical_offset);
u64 offset, u8 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@ -2877,7 +2924,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end, int create);
u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
@ -3110,17 +3157,21 @@ do { \
rcu_read_unlock(); \
} while (0)
__cold
static inline void assfail(const char *expr, const char *file, int line)
#ifdef CONFIG_BTRFS_ASSERT
__cold __noreturn
static inline void assertfail(const char *expr, const char *file, int line)
{
if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
BUG();
}
pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
BUG();
}
#define ASSERT(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#define ASSERT(expr) \
(likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__))
#else
static inline void assertfail(const char *expr, const char* file, int line) { }
#define ASSERT(expr) (void)(expr)
#endif
/*
* Use that for functions that are conditionally exported for sanity tests but

View file

@ -704,6 +704,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* replace the sysfs entry */
btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
btrfs_sysfs_update_devid(tgt_device);
btrfs_rm_dev_replace_free_srcdev(src_device);
/* write back the superblocks */

702
fs/btrfs/discard.c Normal file
View file

@ -0,0 +1,702 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
/*
* This contains the logic to handle async discard.
*
* Async discard manages trimming of free space outside of transaction commit.
* Discarding is done by managing the block_groups on a LRU list based on free
* space recency. Two passes are used to first prioritize discarding extents
* and then allow for trimming in the bitmap the best opportunity to coalesce.
* The block_groups are maintained on multiple lists to allow for multiple
* passes with different discard filter requirements. A delayed work item is
* used to manage discarding with timeout determined by a max of the delay
* incurred by the iops rate limit, the byte rate limit, and the max delay of
* BTRFS_DISCARD_MAX_DELAY.
*
* Note, this only keeps track of block_groups that are explicitly for data.
* Mixed block_groups are not supported.
*
* The first list is special to manage discarding of fully free block groups.
* This is necessary because we issue a final trim for a full free block group
* after forgetting it. When a block group becomes unused, instead of directly
* being added to the unused_bgs list, we add it to this first list. Then
* from there, if it becomes fully discarded, we place it onto the unused_bgs
* list.
*
* The in-memory free space cache serves as the backing state for discard.
* Consequently this means there is no persistence. We opt to load all the
* block groups in as not discarded, so the mount case degenerates to the
* crashing case.
*
* As the free space cache uses bitmaps, there exists a tradeoff between
* ease/efficiency for find_free_extent() and the accuracy of discard state.
* Here we opt to let untrimmed regions merge with everything while only letting
* trimmed regions merge with other trimmed regions. This can cause
* overtrimming, but the coalescing benefit seems to be worth it. Additionally,
* bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
* the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
* this resets the state and we will retry trimming the whole bitmap. This is a
* tradeoff between discard state accuracy and the cost of accounting.
*/
/*
* This is an initial delay to give some chance for block reuse.
* BTRFS_DISCARD_DELAY (120 seconds, in nanoseconds) is added to the current
* time by __add_to_discard_list(); BTRFS_DISCARD_UNUSED_DELAY (10 seconds, in
* nanoseconds) is used the same way by add_to_discard_unused_list().
*/
#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)
/*
* Target completion latency of discarding all discardable extents
* (6 * 60 * 60 seconds, i.e. six hours, expressed in milliseconds).
*/
#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC)
/* Lower and upper bounds, in milliseconds, for the discard work delay. */
#define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
/* NOTE(review): presumably the default/maximum iops rate limit; confirm. */
#define BTRFS_DISCARD_MAX_IOPS (10U)
/*
* Monotonically decreasing minimum length filters after index 0.
* Indexed by discard list index; btrfs_discard_check_filter() compares a freed
* region's size against these values to pick the list for a block group.
*/
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
0,
BTRFS_ASYNC_DISCARD_MAX_FILTER,
BTRFS_ASYNC_DISCARD_MIN_FILTER
};
/*
 * Return the head of the discard list that @block_group belongs on, selected
 * by its current discard_index.
 */
static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
					  struct btrfs_block_group *block_group)
{
	const int list_idx = block_group->discard_index;

	return discard_ctl->discard_list + list_idx;
}
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
if (!btrfs_run_discard_work(discard_ctl))
return;
if (list_empty(&block_group->discard_list) ||
block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
block_group->discard_index = BTRFS_DISCARD_INDEX_START;
block_group->discard_eligible_time = (ktime_get_ns() +
BTRFS_DISCARD_DELAY);
block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
}
list_move_tail(&block_group->discard_list,
get_discard_list(discard_ctl, block_group));
}
/*
 * Locked wrapper around __add_to_discard_list(). Block groups that are not
 * data-only are ignored and never reach the discard lists through this path.
 */
static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
				struct btrfs_block_group *block_group)
{
	if (btrfs_is_block_group_data_only(block_group)) {
		spin_lock(&discard_ctl->lock);
		__add_to_discard_list(discard_ctl, block_group);
		spin_unlock(&discard_ctl->lock);
	}
}
/*
 * Move @block_group onto the list at BTRFS_DISCARD_INDEX_UNUSED, under
 * discard_ctl->lock.
 *
 * The block group is unlinked from whatever list it is on, gets a fresh
 * eligibility time of now + BTRFS_DISCARD_UNUSED_DELAY, has its cursor state
 * reset, and is appended to the unused list. Nothing is changed when
 * btrfs_run_discard_work() reports that discard work is not running.
 */
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
				       struct btrfs_block_group *block_group)
{
	struct list_head *unused_head =
		&discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED];

	spin_lock(&discard_ctl->lock);

	if (btrfs_run_discard_work(discard_ctl)) {
		list_del_init(&block_group->discard_list);

		block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
		block_group->discard_eligible_time =
			ktime_get_ns() + BTRFS_DISCARD_UNUSED_DELAY;
		block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;

		list_add_tail(&block_group->discard_list, unused_head);
	}

	spin_unlock(&discard_ctl->lock);
}
/*
 * Unlink @block_group from its discard list and zero its eligibility time,
 * under discard_ctl->lock.
 *
 * Returns true when @block_group was the one recorded in
 * discard_ctl->block_group; in that case the record is cleared as well.
 */
static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
				     struct btrfs_block_group *block_group)
{
	bool was_current;

	spin_lock(&discard_ctl->lock);

	was_current = (discard_ctl->block_group == block_group);
	if (was_current)
		discard_ctl->block_group = NULL;

	list_del_init(&block_group->discard_list);
	block_group->discard_eligible_time = 0;

	spin_unlock(&discard_ctl->lock);

	return was_current;
}
/**
 * find_next_block_group - pick the block_group that should be discarded next
 * @discard_ctl: discard control
 * @now: current time
 *
 * Look at the first entry of every non-empty discard list, in index order.
 * The scan stops as soon as the current candidate's discard_eligible_time is
 * already before @now; otherwise the candidate with the earliest
 * discard_eligible_time seen so far is kept. Returns NULL when every list is
 * empty.
 */
static struct btrfs_block_group *find_next_block_group(
					struct btrfs_discard_ctl *discard_ctl,
					u64 now)
{
	struct btrfs_block_group *best = NULL;
	int idx;

	for (idx = 0; idx < BTRFS_NR_DISCARD_LISTS; idx++) {
		struct list_head *head = &discard_ctl->discard_list[idx];
		struct btrfs_block_group *first;

		if (list_empty(head))
			continue;

		first = list_first_entry(head, struct btrfs_block_group,
					 discard_list);

		if (!best)
			best = first;

		/* Candidate is already due: no need to look at later lists. */
		if (best->discard_eligible_time < now)
			break;

		if (first->discard_eligible_time < best->discard_eligible_time)
			best = first;
	}

	return best;
}
/**
 * peek_discard_list - select the next block_group to discard and mark it in use
 * @discard_ctl: discard control
 * @discard_state: set to the discard_state of the selected block_group
 * @discard_index: set to the discard_index of the selected block_group
 *
 * Wraps find_next_block_group() under discard_ctl->lock. A candidate is only
 * accepted once its discard_eligible_time has passed. A candidate on the
 * unused list that has become used again is re-queued through
 * __add_to_discard_list() if it is data-only, or dropped from the lists
 * otherwise, and the search is repeated. For an accepted candidate a pending
 * cursor reset is applied (cursor back to the block group start, state set to
 * extents), it is recorded in discard_ctl->block_group, and its state and
 * index are copied out so the caller works with the values chosen here even
 * if they change while discarding.
 *
 * Returns the selected block_group, or NULL when nothing is eligible; the
 * output parameters are left untouched in the NULL case.
 */
static struct btrfs_block_group *peek_discard_list(
					struct btrfs_discard_ctl *discard_ctl,
					enum btrfs_discard_state *discard_state,
					int *discard_index)
{
	struct btrfs_block_group *candidate;
	const u64 now = ktime_get_ns();

	spin_lock(&discard_ctl->lock);

	for (;;) {
		candidate = find_next_block_group(discard_ctl, now);

		/* Nothing queued, or the earliest entry is not due yet. */
		if (!candidate || now <= candidate->discard_eligible_time) {
			candidate = NULL;
			break;
		}

		/* An "unused" block group got used again: requeue and retry. */
		if (candidate->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
		    candidate->used != 0) {
			if (btrfs_is_block_group_data_only(candidate))
				__add_to_discard_list(discard_ctl, candidate);
			else
				list_del_init(&candidate->discard_list);
			continue;
		}

		if (candidate->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
			candidate->discard_cursor = candidate->start;
			candidate->discard_state = BTRFS_DISCARD_EXTENTS;
		}

		discard_ctl->block_group = candidate;
		*discard_state = candidate->discard_state;
		*discard_index = candidate->discard_index;
		break;
	}

	spin_unlock(&discard_ctl->lock);

	return candidate;
}
/**
 * btrfs_discard_check_filter - updates a block groups filters
 * @block_group: block group of interest
 * @bytes: recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size. Should a free space that matches
 * a larger filter be returned to the free_space_cache, prioritize that discard
 * by moving @block_group to the proper filter.
 *
 * Does nothing when @block_group is NULL, when DISCARD_ASYNC is not set, when
 * @block_group is not past BTRFS_DISCARD_INDEX_START, or when @bytes is below
 * the filter of the list one step before the current one.
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
				u64 bytes)
{
	struct btrfs_discard_ctl *discard_ctl;
	int index;

	if (!block_group ||
	    !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	/* Not eligible for promotion to an earlier list. */
	if (block_group->discard_index <= BTRFS_DISCARD_INDEX_START ||
	    bytes < discard_minlen[block_group->discard_index - 1])
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;
	remove_from_discard_list(discard_ctl, block_group);

	/* Re-queue on the first list whose filter @bytes satisfies. */
	for (index = BTRFS_DISCARD_INDEX_START;
	     index < BTRFS_NR_DISCARD_LISTS; index++) {
		if (bytes < discard_minlen[index])
			continue;

		block_group->discard_index = index;
		add_to_discard_list(discard_ctl, block_group);
		break;
	}
}
/**
* btrfs_update_discard_index - moves a block group along the discard lists
* @discard_ctl: discard control
* @block_group: block_group of interest
*
* Increment @block_group's discard_index. If it falls of the list, let it be.
* Otherwise add it back to the appropriate list.
*/
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
block_group->discard_index++;
if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
block_group->discard_index = 1;
return;
}
add_to_discard_list(discard_ctl, block_group);
}
/**
 * btrfs_discard_cancel_work - remove a block_group from the discard lists
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * This removes @block_group from the discard lists. If it was the block group
 * recorded as in use by the discard work, wait for the delayed work to be
 * cancelled and then reschedule it with the timer overridden.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group)
{
	if (!remove_from_discard_list(discard_ctl, block_group))
		return;

	cancel_delayed_work_sync(&discard_ctl->work);
	btrfs_discard_schedule_work(discard_ctl, true);
}
/**
 * btrfs_discard_queue_work - handles queuing the block_groups
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * This maintains the LRU order of the discard lists. A block group with a
 * zero used count goes to the unused list, any other to the regular lists.
 * The delayed work is scheduled afterwards unless it is already pending.
 * Nothing happens when @block_group is NULL or DISCARD_ASYNC is not set.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group)
{
	if (!block_group)
		return;
	if (!btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;

	if (block_group->used != 0)
		add_to_discard_list(discard_ctl, block_group);
	else
		add_to_discard_unused_list(discard_ctl, block_group);

	if (delayed_work_pending(&discard_ctl->work))
		return;

	btrfs_discard_schedule_work(discard_ctl, false);
}
/**
 * btrfs_discard_schedule_work - responsible for scheduling the discard work
 * @discard_ctl: discard control
 * @override: override the current timer
 *
 * Discards are issued by a delayed workqueue item. @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit. This is also maxed with any other rate limit.
 *
 * Nothing is scheduled when btrfs_run_discard_work() says discard should not
 * run, when the work is already pending and @override is false, or when no
 * block group is queued.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override)
{
	const u64 now = ktime_get_ns();
	struct btrfs_block_group *next_bg;
	unsigned long delay;
	u32 kbps_limit;

	spin_lock(&discard_ctl->lock);

	if (!btrfs_run_discard_work(discard_ctl))
		goto out;

	if (!override && delayed_work_pending(&discard_ctl->work))
		goto out;

	next_bg = find_next_block_group(discard_ctl, now);
	if (!next_bg)
		goto out;

	delay = discard_ctl->delay;
	kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

	/*
	 * Only one delayed work item issues discards, so remembering the size
	 * of the previous discard is enough to enforce the bytes rate limit.
	 */
	if (kbps_limit && discard_ctl->prev_discard) {
		u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
		u64 bps_delay = div64_u64(discard_ctl->prev_discard *
					  MSEC_PER_SEC, bps_limit);

		delay = max(delay, msecs_to_jiffies(bps_delay));
	}

	/*
	 * Wait out the remaining eligibility timeout; this hopefully prevents
	 * immediate discarding in a recently allocated block group.
	 */
	if (now < next_bg->discard_eligible_time) {
		u64 bg_timeout = next_bg->discard_eligible_time - now;

		delay = max(delay, nsecs_to_jiffies(bg_timeout));
	}

	mod_delayed_work(discard_ctl->discard_workers, &discard_ctl->work,
			 delay);
out:
	spin_unlock(&discard_ctl->lock);
}
/**
 * btrfs_finish_discard_pass - determine next step of a block_group
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Called once a block group has finished a pass on a discard list. It is
 * first taken off its list. If it still has used bytes it is passed on to the
 * next filter list (or falls off). If it is unused and fully trimmed it is
 * marked unused and goes down the unused_bgs path; if unused but not fully
 * trimmed it is put on the unused discard list again.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
				      struct btrfs_block_group *block_group)
{
	remove_from_discard_list(discard_ctl, block_group);

	if (block_group->used != 0) {
		btrfs_update_discard_index(discard_ctl, block_group);
		return;
	}

	if (btrfs_is_free_space_trimmed(block_group))
		btrfs_mark_bg_unused(block_group);
	else
		add_to_discard_unused_list(discard_ctl, block_group);
}
/**
* btrfs_discard_workfn - discard work function
* @work: work
*
* This finds the next block_group to start discarding and then discards a
* single region. It does this in a two-pass fashion: first extents and second
* bitmaps. Completely discarded block groups are sent to the unused_bgs path.
*/
static void btrfs_discard_workfn(struct work_struct *work)
{
struct btrfs_discard_ctl *discard_ctl;
struct btrfs_block_group *block_group;
enum btrfs_discard_state discard_state;
int discard_index = 0;
u64 trimmed = 0;
u64 minlen = 0;
discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
block_group = peek_discard_list(discard_ctl, &discard_state,
&discard_index);
if (!block_group || !btrfs_run_discard_work(discard_ctl))
return;
/* Perform discarding */
minlen = discard_minlen[discard_index];
if (discard_state == BTRFS_DISCARD_BITMAPS) {
u64 maxlen = 0;
/*
* Use the previous levels minimum discard length as the max
* length filter. In the case something is added to make a
* region go beyond the max filter, the entire bitmap is set
* back to BTRFS_TRIM_STATE_UNTRIMMED.
*/
if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
maxlen = discard_minlen[discard_index - 1];
btrfs_trim_block_group_bitmaps(block_group, &trimmed,
block_group->discard_cursor,
btrfs_block_group_end(block_group),
minlen, maxlen, true);
discard_ctl->discard_bitmap_bytes += trimmed;
} else {
btrfs_trim_block_group_extents(block_group, &trimmed,
block_group->discard_cursor,
btrfs_block_group_end(block_group),
minlen, true);
discard_ctl->discard_extent_bytes += trimmed;
}
discard_ctl->prev_discard = trimmed;
/* Determine next steps for a block_group */
if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
if (discard_state == BTRFS_DISCARD_BITMAPS) {
btrfs_finish_discard_pass(discard_ctl, block_group);
} else {
block_group->discard_cursor = block_group->start;
spin_lock(&discard_ctl->lock);
if (block_group->discard_state !=
BTRFS_DISCARD_RESET_CURSOR)
block_group->discard_state =
BTRFS_DISCARD_BITMAPS;
spin_unlock(&discard_ctl->lock);
}
}
spin_lock(&discard_ctl->lock);
discard_ctl->block_group = NULL;
spin_unlock(&discard_ctl->lock);
btrfs_discard_schedule_work(discard_ctl, false);
}
/**
* btrfs_run_discard_work - determines if async discard should be running
* @discard_ctl: discard control
*
* Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
*/
bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{
struct btrfs_fs_info *fs_info = container_of(discard_ctl,
struct btrfs_fs_info,
discard_ctl);
return (!(fs_info->sb->s_flags & SB_RDONLY) &&
test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}
/**
 * btrfs_discard_calc_delay - recalculate the base delay
 * @discard_ctl: discard control
 *
 * Recalculate the base delay, which is BTRFS_DISCARD_TARGET_MSEC divided by
 * the total number of discardable_extents. The result is clamped between a
 * lower limit (the larger of BTRFS_DISCARD_MIN_DELAY_MSEC and the interval
 * implied by iops_limit, when set) and BTRFS_DISCARD_MAX_DELAY_MSEC, then
 * stored in jiffies. The stored delay is left unchanged when the extent count
 * is zero or negative.
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
	unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
	unsigned long delay;
	s32 nr_extents;
	s64 nr_bytes;
	u32 iops_limit;

	nr_extents = atomic_read(&discard_ctl->discardable_extents);
	if (!nr_extents)
		return;

	spin_lock(&discard_ctl->lock);

	/*
	 * The counters have been observed to drift to -1 in a way we do not
	 * know how to reproduce. This is the only place that consumes these
	 * numbers and it is only called from btrfs_finish_extent_commit(),
	 * which is synchronized, so a negative value can be corrected back to
	 * zero here.
	 */
	if (nr_extents < 0)
		atomic_add(-nr_extents, &discard_ctl->discardable_extents);

	nr_bytes = atomic64_read(&discard_ctl->discardable_bytes);
	if (nr_bytes < 0)
		atomic64_add(-nr_bytes, &discard_ctl->discardable_bytes);

	if (nr_extents <= 0)
		goto out_unlock;

	iops_limit = READ_ONCE(discard_ctl->iops_limit);
	if (iops_limit)
		min_delay = max_t(unsigned long, min_delay,
				  MSEC_PER_SEC / iops_limit);

	delay = BTRFS_DISCARD_TARGET_MSEC / nr_extents;
	delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
	discard_ctl->delay = msecs_to_jiffies(delay);

out_unlock:
	spin_unlock(&discard_ctl->lock);
}
/**
 * btrfs_discard_update_discardable - propagate discard counters
 * @block_group: block_group of interest
 * @ctl: free_space_ctl of @block_group
 *
 * Each counter in @ctl keeps a current and a previous value. The difference
 * between the two is added to the matching global counter in the discard_ctl,
 * after which the current value becomes the previous one. Nothing is done
 * when @block_group is NULL, DISCARD_ASYNC is not set, or @block_group is not
 * data-only.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
				      struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_discard_ctl *discard_ctl;
	s32 extents_delta;
	s64 bytes_delta;

	if (!block_group)
		return;
	if (!btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
		return;
	if (!btrfs_is_block_group_data_only(block_group))
		return;

	discard_ctl = &block_group->fs_info->discard_ctl;

	/* Extent count */
	extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
			ctl->discardable_extents[BTRFS_STAT_PREV];
	if (extents_delta != 0) {
		atomic_add(extents_delta, &discard_ctl->discardable_extents);
		ctl->discardable_extents[BTRFS_STAT_PREV] =
			ctl->discardable_extents[BTRFS_STAT_CURR];
	}

	/* Byte count */
	bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
		      ctl->discardable_bytes[BTRFS_STAT_PREV];
	if (bytes_delta != 0) {
		atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
		ctl->discardable_bytes[BTRFS_STAT_PREV] =
			ctl->discardable_bytes[BTRFS_STAT_CURR];
	}
}
/**
 * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed. In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit. This
 * is ultimately what we are trying to avoid with asynchronous discard. Thus,
 * it must be done before going down the unused_bgs path.
 *
 * Every entry is unlinked from fs_info->unused_bgs and handed to
 * btrfs_discard_queue_work() while unused_bgs_lock is held.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
	struct btrfs_block_group *bg, *tmp;

	spin_lock(&fs_info->unused_bgs_lock);

	/* We enabled async discard, so punt all to the queue */
	list_for_each_entry_safe(bg, tmp, &fs_info->unused_bgs, bg_list) {
		list_del_init(&bg->bg_list);
		btrfs_discard_queue_work(discard_ctl, bg);
	}

	spin_unlock(&fs_info->unused_bgs_lock);
}
/**
* btrfs_discard_purge_list - purge discard lists
* @discard_ctl: discard control
*
* If we are disabling async discard, we may have intercepted block groups that
* are completely free and ready for the unused_bgs path. As discarding will
* now happen in transaction commit or not at all, we can safely mark the
* corresponding block groups as unused and they will be sent on their merry
* way to the unused_bgs list.
*/
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
struct btrfs_block_group *block_group, *next;
int i;
spin_lock(&discard_ctl->lock);
for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
list_for_each_entry_safe(block_group, next,
&discard_ctl->discard_list[i],
discard_list) {
list_del_init(&block_group->discard_list);
spin_unlock(&discard_ctl->lock);
if (block_group->used == 0)
btrfs_mark_bg_unused(block_group);
spin_lock(&discard_ctl->lock);
}
}
spin_unlock(&discard_ctl->lock);
}
/*
 * Start or tear down async discard according to the DISCARD_ASYNC option.
 *
 * With the option set, the unused_bgs list is punted to the discard lists and
 * BTRFS_FS_DISCARD_RUNNING is set. Without it, btrfs_discard_cleanup() is run
 * instead.
 */
void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_punt_unused_bgs_list(fs_info);
		set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
	} else {
		btrfs_discard_cleanup(fs_info);
	}
}
/*
 * Clear BTRFS_FS_DISCARD_RUNNING in fs_info->flags. btrfs_run_discard_work()
 * tests this bit, so it reports false from now on.
 */
void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}
/*
 * Initialize the discard control embedded in @fs_info: the lock, the delayed
 * work item (handler btrfs_discard_workfn), every discard list head, the
 * counters and statistics (all zeroed), and the default limits.
 */
void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
	struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
	int i;

	spin_lock_init(&discard_ctl->lock);
	INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);

	for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
		INIT_LIST_HEAD(&discard_ctl->discard_list[i]);

	discard_ctl->prev_discard = 0;
	atomic_set(&discard_ctl->discardable_extents, 0);
	atomic64_set(&discard_ctl->discardable_bytes, 0);
	discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
	/*
	 * discard_ctl->delay is kept in jiffies: btrfs_discard_calc_delay()
	 * stores msecs_to_jiffies() and btrfs_discard_schedule_work() hands the
	 * value to mod_delayed_work(). Convert the millisecond default as well
	 * so the initial delay does not depend on HZ.
	 */
	discard_ctl->delay = msecs_to_jiffies(BTRFS_DISCARD_MAX_DELAY_MSEC);
	discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
	discard_ctl->kbps_limit = 0;
	discard_ctl->discard_extent_bytes = 0;
	discard_ctl->discard_bitmap_bytes = 0;
	atomic64_set(&discard_ctl->discard_bytes_saved, 0);
}
void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
btrfs_discard_stop(fs_info);
cancel_delayed_work_sync(&fs_info->discard_ctl.work);
btrfs_discard_purge_list(&fs_info->discard_ctl);
}

41
fs/btrfs/discard.h Normal file
View file

@ -0,0 +1,41 @@
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_DISCARD_H
#define BTRFS_DISCARD_H

#include <linux/sizes.h>

/*
 * Forward declarations: only pointers to these types appear below, so the
 * full definitions are not needed here.
 */
struct btrfs_fs_info;
struct btrfs_discard_ctl;
struct btrfs_block_group;
/* Used by btrfs_discard_update_discardable(); keeps the header self-contained. */
struct btrfs_free_space_ctl;

/* Discard size limits */
#define BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE		(SZ_64M)
#define BTRFS_ASYNC_DISCARD_MAX_FILTER			(SZ_1M)
#define BTRFS_ASYNC_DISCARD_MIN_FILTER			(SZ_32K)

/* List operations */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group, u64 bytes);

/* Work operations */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
			       struct btrfs_block_group *block_group);
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
			      struct btrfs_block_group *block_group);
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
				 bool override);
bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl);

/* Update operations */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl);
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
				      struct btrfs_free_space_ctl *ctl);

/* Setup/cleanup operations */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
void btrfs_discard_resume(struct btrfs_fs_info *fs_info);
void btrfs_discard_stop(struct btrfs_fs_info *fs_info);
void btrfs_discard_init(struct btrfs_fs_info *fs_info);
void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info);

#endif

View file

@ -41,6 +41,7 @@
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@ -202,8 +203,8 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
* that covers the entire device
*/
struct extent_map *btree_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start, u64 len,
int create)
struct page *page, size_t pg_offset,
u64 start, u64 len)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
@ -1953,6 +1954,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
if (fs_info->discard_ctl.discard_workers)
destroy_workqueue(fs_info->discard_ctl.discard_workers);
/*
* Now that all other work queues are destroyed, we can safely destroy
* the queues used for metadata I/O, since tasks from those other work
@ -2148,6 +2151,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
fs_info->discard_ctl.discard_workers =
alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
if (!(fs_info->workers && fs_info->delalloc_workers &&
fs_info->flush_workers &&
@ -2158,7 +2163,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
fs_info->qgroup_rescan_workers)) {
fs_info->qgroup_rescan_workers &&
fs_info->discard_ctl.discard_workers)) {
return -ENOMEM;
}
@ -2792,6 +2798,7 @@ int __cold open_ctree(struct super_block *sb,
btrfs_init_dev_replace_locks(fs_info);
btrfs_init_qgroup(fs_info);
btrfs_discard_init(fs_info);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@ -3082,20 +3089,13 @@ int __cold open_ctree(struct super_block *sb,
btrfs_free_extra_devids(fs_devices, 1);
ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
ret = btrfs_sysfs_add_fsid(fs_devices);
if (ret) {
btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
ret);
goto fail_block_groups;
}
ret = btrfs_sysfs_add_device(fs_devices);
if (ret) {
btrfs_err(fs_info, "failed to init sysfs device interface: %d",
ret);
goto fail_fsdev_sysfs;
}
ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
@ -3262,6 +3262,7 @@ int __cold open_ctree(struct super_block *sb,
}
btrfs_qgroup_rescan_resume(fs_info);
btrfs_discard_resume(fs_info);
if (!fs_info->uuid_root) {
btrfs_info(fs_info, "creating UUID tree");
@ -3978,6 +3979,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
if (!sb_rdonly(fs_info->sb)) {
/*
* The cleaner kthread is stopped, so do one final pass over
@ -4026,11 +4030,18 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
/*
* We must free the block groups after dropping the fs_roots as we could
* have had an IO error and have left over tree log blocks that aren't
* cleaned up until the fs roots are freed. This makes the block group
* accounting appear to be wrong because there's pending reserved bytes,
* so make sure we do the block group cleanup afterwards.
*/
btrfs_free_block_groups(fs_info);
iput(fs_info->btree_inode);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY

View file

@ -134,8 +134,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
struct extent_map *btree_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start, u64 len,
int create);
struct page *page, size_t pg_offset,
u64 start, u64 len);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);

View file

@ -32,6 +32,7 @@
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#undef SCRAMBLE_DELAYED_REFS
@ -2923,7 +2924,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
break;
}
if (btrfs_test_opt(fs_info, DISCARD))
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
@ -2934,6 +2935,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
cond_resched();
}
if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
btrfs_discard_calc_delay(&fs_info->discard_ctl);
btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
}
/*
* Transaction is finished. We don't need the lock anymore. We
* do need to clean up the block groups in case of a transaction
@ -3438,7 +3444,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
*/
struct find_free_extent_ctl {
/* Basic allocation info */
u64 ram_bytes;
u64 num_bytes;
u64 empty_size;
u64 flags;
@ -3810,7 +3815,6 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
WARN_ON(num_bytes < fs_info->sectorsize);
ffe_ctl.ram_bytes = ram_bytes;
ffe_ctl.num_bytes = num_bytes;
ffe_ctl.empty_size = empty_size;
ffe_ctl.flags = flags;
@ -4165,12 +4169,10 @@ again:
return ret;
}
static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len,
int pin, int delalloc)
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc)
{
struct btrfs_block_group *cache;
int ret = 0;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
@ -4179,32 +4181,30 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
if (pin)
pin_down_extent(cache, start, len, 1);
else {
if (btrfs_test_opt(fs_info, DISCARD))
ret = btrfs_discard_extent(fs_info, start, len, NULL);
btrfs_add_free_space(cache, start, len);
btrfs_free_reserved_bytes(cache, len, delalloc);
trace_btrfs_reserved_extent_free(fs_info, start, len);
}
btrfs_add_free_space(cache, start, len);
btrfs_free_reserved_bytes(cache, len, delalloc);
trace_btrfs_reserved_extent_free(fs_info, start, len);
btrfs_put_block_group(cache);
return 0;
}
/*
 * Pin the range [@start, @start + @len) in the block group that contains
 * @start, by calling pin_down_extent() with 1 as its last argument.
 *
 * Return: -ENOSPC (after logging an error) if no block group covers @start,
 * otherwise whatever pin_down_extent() returns. The block group reference
 * taken by the lookup is dropped before returning.
 */
int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	struct btrfs_block_group *bg;
	int ret;

	bg = btrfs_lookup_block_group(fs_info, start);
	if (!bg) {
		btrfs_err(fs_info, "unable to find block group for %llu", start);
		return -ENOSPC;
	}

	ret = pin_down_extent(bg, start, len, 1);
	btrfs_put_block_group(bg);

	return ret;
}
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc)
{
return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
}
int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,

View file

@ -3043,7 +3043,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
@ -3455,11 +3455,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
update_nr_written(wbc, nr_written + 1);
end = page_end;
if (i_size <= start) {
btrfs_writepage_endio_finish_ordered(page, start, page_end, 1);
goto done;
}
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
@ -3471,8 +3466,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page_end, 1);
break;
}
em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
end - cur + 1, 1);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
ret = PTR_ERR_OR_ZERO(em);
@ -3497,22 +3492,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
*/
if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
/*
* end_io notification does not happen here for
* compressed extents
*/
if (!compressed)
btrfs_writepage_endio_finish_ordered(page, cur,
cur + iosize - 1,
1);
else if (compressed) {
/* we don't want to end_page_writeback on
* a compressed extent. this happens
* elsewhere
*/
if (compressed)
nr++;
}
else
btrfs_writepage_endio_finish_ordered(page, cur,
cur + iosize - 1, 1);
cur += iosize;
pg_offset += iosize;
continue;
@ -3540,7 +3524,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
pg_offset += iosize;
nr++;
}
done:
*nr_ret = nr;
return ret;
}
@ -3562,7 +3545,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 page_end = start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset = 0;
size_t pg_offset;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
unsigned long nr_written = 0;
@ -3591,14 +3574,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
flush_dcache_page(page);
}
pg_offset = 0;
set_page_extent_mapped(page);
if (!epd->extent_locked) {
ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
if (ret == 1)
goto done_unlocked;
return 0;
if (ret)
goto done;
}
@ -3606,7 +3587,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
ret = __extent_writepage_io(inode, page, wbc, epd,
i_size, nr_written, &nr);
if (ret == 1)
goto done_unlocked;
return 0;
done:
if (nr == 0) {
@ -3621,9 +3602,6 @@ done:
unlock_page(page);
ASSERT(ret <= 0);
return ret;
done_unlocked:
return 0;
}
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
@ -3941,6 +3919,11 @@ int btree_write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
/*
* Start from the beginning does not need to cycle over the
* range, mark it as scanned.
*/
scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@ -3958,7 +3941,6 @@ retry:
tag))) {
unsigned i;
scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@ -4087,6 +4069,11 @@ static int extent_write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
/*
* Start from the beginning does not need to cycle over the
* range, mark it as scanned.
*/
scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@ -4120,7 +4107,6 @@ retry:
&index, end, tag))) {
unsigned i;
scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];

View file

@ -183,10 +183,8 @@ static inline int extent_compress_type(unsigned long bio_flags)
struct extent_map_tree;
typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
struct page *page,
size_t pg_offset,
u64 start, u64 len,
int create);
struct page *page, size_t pg_offset,
u64 start, u64 len);
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);

View file

@ -148,8 +148,19 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 logical_offset, u8 *dst, int dio)
/**
* btrfs_lookup_bio_sums - Look up checksums for a bio.
* @inode: inode that the bio is for.
* @bio: bio embedded in btrfs_io_bio.
* @offset: Unless (u64)-1, look up checksums for this offset in the file.
* If (u64)-1, use the page offsets from the bio instead.
* @dst: Buffer of size btrfs_super_csum_size() used to return checksum. If
* NULL, the checksum is returned in btrfs_io_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 offset, u8 *dst)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio_vec bvec;
@ -158,8 +169,8 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_path *path;
const bool page_offsets = (offset == (u64)-1);
u8 *csum;
u64 offset = 0;
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
@ -205,15 +216,13 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
}
disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
if (dio)
offset = logical_offset;
bio_for_each_segment(bvec, bio, iter) {
page_bytes_left = bvec.bv_len;
if (count)
goto next;
if (!dio)
if (page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
csum, nblocks);
@ -274,7 +283,8 @@ found:
csum += count * csum_size;
nblocks -= count;
next:
while (count--) {
while (count > 0) {
count--;
disk_bytenr += fs_info->sectorsize;
offset += fs_info->sectorsize;
page_bytes_left -= fs_info->sectorsize;
@ -285,18 +295,7 @@ next:
WARN_ON_ONCE(count);
btrfs_free_path(path);
return 0;
}
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u8 *dst)
{
return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
}
blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
{
return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1);
return BLK_STS_OK;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@ -483,8 +482,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
- 1);
for (i = 0; i < nr_sectors; i++) {
if (offset >= ordered->file_offset + ordered->len ||
offset < ordered->file_offset) {
if (offset >= ordered->file_offset + ordered->num_bytes ||
offset < ordered->file_offset) {
unsigned long bytes_left;
sums->len = this_sum_bytes;

View file

@ -477,8 +477,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
u64 em_len;
int ret = 0;
em = btrfs_get_extent(inode, NULL, 0, search_start,
search_len, 0);
em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
if (IS_ERR(em))
return PTR_ERR(em);
@ -1501,7 +1500,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->len > start_pos &&
ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
unlock_extent_cached(&inode->io_tree, start_pos,
last_pos, cached_state);
@ -2390,7 +2389,7 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
round_down(*start, fs_info->sectorsize),
round_up(*len, fs_info->sectorsize), 0);
round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
return PTR_ERR(em);
@ -2426,7 +2425,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
* we need to try again.
*/
if ((!ordered ||
(ordered->file_offset + ordered->len <= lockstart ||
(ordered->file_offset + ordered->num_bytes <= lockstart ||
ordered->file_offset > lockend)) &&
!filemap_range_has_page(inode->i_mapping,
lockstart, lockend)) {
@ -2957,7 +2956,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode,
int ret;
offset = round_down(offset, sectorsize);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@ -2990,8 +2989,8 @@ static int btrfs_zero_range(struct inode *inode,
inode_dio_wait(inode);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
alloc_start, alloc_end - alloc_start, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@ -3034,8 +3033,8 @@ static int btrfs_zero_range(struct inode *inode,
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
alloc_start, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
sectorsize);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@ -3248,7 +3247,7 @@ static long btrfs_fallocate(struct file *file, int mode,
ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
if (ordered &&
ordered->file_offset + ordered->len > alloc_start &&
ordered->file_offset + ordered->num_bytes > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
@ -3273,7 +3272,7 @@ static long btrfs_fallocate(struct file *file, int mode,
INIT_LIST_HEAD(&reserve_list);
while (cur_offset < alloc_end) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
alloc_end - cur_offset);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
break;

File diff suppressed because it is too large Load diff

View file

@ -6,6 +6,20 @@
#ifndef BTRFS_FREE_SPACE_CACHE_H
#define BTRFS_FREE_SPACE_CACHE_H
/*
* This is the trim state of an extent or bitmap.
*
* BTRFS_TRIM_STATE_TRIMMING is special and used to maintain the state of a
* bitmap as we may need several trims to fully trim a single bitmap entry.
* This is reset should any free space other than trimmed space be added to the
* bitmap.
*/
enum btrfs_trim_state {
/* First enumerator, so its value is 0. */
BTRFS_TRIM_STATE_UNTRIMMED,
/* The state btrfs_free_space_trimmed() tests for. */
BTRFS_TRIM_STATE_TRIMMED,
/* The state btrfs_free_space_trimming_bitmap() tests for. */
BTRFS_TRIM_STATE_TRIMMING,
};
struct btrfs_free_space {
struct rb_node offset_index;
u64 offset;
@ -13,8 +27,21 @@ struct btrfs_free_space {
u64 max_extent_size;
unsigned long *bitmap;
struct list_head list;
enum btrfs_trim_state trim_state;
s32 bitmap_extents;
};
/* Return true only when @info's trim_state is BTRFS_TRIM_STATE_TRIMMED. */
static inline bool btrfs_free_space_trimmed(struct btrfs_free_space *info)
{
	if (info->trim_state != BTRFS_TRIM_STATE_TRIMMED)
		return false;

	return true;
}
/* Return true only when @info's trim_state is BTRFS_TRIM_STATE_TRIMMING. */
static inline bool btrfs_free_space_trimming_bitmap(
				struct btrfs_free_space *info)
{
	if (info->trim_state != BTRFS_TRIM_STATE_TRIMMING)
		return false;

	return true;
}
struct btrfs_free_space_ctl {
spinlock_t tree_lock;
struct rb_root free_space_offset;
@ -24,6 +51,8 @@ struct btrfs_free_space_ctl {
int total_bitmaps;
int unit;
u64 start;
s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
const struct btrfs_free_space_op *op;
void *private;
struct mutex cache_writeout_mutex;
@ -83,13 +112,17 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group);
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
u64 bytenr, u64 size);
u64 bytenr, u64 size,
enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size);
@ -108,6 +141,12 @@ int btrfs_return_cluster_to_free_space(
struct btrfs_free_cluster *cluster);
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen,
bool async);
int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen,
u64 maxlen, bool async);
/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS

View file

@ -107,7 +107,7 @@ again:
if (last != (u64)-1 && last + 1 != key.objectid) {
__btrfs_add_free_space(fs_info, ctl, last + 1,
key.objectid - last - 1);
key.objectid - last - 1, 0);
wake_up(&root->ino_cache_wait);
}
@ -118,7 +118,7 @@ next:
if (last < root->highest_objectid - 1) {
__btrfs_add_free_space(fs_info, ctl, last + 1,
root->highest_objectid - last - 1);
root->highest_objectid - last - 1, 0);
}
spin_lock(&root->ino_cache_lock);
@ -175,7 +175,8 @@ static void start_caching(struct btrfs_root *root)
ret = btrfs_find_free_objectid(root, &objectid);
if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
__btrfs_add_free_space(fs_info, ctl, objectid,
BTRFS_LAST_FREE_OBJECTID - objectid + 1);
BTRFS_LAST_FREE_OBJECTID - objectid + 1,
0);
wake_up(&root->ino_cache_wait);
}
@ -221,7 +222,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
return;
again:
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
__btrfs_add_free_space(fs_info, pinned, objectid, 1);
__btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
} else {
down_write(&fs_info->commit_root_sem);
spin_lock(&root->ino_cache_lock);
@ -234,7 +235,7 @@ again:
start_caching(root);
__btrfs_add_free_space(fs_info, pinned, objectid, 1);
__btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
up_write(&fs_info->commit_root_sem);
}
@ -281,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
spin_unlock(rbroot_lock);
if (count)
__btrfs_add_free_space(root->fs_info, ctl,
info->offset, count);
info->offset, count, 0);
kmem_cache_free(btrfs_free_space_cachep, info);
}
}

File diff suppressed because it is too large Load diff

View file

@ -1128,7 +1128,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
/* get the big lock and read metadata off disk */
lock_extent_bits(io_tree, start, end, &cached);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))

View file

@ -20,9 +20,9 @@ static struct kmem_cache *btrfs_ordered_extent_cache;
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
if (entry->file_offset + entry->len < entry->file_offset)
if (entry->file_offset + entry->num_bytes < entry->file_offset)
return (u64)-1;
return entry->file_offset + entry->len;
return entry->file_offset + entry->num_bytes;
}
/* returns NULL if the insertion worked, or it returns the node it did find
@ -52,14 +52,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
return NULL;
}
static void ordered_data_tree_panic(struct inode *inode, int errno,
u64 offset)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
btrfs_panic(fs_info, errno,
"Inconsistency in ordered tree at offset %llu", offset);
}
/*
* look for a given offset in the tree, and if it can't be found return the
* first lesser offset
@ -120,7 +112,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
if (file_offset < entry->file_offset ||
entry->file_offset + entry->len <= file_offset)
entry->file_offset + entry->num_bytes <= file_offset)
return 0;
return 1;
}
@ -129,7 +121,7 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
u64 len)
{
if (file_offset + len <= entry->file_offset ||
entry->file_offset + entry->len <= file_offset)
entry->file_offset + entry->num_bytes <= file_offset)
return 0;
return 1;
}
@ -161,19 +153,14 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
}
/* allocate and add a new ordered_extent into the per-inode tree.
* file_offset is the logical offset in the file
*
* start is the disk block number of an extent already reserved in the
* extent allocation tree
*
* len is the length of the extent
*
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int dio, int compress_type)
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type, int dio,
int compress_type)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@ -187,10 +174,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
return -ENOMEM;
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->disk_bytenr = disk_bytenr;
entry->num_bytes = num_bytes;
entry->disk_num_bytes = disk_num_bytes;
entry->bytes_left = num_bytes;
entry->inode = igrab(inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
@ -198,7 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
set_bit(type, &entry->flags);
if (dio) {
percpu_counter_add_batch(&fs_info->dio_bytes, len,
percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
fs_info->delalloc_batch);
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
}
@ -219,7 +206,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
if (node)
ordered_data_tree_panic(inode, -EEXIST, file_offset);
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
file_offset);
spin_unlock_irq(&tree->lock);
spin_lock(&root->ordered_extent_lock);
@ -247,27 +236,30 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
}
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type)
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
disk_len, type, 0,
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes, type, 0,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type)
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
disk_len, type, 1,
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes, type, 1,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int compress_type)
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type,
int compress_type)
{
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
disk_len, type, 0,
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes, type, 0,
compress_type);
}
@ -328,8 +320,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
}
dec_start = max(*file_offset, entry->file_offset);
dec_end = min(*file_offset + io_size, entry->file_offset +
entry->len);
dec_end = min(*file_offset + io_size,
entry->file_offset + entry->num_bytes);
*file_offset = dec_end;
if (dec_start > dec_end) {
btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
@ -471,10 +463,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
if (root != fs_info->tree_root)
btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
false);
if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len,
percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
tree = &btrfs_inode->ordered_tree;
@ -534,8 +527,8 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
root_extent_list);
if (range_end <= ordered->start ||
ordered->start + ordered->disk_len <= range_start) {
if (range_end <= ordered->disk_bytenr ||
ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
list_move_tail(&ordered->root_extent_list, &skipped);
cond_resched_lock(&root->ordered_extent_lock);
continue;
@ -619,7 +612,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
int wait)
{
u64 start = entry->file_offset;
u64 end = start + entry->len - 1;
u64 end = start + entry->num_bytes - 1;
trace_btrfs_ordered_extent_start(inode, entry);
@ -680,7 +673,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
if (ordered->file_offset + ordered->len <= start) {
if (ordered->file_offset + ordered->num_bytes <= start) {
btrfs_put_ordered_extent(ordered);
break;
}

View file

@ -67,14 +67,13 @@ struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
/* disk byte number */
u64 start;
/* ram length of the extent in bytes */
u64 len;
/* extent length on disk */
u64 disk_len;
/*
* These fields directly correspond to the same fields in
* btrfs_file_extent_item.
*/
u64 disk_bytenr;
u64 num_bytes;
u64 disk_num_bytes;
/* number of bytes that still need writing */
u64 bytes_left;
@ -161,12 +160,15 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
u64 *file_offset, u64 io_size,
int uptodate);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len, int type);
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type);
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
u64 start, u64 len, u64 disk_len,
int type, int compress_type);
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type,
int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,

View file

@ -317,7 +317,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
btrfs_item_size_nr(l, i));
break;
};
}
}
}

View file

@ -1243,7 +1243,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
@ -1259,9 +1258,8 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
return -ENOMEM;
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
member = find_qgroup_rb(fs_info, src);
@ -1307,7 +1305,6 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
@ -1320,9 +1317,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
if (!tmp)
return -ENOMEM;
quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
@ -1387,11 +1383,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
quota_root = fs_info->quota_root;
qgroup = find_qgroup_rb(fs_info, qgroupid);
if (qgroup) {
ret = -EEXIST;
@ -1416,15 +1412,13 @@ out:
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup_list *list;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
@ -1465,7 +1459,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
/* Sometimes we would want to clear the limit on this qgroup.
@ -1475,9 +1468,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
if (!fs_info->quota_root) {
ret = -ENOTCONN;
goto out;
}
@ -2582,10 +2574,9 @@ cleanup:
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root = fs_info->quota_root;
int ret = 0;
if (!quota_root)
if (!fs_info->quota_root)
return ret;
spin_lock(&fs_info->qgroup_lock);
@ -2879,7 +2870,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enum btrfs_qgroup_rsv_type type)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
@ -2898,8 +2888,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enforce = false;
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
if (!quota_root)
if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@ -2966,7 +2955,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@ -2984,8 +2972,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
}
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
if (!quota_root)
if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@ -3685,7 +3672,6 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
int num_bytes)
{
struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@ -3693,7 +3679,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
if (num_bytes == 0)
return;
if (!quota_root)
if (!fs_info->quota_root)
return;
spin_lock(&fs_info->qgroup_lock);

View file

@ -4332,6 +4332,15 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
block_group->start, buf);
}
/*
 * Translate a relocation stage value into the label printed in log messages.
 * Any value other than MOVE_DATA_EXTENTS or UPDATE_DATA_PTRS maps to
 * "unknown".
 */
static const char *stage_to_string(int stage)
{
	const char *label = "unknown";

	if (stage == MOVE_DATA_EXTENTS)
		label = "move data extents";
	else if (stage == UPDATE_DATA_PTRS)
		label = "update data pointers";

	return label;
}
/*
* function to relocate all extents in a block group.
*/
@ -4406,12 +4415,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->block_group->length);
while (1) {
int finishes_stage;
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0)
err = ret;
finishes_stage = rc->stage;
/*
* We may have gotten ENOSPC after we already dirtied some
* extents. If writeout happens while we're relocating a
@ -4437,8 +4449,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
if (rc->extents_found == 0)
break;
btrfs_info(fs_info, "found %llu extents", rc->extents_found);
btrfs_info(fs_info, "found %llu extents, stage: %s",
rc->extents_found, stage_to_string(finishes_stage));
}
WARN_ON(rc->block_group->pinned > 0);
@ -4656,7 +4668,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
LIST_HEAD(list);
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
@ -4680,7 +4692,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
* disk_len vs real len like with real inodes since it's all
* disk length.
*/
new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
sums->bytenr = new_bytenr;
btrfs_add_ordered_sum(ordered, sums);

View file

@ -8,6 +8,7 @@
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
@ -3682,7 +3683,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache->removed && !cache->ro && cache->reserved == 0 &&
cache->used == 0) {
spin_unlock(&cache->lock);
btrfs_mark_bg_unused(cache);
if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_queue_work(&fs_info->discard_ctl,
cache);
else
btrfs_mark_bg_unused(cache);
} else {
spin_unlock(&cache->lock);
}

View file

@ -161,8 +161,7 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
static int can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush,
bool system_chunk)
enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
@ -173,7 +172,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info,
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
if (system_chunk)
if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
profile = btrfs_system_alloc_profile(fs_info);
else
profile = btrfs_metadata_alloc_profile(fs_info);
@ -227,8 +226,7 @@ again:
/* Check and see if our ticket can be satisfied now. */
if ((used + ticket->bytes <= space_info->total_bytes) ||
can_overcommit(fs_info, space_info, ticket->bytes, flush,
false)) {
can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
space_info,
ticket->bytes);
@ -626,8 +624,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
bool system_chunk)
struct btrfs_space_info *space_info)
{
struct reserve_ticket *ticket;
u64 used;
@ -643,13 +640,12 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
if (can_overcommit(fs_info, space_info, to_reclaim,
BTRFS_RESERVE_FLUSH_ALL, system_chunk))
BTRFS_RESERVE_FLUSH_ALL))
return 0;
used = btrfs_space_info_used(space_info, true);
if (can_overcommit(fs_info, space_info, SZ_1M,
BTRFS_RESERVE_FLUSH_ALL, system_chunk))
if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@ -665,7 +661,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 used, bool system_chunk)
u64 used)
{
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
@ -673,8 +669,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
return 0;
if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
system_chunk))
if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
return 0;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@ -765,8 +760,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
false);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
if (!to_reclaim) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
@ -785,8 +779,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
return;
}
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
space_info,
false);
space_info);
if (last_tickets_id == space_info->tickets_id) {
flush_state++;
} else {
@ -858,8 +851,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
int flush_state;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
false);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
if (!to_reclaim) {
spin_unlock(&space_info->lock);
return;
@ -990,8 +982,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush,
bool system_chunk)
enum btrfs_reserve_flush_enum flush)
{
struct reserve_ticket ticket;
u64 used;
@ -1013,8 +1004,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
*/
if (!pending_tickets &&
((used + orig_bytes <= space_info->total_bytes) ||
can_overcommit(fs_info, space_info, orig_bytes, flush,
system_chunk))) {
can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
ret = 0;
@ -1054,8 +1044,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
need_do_async_reclaim(fs_info, space_info,
used, system_chunk) &&
need_do_async_reclaim(fs_info, space_info, used) &&
!work_busy(&fs_info->async_reclaim_work)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
@ -1092,10 +1081,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
bool system_chunk = (root == fs_info->chunk_root);
ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
orig_bytes, flush, system_chunk);
orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&

View file

@ -46,6 +46,7 @@
#include "sysfs.h"
#include "tests/btrfs-tests.h"
#include "block-group.h"
#include "discard.h"
#include "qgroup.h"
#define CREATE_TRACE_POINTS
@ -146,6 +147,8 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
if (sb_rdonly(sb))
return;
btrfs_discard_stop(fs_info);
/* btrfs handles errors by forcing the filesystem read-only */
sb->s_flags |= SB_RDONLY;
btrfs_info(fs_info, "forced readonly");
@ -313,6 +316,7 @@ enum {
Opt_datasum, Opt_nodatasum,
Opt_defrag, Opt_nodefrag,
Opt_discard, Opt_nodiscard,
Opt_discard_mode,
Opt_nologreplay,
Opt_norecovery,
Opt_ratio,
@ -375,6 +379,7 @@ static const match_table_t tokens = {
{Opt_defrag, "autodefrag"},
{Opt_nodefrag, "noautodefrag"},
{Opt_discard, "discard"},
{Opt_discard_mode, "discard=%s"},
{Opt_nodiscard, "nodiscard"},
{Opt_nologreplay, "nologreplay"},
{Opt_norecovery, "norecovery"},
@ -695,12 +700,26 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
info->metadata_ratio);
break;
case Opt_discard:
btrfs_set_and_info(info, DISCARD,
"turning on discard");
case Opt_discard_mode:
if (token == Opt_discard ||
strcmp(args[0].from, "sync") == 0) {
btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
btrfs_set_and_info(info, DISCARD_SYNC,
"turning on sync discard");
} else if (strcmp(args[0].from, "async") == 0) {
btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
btrfs_set_and_info(info, DISCARD_ASYNC,
"turning on async discard");
} else {
ret = -EINVAL;
goto out;
}
break;
case Opt_nodiscard:
btrfs_clear_and_info(info, DISCARD,
btrfs_clear_and_info(info, DISCARD_SYNC,
"turning off discard");
btrfs_clear_and_info(info, DISCARD_ASYNC,
"turning off async discard");
break;
case Opt_space_cache:
case Opt_space_cache_version:
@ -1322,8 +1341,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",nologreplay");
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD))
if (btrfs_test_opt(info, DISCARD_SYNC))
seq_puts(seq, ",discard");
if (btrfs_test_opt(info, DISCARD_ASYNC))
seq_puts(seq, ",discard=async");
if (!(info->sb->s_flags & SB_POSIXACL))
seq_puts(seq, ",noacl");
if (btrfs_test_opt(info, SPACE_CACHE))
@ -1713,6 +1734,14 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
btrfs_cleanup_defrag_inodes(fs_info);
}
/* If we toggled discard async */
if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_resume(fs_info);
else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_cleanup(fs_info);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}
@ -1760,6 +1789,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
*/
cancel_work_sync(&fs_info->async_reclaim_work);
btrfs_discard_cleanup(fs_info);
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complaints from lockdep et al. */

View file

@ -12,6 +12,7 @@
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
#include "transaction.h"
#include "sysfs.h"
@ -338,12 +339,178 @@ static const struct attribute_group btrfs_static_feature_attr_group = {
#ifdef CONFIG_BTRFS_DEBUG
/*
* Discard statistics and tunables
*/
#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent)
/*
 * sysfs show handler: print discard_ctl.discardable_bytes (read with
 * atomic64_read()) as a decimal number followed by a newline.
 *
 * @kobj: kobject the attribute hangs off; resolved via discard_to_fs_info()
 * @a:    attribute being read (unused)
 * @buf:  output buffer, bounded by PAGE_SIZE
 *
 * Returns the value returned by snprintf().
 */
static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
					    struct kobj_attribute *a,
					    char *buf)
{
	struct btrfs_fs_info *fs_info;

	fs_info = discard_to_fs_info(kobj);

	return snprintf(buf, PAGE_SIZE, "%lld\n",
			atomic64_read(&fs_info->discard_ctl.discardable_bytes));
}
BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
/*
 * sysfs show handler: print discard_ctl.discardable_extents (read with
 * atomic_read()) as a decimal number followed by a newline.
 *
 * @kobj: kobject the attribute hangs off; resolved via discard_to_fs_info()
 * @a:    attribute being read (unused)
 * @buf:  output buffer, bounded by PAGE_SIZE
 *
 * Returns the value returned by snprintf().
 */
static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
					      struct kobj_attribute *a,
					      char *buf)
{
	struct btrfs_fs_info *fs_info;

	fs_info = discard_to_fs_info(kobj);

	return snprintf(buf, PAGE_SIZE, "%d\n",
			atomic_read(&fs_info->discard_ctl.discardable_extents));
}
BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
/*
 * sysfs show handler: print discard_ctl.discard_bitmap_bytes as a decimal
 * number followed by a newline. The field is read directly, without an
 * atomic or READ_ONCE() accessor.
 *
 * @kobj: kobject the attribute hangs off; resolved via discard_to_fs_info()
 * @a:    attribute being read (unused)
 * @buf:  output buffer, bounded by PAGE_SIZE
 *
 * Returns the value returned by snprintf().
 */
static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
					       struct kobj_attribute *a,
					       char *buf)
{
	struct btrfs_fs_info *fs_info;

	fs_info = discard_to_fs_info(kobj);

	return snprintf(buf, PAGE_SIZE, "%lld\n",
			fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
/*
 * sysfs show handler: print discard_ctl.discard_bytes_saved (read with
 * atomic64_read()) as a decimal number followed by a newline.
 *
 * @kobj: kobject the attribute hangs off; resolved via discard_to_fs_info()
 * @a:    attribute being read (unused)
 * @buf:  output buffer, bounded by PAGE_SIZE
 *
 * Returns the value returned by snprintf().
 */
static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
					      struct kobj_attribute *a,
					      char *buf)
{
	struct btrfs_fs_info *fs_info;

	fs_info = discard_to_fs_info(kobj);

	return snprintf(buf, PAGE_SIZE, "%lld\n",
			atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
}
BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%lld\n",
fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%u\n",
READ_ONCE(fs_info->discard_ctl.iops_limit));
}
/*
 * sysfs store handler for the "iops_limit" tunable.
 *
 * Parses @buf as a base-10 u32 and publishes it to discard_ctl.iops_limit with
 * WRITE_ONCE(). Returns @len on success; any parse failure is reported to the
 * writer as -EINVAL.
 */
static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
					      struct kobj_attribute *a,
					      const char *buf, size_t len)
{
	struct btrfs_discard_ctl *discard_ctl =
		&discard_to_fs_info(kobj)->discard_ctl;
	u32 iops_limit;

	if (kstrtou32(buf, 10, &iops_limit) != 0)
		return -EINVAL;

	WRITE_ONCE(discard_ctl->iops_limit, iops_limit);

	return len;
}
BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
	      btrfs_discard_iops_limit_store);
static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%u\n",
READ_ONCE(fs_info->discard_ctl.kbps_limit));
}
/*
 * sysfs store handler for the "kbps_limit" tunable.
 *
 * Parses @buf as a base-10 u32 and publishes it to discard_ctl.kbps_limit with
 * WRITE_ONCE(). Returns @len on success; any parse failure is reported to the
 * writer as -EINVAL.
 */
static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
					      struct kobj_attribute *a,
					      const char *buf, size_t len)
{
	struct btrfs_discard_ctl *discard_ctl =
		&discard_to_fs_info(kobj)->discard_ctl;
	u32 kbps_limit;

	if (kstrtou32(buf, 10, &kbps_limit) != 0)
		return -EINVAL;

	WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);

	return len;
}
BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
	      btrfs_discard_kbps_limit_store);
static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
return snprintf(buf, PAGE_SIZE, "%llu\n",
READ_ONCE(fs_info->discard_ctl.max_discard_size));
}
/*
 * sysfs store handler for the "max_discard_size" tunable.
 *
 * Parses @buf as a base-10 u64 and publishes it to
 * discard_ctl.max_discard_size with WRITE_ONCE(). Returns @len on success; any
 * parse failure is reported to the writer as -EINVAL.
 */
static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
						    struct kobj_attribute *a,
						    const char *buf, size_t len)
{
	struct btrfs_discard_ctl *discard_ctl =
		&discard_to_fs_info(kobj)->discard_ctl;
	u64 max_discard_size;

	if (kstrtou64(buf, 10, &max_discard_size) != 0)
		return -EINVAL;

	WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size);

	return len;
}
BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
	      btrfs_discard_max_discard_size_store);
/*
 * NULL-terminated list of the discard statistics and tunables defined above.
 * btrfs_sysfs_add_mounted() creates these files in the per-filesystem
 * "discard" debug directory and btrfs_sysfs_remove_mounted() removes them.
 */
static const struct attribute *discard_debug_attrs[] = {
/* Read-only statistics */
BTRFS_ATTR_PTR(discard, discardable_bytes),
BTRFS_ATTR_PTR(discard, discardable_extents),
BTRFS_ATTR_PTR(discard, discard_bitmap_bytes),
BTRFS_ATTR_PTR(discard, discard_bytes_saved),
BTRFS_ATTR_PTR(discard, discard_extent_bytes),
/* Read-write tunables */
BTRFS_ATTR_PTR(discard, iops_limit),
BTRFS_ATTR_PTR(discard, kbps_limit),
BTRFS_ATTR_PTR(discard, max_discard_size),
NULL,
};
/*
* Runtime debugging exported via sysfs
*
* /sys/fs/btrfs/debug - applies to module or all filesystems
* /sys/fs/btrfs/UUID - applies only to the given filesystem
*/
/*
 * Per-filesystem debug attributes, created in the "debug" directory of a
 * mounted filesystem. The list is currently empty (terminator only).
 */
static const struct attribute *btrfs_debug_mount_attrs[] = {
NULL,
};
/*
 * Module-wide debug attributes; currently empty (terminator only).
 * NOTE(review): presumably the attribute list behind
 * btrfs_debug_feature_attr_group - confirm against the group definition.
 */
static struct attribute *btrfs_debug_feature_attrs[] = {
NULL
};
@ -734,10 +901,10 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
{
if (fs_devs->device_dir_kobj) {
kobject_del(fs_devs->device_dir_kobj);
kobject_put(fs_devs->device_dir_kobj);
fs_devs->device_dir_kobj = NULL;
if (fs_devs->devices_kobj) {
kobject_del(fs_devs->devices_kobj);
kobject_put(fs_devs->devices_kobj);
fs_devs->devices_kobj = NULL;
}
if (fs_devs->fsid_kobj.state_initialized) {
@ -771,6 +938,19 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
#ifdef CONFIG_BTRFS_DEBUG
if (fs_info->discard_debug_kobj) {
sysfs_remove_files(fs_info->discard_debug_kobj,
discard_debug_attrs);
kobject_del(fs_info->discard_debug_kobj);
kobject_put(fs_info->discard_debug_kobj);
}
if (fs_info->debug_kobj) {
sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
kobject_del(fs_info->debug_kobj);
kobject_put(fs_info->debug_kobj);
}
#endif
addrm_unknown_feature_attrs(fs_info, false);
sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
@ -969,45 +1149,119 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct hd_struct *disk;
struct kobject *disk_kobj;
if (!fs_devices->device_dir_kobj)
if (!fs_devices->devices_kobj)
return -EINVAL;
if (one_device && one_device->bdev) {
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
if (one_device) {
if (one_device->bdev) {
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
sysfs_remove_link(fs_devices->devices_kobj,
disk_kobj->name);
}
sysfs_remove_link(fs_devices->device_dir_kobj,
disk_kobj->name);
}
kobject_del(&one_device->devid_kobj);
kobject_put(&one_device->devid_kobj);
wait_for_completion(&one_device->kobj_unregister);
if (one_device)
return 0;
}
list_for_each_entry(one_device,
&fs_devices->devices, dev_list) {
if (!one_device->bdev)
continue;
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
list_for_each_entry(one_device, &fs_devices->devices, dev_list) {
sysfs_remove_link(fs_devices->device_dir_kobj,
disk_kobj->name);
if (one_device->bdev) {
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
sysfs_remove_link(fs_devices->devices_kobj,
disk_kobj->name);
}
kobject_del(&one_device->devid_kobj);
kobject_put(&one_device->devid_kobj);
wait_for_completion(&one_device->kobj_unregister);
}
return 0;
}
int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
{
if (!fs_devs->device_dir_kobj)
fs_devs->device_dir_kobj = kobject_create_and_add("devices",
&fs_devs->fsid_kobj);
int val;
struct btrfs_device *device = container_of(kobj, struct btrfs_device,
devid_kobj);
if (!fs_devs->device_dir_kobj)
return -ENOMEM;
val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
return 0;
return snprintf(buf, PAGE_SIZE, "%d\n", val);
}
BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
/*
 * sysfs show handler for the per-device "missing" file: prints "1" when
 * BTRFS_DEV_STATE_MISSING is set in the device's dev_state, "0" otherwise.
 */
static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj,
					struct kobj_attribute *a, char *buf)
{
	struct btrfs_device *device = container_of(kobj, struct btrfs_device,
						   devid_kobj);
	const int is_set =
		test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ? 1 : 0;

	return snprintf(buf, PAGE_SIZE, "%d\n", is_set);
}
BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show);
/*
 * sysfs show handler for the per-device "replace_target" file: prints "1"
 * when BTRFS_DEV_STATE_REPLACE_TGT is set in the device's dev_state, "0"
 * otherwise.
 */
static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
						 struct kobj_attribute *a,
						 char *buf)
{
	struct btrfs_device *device = container_of(kobj, struct btrfs_device,
						   devid_kobj);
	const int is_set =
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state) ? 1 : 0;

	return snprintf(buf, PAGE_SIZE, "%d\n", is_set);
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
/*
 * sysfs show handler for the per-device "writeable" file: prints "1" when
 * BTRFS_DEV_STATE_WRITEABLE is set in the device's dev_state, "0" otherwise.
 */
static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
					    struct kobj_attribute *a, char *buf)
{
	struct btrfs_device *device = container_of(kobj, struct btrfs_device,
						   devid_kobj);
	const int is_set =
		test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ? 1 : 0;

	return snprintf(buf, PAGE_SIZE, "%d\n", is_set);
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
/*
 * Per-device state files exposed in each devid directory. Every entry reports
 * one BTRFS_DEV_STATE_* bit as 0 or 1. ATTRIBUTE_GROUPS() generates the
 * devid_groups array that devid_ktype installs as its default groups.
 */
static struct attribute *devid_attrs[] = {
BTRFS_ATTR_PTR(devid, in_fs_metadata),
BTRFS_ATTR_PTR(devid, missing),
BTRFS_ATTR_PTR(devid, replace_target),
BTRFS_ATTR_PTR(devid, writeable),
NULL
};
ATTRIBUTE_GROUPS(devid);
/*
 * kobject release callback for a device's devid kobject.
 *
 * Zeroes the embedded kobject and then completes kobj_unregister, which the
 * removal path waits on after dropping its reference.
 */
static void btrfs_release_devid_kobj(struct kobject *kobj)
{
	struct btrfs_device *dev = container_of(kobj, struct btrfs_device,
						devid_kobj);

	/* @kobj is the devid_kobj member embedded in @dev. */
	memset(kobj, 0, sizeof(*kobj));
	complete(&dev->kobj_unregister);
}
/*
 * kobject type of the per-device devid directories: attribute access goes
 * through the generic kobj_sysfs_ops, the devid_groups files are created by
 * default, and btrfs_release_devid_kobj() runs when the kobject is released.
 */
static struct kobj_type devid_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = devid_groups,
.release = btrfs_release_devid_kobj,
};
int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
@ -1016,22 +1270,31 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *dev;
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
struct hd_struct *disk;
struct kobject *disk_kobj;
if (!dev->bdev)
continue;
if (one_device && one_device != dev)
continue;
disk = dev->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
if (dev->bdev) {
struct hd_struct *disk;
struct kobject *disk_kobj;
error = sysfs_create_link(fs_devices->device_dir_kobj,
disk_kobj, disk_kobj->name);
if (error)
disk = dev->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
error = sysfs_create_link(fs_devices->devices_kobj,
disk_kobj, disk_kobj->name);
if (error)
break;
}
init_completion(&dev->kobj_unregister);
error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
fs_devices->devices_kobj, "%llu",
dev->devid);
if (error) {
kobject_put(&dev->devid_kobj);
break;
}
}
return error;
@ -1063,27 +1326,49 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
"sysfs: failed to create fsid for sprout");
}
/*
 * Rename the device's devid kobject so that its name is the decimal form of
 * the current device->devid. A failed rename is not propagated to the caller;
 * it is only reported with a warning.
 */
void btrfs_sysfs_update_devid(struct btrfs_device *device)
{
	char devid_name[24];
	int ret;

	snprintf(devid_name, sizeof(devid_name), "%llu", device->devid);

	ret = kobject_rename(&device->devid_kobj, devid_name);
	if (!ret)
		return;

	btrfs_warn(device->fs_devices->fs_info,
		   "sysfs: failed to update devid for %llu",
		   device->devid);
}
/* /sys/fs/btrfs/ entry */
static struct kset *btrfs_kset;
/*
* Creates:
* /sys/fs/btrfs/UUID
*
* Can be called by the device discovery thread.
* And parent can be specified for seed device
*/
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
struct kobject *parent)
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
{
int error;
init_completion(&fs_devs->kobj_unregister);
fs_devs->fsid_kobj.kset = btrfs_kset;
error = kobject_init_and_add(&fs_devs->fsid_kobj,
&btrfs_ktype, parent, "%pU", fs_devs->fsid);
error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
"%pU", fs_devs->fsid);
if (error) {
kobject_put(&fs_devs->fsid_kobj);
return error;
}
fs_devs->devices_kobj = kobject_create_and_add("devices",
&fs_devs->fsid_kobj);
if (!fs_devs->devices_kobj) {
btrfs_err(fs_devs->fs_info,
"failed to init sysfs device interface");
kobject_put(&fs_devs->fsid_kobj);
return -ENOMEM;
}
return 0;
}
@ -1111,8 +1396,26 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
goto failure;
#ifdef CONFIG_BTRFS_DEBUG
error = sysfs_create_group(fsid_kobj,
&btrfs_debug_feature_attr_group);
fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj);
if (!fs_info->debug_kobj) {
error = -ENOMEM;
goto failure;
}
error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
if (error)
goto failure;
/* Discard directory */
fs_info->discard_debug_kobj = kobject_create_and_add("discard",
fs_info->debug_kobj);
if (!fs_info->discard_debug_kobj) {
error = -ENOMEM;
goto failure;
}
error = sysfs_create_files(fs_info->discard_debug_kobj,
discard_debug_attrs);
if (error)
goto failure;
#endif
@ -1209,6 +1512,9 @@ void __cold btrfs_exit_sysfs(void)
sysfs_unmerge_group(&btrfs_kset->kobj,
&btrfs_static_feature_attr_group);
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
#ifdef CONFIG_BTRFS_DEBUG
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
#endif
kset_unregister(btrfs_kset);
}

View file

@ -18,9 +18,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
struct kobject *parent);
int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
const u8 *fsid);
@ -36,5 +34,6 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache);
int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info);
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
void btrfs_sysfs_update_devid(struct btrfs_device *device);
#endif

View file

@ -86,6 +86,27 @@ static void btrfs_destroy_test_fs(void)
unregister_filesystem(&test_type);
}
/*
 * Allocate a zeroed dummy btrfs_device for the self tests and link it into
 * fs_info->fs_devices->devices.
 *
 * Returns the new device, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *device = kzalloc(sizeof(*device), GFP_KERNEL);

	if (device == NULL)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&device->dev_list);
	extent_io_tree_init(NULL, &device->alloc_state, 0, NULL);
	list_add(&device->dev_list, &fs_info->fs_devices->devices);

	return device;
}
/*
 * Release a device created by btrfs_alloc_dummy_device(): drop its alloc_state
 * io tree and free the structure. The device is not unlinked from the devices
 * list here; the visible caller iterates with list_for_each_entry_safe().
 */
static void btrfs_free_dummy_device(struct btrfs_device *dev)
{
extent_io_tree_release(&dev->alloc_state);
kfree(dev);
}
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
{
struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
@ -132,12 +153,14 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->fs_devices->devices);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
extent_map_tree_init(&fs_info->mapping_tree);
fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
@ -150,6 +173,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
struct radix_tree_iter iter;
void **slot;
struct btrfs_device *dev, *tmp;
if (!fs_info)
return;
@ -180,6 +204,11 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->buffer_lock);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
dev_list) {
btrfs_free_dummy_device(dev);
}
btrfs_free_qgroup_config(fs_info);
btrfs_free_fs_roots(fs_info);
cleanup_srcu_struct(&fs_info->subvol_srcu);

View file

@ -46,6 +46,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt
void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info);
#else
static inline int btrfs_run_sanity_tests(void)
{

View file

@ -6,6 +6,9 @@
#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../volumes.h"
#include "../disk-io.h"
#include "../block-group.h"
static void free_extent_map_tree(struct extent_map_tree *em_tree)
{
@ -437,11 +440,153 @@ static int test_case_4(struct btrfs_fs_info *fs_info,
return ret;
}
/* Description of one btrfs_rmap_block() test case, see test_rmap_block(). */
struct rmap_test_vector {
/* Block group profile stored in map->type (0 is used for SINGLE) */
u64 raid_type;
/* NOTE(review): not read by test_rmap_block() - confirm whether needed */
u64 physical_start;
/* Size of one data stripe; the chunk length is this times num_data_stripes */
u64 data_stripe_size;
u64 num_data_stripes;
/* Number of stripes (and dummy devices) set up in the map_lookup */
u64 num_stripes;
/* Assume we won't have more than 5 physical stripes */
/* Physical start of each stripe, indexed by stripe number */
u64 data_stripe_phys_start[5];
/* Compared against the number of addresses btrfs_rmap_block() returns */
bool expected_mapped_addr;
/* Physical to logical addresses */
u64 mapped_logical[5];
};
/*
 * Run one btrfs_rmap_block() test case.
 *
 * Builds the chunk mapping described by @test (logical start 4GiB, one dummy
 * device per stripe), inserts it into fs_info->mapping_tree and checks what
 * btrfs_rmap_block() reports for the physical offset btrfs_sb_offset(1).
 *
 * Returns 0 if the result matches the test vector, a negative errno if the
 * setup or the rmap call fails, and -EINVAL if the reverse mapping does not
 * match what the vector expects.
 */
static int test_rmap_block(struct btrfs_fs_info *fs_info,
			   struct rmap_test_vector *test)
{
	struct extent_map *em;
	struct map_lookup *map = NULL;
	u64 *logical = NULL;
	int i, out_ndaddrs, out_stripe_len;
	int ret;

	em = alloc_extent_map();
	if (!em) {
		test_std_err(TEST_ALLOC_EXTENT_MAP);
		return -ENOMEM;
	}

	map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
	if (!map) {
		/*
		 * Drop the reference the same way every other path in this
		 * function releases @em.
		 */
		free_extent_map(em);
		test_std_err(TEST_ALLOC_EXTENT_MAP);
		return -ENOMEM;
	}

	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
	/* Start at 4GiB logical address */
	em->start = SZ_4G;
	em->len = test->data_stripe_size * test->num_data_stripes;
	em->block_len = em->len;
	em->orig_block_len = test->data_stripe_size;
	em->map_lookup = map;

	map->num_stripes = test->num_stripes;
	map->stripe_len = BTRFS_STRIPE_LEN;
	map->type = test->raid_type;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *dev = btrfs_alloc_dummy_device(fs_info);

		if (IS_ERR(dev)) {
			test_err("cannot allocate device");
			ret = PTR_ERR(dev);
			/*
			 * The mapping has not been inserted yet, so there is
			 * nothing to remove and only our own reference to
			 * drop.
			 */
			goto out_free;
		}
		map->stripes[i].dev = dev;
		map->stripes[i].physical = test->data_stripe_phys_start[i];
	}

	write_lock(&fs_info->mapping_tree.lock);
	ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
	write_unlock(&fs_info->mapping_tree.lock);
	if (ret) {
		test_err("error adding block group mapping to mapping tree");
		goto out_free;
	}

	ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
			       &logical, &out_ndaddrs, &out_stripe_len);
	if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
		test_err("didn't rmap anything but expected %d",
			 test->expected_mapped_addr);
		/* A successful call with a wrong result is still a failure */
		if (!ret)
			ret = -EINVAL;
		goto out;
	}

	/* From here on any mismatch must be reported as a test failure */
	ret = -EINVAL;

	if (out_stripe_len != BTRFS_STRIPE_LEN) {
		test_err("calculated stripe length doesn't match");
		goto out;
	}

	if (out_ndaddrs != test->expected_mapped_addr) {
		for (i = 0; i < out_ndaddrs; i++)
			test_msg("mapped %llu", logical[i]);
		test_err("unexpected number of mapped addresses: %d", out_ndaddrs);
		goto out;
	}

	for (i = 0; i < out_ndaddrs; i++) {
		if (logical[i] != test->mapped_logical[i]) {
			test_err("unexpected logical address mapped");
			goto out;
		}
	}

	ret = 0;
out:
	write_lock(&fs_info->mapping_tree.lock);
	remove_extent_mapping(&fs_info->mapping_tree, em);
	write_unlock(&fs_info->mapping_tree.lock);
	/* Drop the reference that was held on behalf of the tree */
	free_extent_map(em);
out_free:
	/* Drop our own reference from alloc_extent_map() */
	free_extent_map(em);
	kfree(logical);
	return ret;
}
int btrfs_test_extent_map(void)
{
struct btrfs_fs_info *fs_info = NULL;
struct extent_map_tree *em_tree;
int ret = 0;
int ret = 0, i;
struct rmap_test_vector rmap_tests[] = {
{
/*
* Test that a chunk with 2 data stripes, one of which
* intersects the physical address of the super block,
* is correctly recognised.
*/
.raid_type = BTRFS_BLOCK_GROUP_RAID1,
.physical_start = SZ_64M - SZ_4M,
.data_stripe_size = SZ_256M,
.num_data_stripes = 2,
.num_stripes = 2,
.data_stripe_phys_start =
{SZ_64M - SZ_4M, SZ_64M - SZ_4M + SZ_256M},
.expected_mapped_addr = true,
.mapped_logical= {SZ_4G + SZ_4M}
},
{
/*
* Test that out-of-range physical addresses are
* ignored
*/
/* SINGLE chunk type */
.raid_type = 0,
.physical_start = SZ_4G,
.data_stripe_size = SZ_256M,
.num_data_stripes = 1,
.num_stripes = 1,
.data_stripe_phys_start = {SZ_256M},
.expected_mapped_addr = false,
.mapped_logical = {0}
}
};
test_msg("running extent_map tests");
@ -474,6 +619,13 @@ int btrfs_test_extent_map(void)
goto out;
ret = test_case_4(fs_info, em_tree);
test_msg("running rmap tests");
for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) {
ret = test_rmap_block(fs_info, &rmap_tests[i]);
if (ret)
goto out;
}
out:
kfree(em_tree);
btrfs_free_dummy_fs_info(fs_info);

View file

@ -263,7 +263,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* First with no extents */
BTRFS_I(inode)->root = root;
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize);
if (IS_ERR(em)) {
em = NULL;
test_err("got an error when we shouldn't have");
@ -283,7 +283,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
*/
setup_file_extents(root, sectorsize);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -333,7 +333,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -356,7 +356,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Regular extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -384,7 +384,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* The next 3 are split extents */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -413,7 +413,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -435,7 +435,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -469,7 +469,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -498,7 +498,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* The next 3 are a half written prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -528,7 +528,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -561,7 +561,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -596,7 +596,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Now for the compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -630,7 +630,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Split compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -665,7 +665,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -727,8 +727,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* A hole between regular extents but no hole extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6,
sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -755,7 +754,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -788,7 +787,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -872,7 +871,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
insert_inode_item_key(root);
insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@ -894,8 +893,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
}
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize,
2 * sectorsize, 0);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;

View file

@ -147,13 +147,14 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
}
}
static noinline void switch_commit_roots(struct btrfs_transaction *trans)
static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(root, tmp, &trans->switch_commits,
list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
dirty_list) {
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
@ -165,16 +166,17 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
}
/* We can free old roots now. */
spin_lock(&trans->dropped_roots_lock);
while (!list_empty(&trans->dropped_roots)) {
root = list_first_entry(&trans->dropped_roots,
spin_lock(&cur_trans->dropped_roots_lock);
while (!list_empty(&cur_trans->dropped_roots)) {
root = list_first_entry(&cur_trans->dropped_roots,
struct btrfs_root, root_list);
list_del_init(&root->root_list);
spin_unlock(&trans->dropped_roots_lock);
spin_unlock(&cur_trans->dropped_roots_lock);
btrfs_free_log(trans, root);
btrfs_drop_and_free_fs_root(fs_info, root);
spin_lock(&trans->dropped_roots_lock);
spin_lock(&cur_trans->dropped_roots_lock);
}
spin_unlock(&trans->dropped_roots_lock);
spin_unlock(&cur_trans->dropped_roots_lock);
up_write(&fs_info->commit_root_sem);
}
@ -1421,7 +1423,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
ret = commit_cowonly_roots(trans);
if (ret)
goto out;
switch_commit_roots(trans->transaction);
switch_commit_roots(trans);
ret = btrfs_write_and_wait_transaction(trans);
if (ret)
btrfs_handle_fs_error(fs_info, ret,
@ -2013,6 +2015,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ASSERT(refcount_read(&trans->use_count) == 1);
/*
* Some places just start a transaction to commit it. We need to make
* sure that if this commit fails that the abort code actually marks the
* transaction as failed, so set trans->dirty to make the abort code do
* the right thing.
*/
trans->dirty = true;
/* Stop the commit early if ->aborted is set */
if (unlikely(READ_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
@ -2301,7 +2311,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
switch_commit_roots(cur_trans);
switch_commit_roots(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));

View file

@ -373,6 +373,104 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
return 0;
}
/*
 * Inode item error output has the same format as dir_item_err(), so this is a
 * thin alias that forwards all of its arguments unchanged.
 *
 * NOTE(review): __VA_ARGS__ is forwarded without the "##" comma-swallowing
 * form, so callers presumably have to pass at least one argument after @fmt.
 */
#define inode_item_err(eb, slot, fmt, ...) \
dir_item_err(eb, slot, fmt, __VA_ARGS__)
/*
 * Validate a key that is expected to refer to an inode.
 *
 * @key is either the key of an INODE_ITEM itself or a location key stored in
 * another item; the type of the item at @slot decides which one it is and
 * which error prefix is used.
 *
 * Returns 0 if the key is acceptable, -EUCLEAN otherwise.
 */
static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
			   int slot)
{
	struct btrfs_key item_key;
	bool is_inode_item;
	bool objectid_valid;

	btrfs_item_key_to_cpu(leaf, &item_key, slot);

	/* For XATTR_ITEM, location key should be all 0 */
	if (item_key.type == BTRFS_XATTR_ITEM_KEY) {
		if (key->type == 0 && key->objectid == 0 && key->offset == 0)
			return 0;
		return -EUCLEAN;
	}

	is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY);

	/*
	 * Accepted objectids: the regular free objectid range, the root tree
	 * directory and the free inode cache inode.
	 */
	objectid_valid = (key->objectid >= BTRFS_FIRST_FREE_OBJECTID &&
			  key->objectid <= BTRFS_LAST_FREE_OBJECTID) ||
			 key->objectid == BTRFS_ROOT_TREE_DIR_OBJECTID ||
			 key->objectid == BTRFS_FREE_INO_OBJECTID;
	if (!objectid_valid) {
		if (is_inode_item) {
			generic_err(leaf, slot,
	"invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
				key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
				BTRFS_FIRST_FREE_OBJECTID,
				BTRFS_LAST_FREE_OBJECTID,
				BTRFS_FREE_INO_OBJECTID);
		} else {
			dir_item_err(leaf, slot,
"invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
				key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
				BTRFS_FIRST_FREE_OBJECTID,
				BTRFS_LAST_FREE_OBJECTID,
				BTRFS_FREE_INO_OBJECTID);
		}
		return -EUCLEAN;
	}

	/* The offset of an inode key is required to be 0 */
	if (key->offset == 0)
		return 0;

	if (is_inode_item)
		inode_item_err(leaf, slot,
			       "invalid key offset: has %llu expect 0",
			       key->offset);
	else
		dir_item_err(leaf, slot,
			     "invalid location key offset:has %llu expect 0",
			     key->offset);
	return -EUCLEAN;
}
/*
 * Validate a key that is expected to refer to a root.
 *
 * @key is either the key of a ROOT_ITEM itself or a location key stored in
 * another item; the type of the item at @slot decides which one it is and
 * which error prefix is used.
 *
 * Returns 0 if the key is acceptable, -EUCLEAN otherwise.
 */
static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
			  int slot)
{
	struct btrfs_key item_key;
	bool is_root_item;

	btrfs_item_key_to_cpu(leaf, &item_key, slot);
	is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);

	/* No such tree id */
	if (key->objectid == 0) {
		if (!is_root_item)
			dir_item_err(leaf, slot,
				     "invalid location key root id 0");
		else
			generic_err(leaf, slot, "invalid root id 0");
		return -EUCLEAN;
	}

	/* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
	if (!(is_fstree(key->objectid) || is_root_item)) {
		dir_item_err(leaf, slot,
		"invalid location key objectid, have %llu expect [%llu, %llu]",
			     key->objectid, BTRFS_FIRST_FREE_OBJECTID,
			     BTRFS_LAST_FREE_OBJECTID);
		return -EUCLEAN;
	}

	/*
	 * ROOT_ITEM with non-zero offset means this is a snapshot, created at
	 * @offset transid.
	 * Furthermore, for location key in DIR_ITEM, its offset is always -1.
	 *
	 * So here we only check offset for reloc tree whose key->offset must
	 * be a valid tree.
	 */
	if (key->objectid != BTRFS_TREE_RELOC_OBJECTID || key->offset != 0)
		return 0;

	generic_err(leaf, slot, "invalid root id 0 for reloc tree");
	return -EUCLEAN;
}
static int check_dir_item(struct extent_buffer *leaf,
struct btrfs_key *key, struct btrfs_key *prev_key,
int slot)
@ -386,12 +484,14 @@ static int check_dir_item(struct extent_buffer *leaf,
return -EUCLEAN;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
while (cur < item_size) {
struct btrfs_key location_key;
u32 name_len;
u32 data_len;
u32 max_name_len;
u32 total_size;
u32 name_hash;
u8 dir_type;
int ret;
/* header itself should not cross item boundary */
if (cur + sizeof(*di) > item_size) {
@ -401,6 +501,25 @@ static int check_dir_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
/* Location key check */
btrfs_dir_item_key_to_cpu(leaf, di, &location_key);
if (location_key.type == BTRFS_ROOT_ITEM_KEY) {
ret = check_root_key(leaf, &location_key, slot);
if (ret < 0)
return ret;
} else if (location_key.type == BTRFS_INODE_ITEM_KEY ||
location_key.type == 0) {
ret = check_inode_key(leaf, &location_key, slot);
if (ret < 0)
return ret;
} else {
dir_item_err(leaf, slot,
"invalid location key type, have %u, expect %u or %u",
location_key.type, BTRFS_ROOT_ITEM_KEY,
BTRFS_INODE_ITEM_KEY);
return -EUCLEAN;
}
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
if (dir_type >= BTRFS_FT_MAX) {
@ -738,6 +857,44 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
return 0;
}
/*
 * Chunk item checker for chunk items stored in a leaf.
 *
 * btrfs_check_chunk_valid() must also work on the super block
 * sys_chunk_array, which has no full item pointer, so it cannot verify the
 * item size. This wrapper performs the item size checks first and then hands
 * over to btrfs_check_chunk_valid() for everything else.
 *
 * Return -EUCLEAN if the item size is wrong, otherwise whatever
 * btrfs_check_chunk_valid() returns.
 */
static int check_leaf_chunk_item(struct extent_buffer *leaf,
				 struct btrfs_chunk *chunk,
				 struct btrfs_key *key, int slot)
{
	int stripes;

	/* The item must be able to hold at least the fixed chunk header. */
	if (btrfs_item_size_nr(leaf, slot) < sizeof(*chunk)) {
		chunk_err(leaf, chunk, key->offset,
			"invalid chunk item size: have %u expect [%zu, %u)",
			btrfs_item_size_nr(leaf, slot),
			sizeof(struct btrfs_chunk),
			BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
		return -EUCLEAN;
	}

	stripes = btrfs_chunk_num_stripes(leaf, chunk);

	/*
	 * A stripe count of 0 is reported by btrfs_check_chunk_valid(), so the
	 * exact size comparison is only done for a non-zero count.
	 */
	if (stripes != 0 &&
	    btrfs_item_size_nr(leaf, slot) != btrfs_chunk_item_size(stripes)) {
		chunk_err(leaf, chunk, key->offset,
			"invalid chunk item size: have %u expect %lu",
			btrfs_item_size_nr(leaf, slot),
			btrfs_chunk_item_size(stripes));
		return -EUCLEAN;
	}

	return btrfs_check_chunk_valid(leaf, chunk, key->offset);
}
__printf(3, 4)
__cold
static void dev_item_err(const struct extent_buffer *eb, int slot,
@ -801,7 +958,7 @@ static int check_dev_item(struct extent_buffer *leaf,
}
/* Inode item error output has the same format as dir_item_err() */
#define inode_item_err(fs_info, eb, slot, fmt, ...) \
#define inode_item_err(eb, slot, fmt, ...) \
dir_item_err(eb, slot, fmt, __VA_ARGS__)
static int check_inode_item(struct extent_buffer *leaf,
@ -812,30 +969,17 @@ static int check_inode_item(struct extent_buffer *leaf,
u64 super_gen = btrfs_super_generation(fs_info->super_copy);
u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
u32 mode;
int ret;
ret = check_inode_key(leaf, key, slot);
if (ret < 0)
return ret;
if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
key->objectid != BTRFS_FREE_INO_OBJECTID) {
generic_err(leaf, slot,
"invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
BTRFS_FIRST_FREE_OBJECTID,
BTRFS_LAST_FREE_OBJECTID,
BTRFS_FREE_INO_OBJECTID);
return -EUCLEAN;
}
if (key->offset != 0) {
inode_item_err(fs_info, leaf, slot,
"invalid key offset: has %llu expect 0",
key->offset);
return -EUCLEAN;
}
iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
/* Here we use super block generation + 1 to handle log tree */
if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
inode_item_err(fs_info, leaf, slot,
inode_item_err(leaf, slot,
"invalid inode generation: has %llu expect (0, %llu]",
btrfs_inode_generation(leaf, iitem),
super_gen + 1);
@ -843,7 +987,7 @@ static int check_inode_item(struct extent_buffer *leaf,
}
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
inode_item_err(fs_info, leaf, slot,
inode_item_err(leaf, slot,
"invalid inode generation: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
return -EUCLEAN;
@ -856,7 +1000,7 @@ static int check_inode_item(struct extent_buffer *leaf,
*/
mode = btrfs_inode_mode(leaf, iitem);
if (mode & ~valid_mask) {
inode_item_err(fs_info, leaf, slot,
inode_item_err(leaf, slot,
"unknown mode bit detected: 0x%x",
mode & ~valid_mask);
return -EUCLEAN;
@ -869,20 +1013,20 @@ static int check_inode_item(struct extent_buffer *leaf,
*/
if (!has_single_bit_set(mode & S_IFMT)) {
if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
inode_item_err(fs_info, leaf, slot,
inode_item_err(leaf, slot,
"invalid mode: has 0%o expect valid S_IF* bit(s)",
mode & S_IFMT);
return -EUCLEAN;
}
}
if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) {
inode_item_err(fs_info, leaf, slot,
inode_item_err(leaf, slot,
"invalid nlink: has %u expect no more than 1 for dir",
btrfs_inode_nlink(leaf, iitem));
return -EUCLEAN;
}
if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) {
inode_item_err(fs_info, leaf, slot,
inode_item_err(leaf, slot,
"unknown flags detected: 0x%llx",
btrfs_inode_flags(leaf, iitem) &
~BTRFS_INODE_FLAG_MASK);
@ -898,22 +1042,11 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
struct btrfs_root_item ri;
const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
BTRFS_ROOT_SUBVOL_DEAD;
int ret;
/* No such tree id */
if (key->objectid == 0) {
generic_err(leaf, slot, "invalid root id 0");
return -EUCLEAN;
}
/*
* Some older kernel may create ROOT_ITEM with non-zero offset, so here
* we only check offset for reloc tree whose key->offset must be a
* valid tree.
*/
if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
generic_err(leaf, slot, "invalid root id 0 for reloc tree");
return -EUCLEAN;
}
ret = check_root_key(leaf, key, slot);
if (ret < 0)
return ret;
if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
generic_err(leaf, slot,
@ -1302,8 +1435,8 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
return 0;
}
#define inode_ref_err(fs_info, eb, slot, fmt, args...) \
inode_item_err(fs_info, eb, slot, fmt, ##args)
#define inode_ref_err(eb, slot, fmt, args...) \
inode_item_err(eb, slot, fmt, ##args)
static int check_inode_ref(struct extent_buffer *leaf,
struct btrfs_key *key, struct btrfs_key *prev_key,
int slot)
@ -1316,7 +1449,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
return -EUCLEAN;
/* namelen can't be 0, so item_size == sizeof() is also invalid */
if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) {
inode_ref_err(fs_info, leaf, slot,
inode_ref_err(leaf, slot,
"invalid item size, have %u expect (%zu, %u)",
btrfs_item_size_nr(leaf, slot),
sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
@ -1329,7 +1462,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
u16 namelen;
if (ptr + sizeof(iref) > end) {
inode_ref_err(fs_info, leaf, slot,
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
ptr, end, sizeof(iref));
return -EUCLEAN;
@ -1338,7 +1471,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
iref = (struct btrfs_inode_ref *)ptr;
namelen = btrfs_inode_ref_name_len(leaf, iref);
if (ptr + sizeof(*iref) + namelen > end) {
inode_ref_err(fs_info, leaf, slot,
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu namelen %u",
ptr, end, namelen);
return -EUCLEAN;
@ -1384,7 +1517,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
break;
case BTRFS_CHUNK_ITEM_KEY:
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
ret = btrfs_check_chunk_valid(leaf, chunk, key->offset);
ret = check_leaf_chunk_item(leaf, chunk, key, slot);
break;
case BTRFS_DEV_ITEM_KEY:
ret = check_dev_item(leaf, key, slot);

View file

@ -2674,14 +2674,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
u32 blocksize;
int ret = 0;
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
while (*level > 0) {
struct btrfs_key first_key;
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
cur = path->nodes[*level];
WARN_ON(btrfs_header_level(cur) != *level);
@ -2732,9 +2727,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root_owner !=
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(
fs_info, bytenr,
blocksize);
ret = btrfs_pin_reserved_extent(fs_info,
bytenr, blocksize);
if (ret) {
free_extent_buffer(next);
return ret;
@ -2749,7 +2743,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return ret;
}
WARN_ON(*level <= 0);
if (path->nodes[*level-1])
free_extent_buffer(path->nodes[*level-1]);
path->nodes[*level-1] = next;
@ -2757,9 +2750,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level] = 0;
cond_resched();
}
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
cond_resched();
@ -2815,8 +2805,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
}
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(
fs_info,
ret = btrfs_pin_reserved_extent(fs_info,
path->nodes[*level]->start,
path->nodes[*level]->len);
if (ret)
@ -2896,10 +2885,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
clear_extent_buffer_dirty(next);
}
WARN_ON(log->root_key.objectid !=
BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(fs_info,
next->start, next->len);
ret = btrfs_pin_reserved_extent(fs_info, next->start,
next->len);
if (ret)
goto out;
}
@ -3935,7 +3922,7 @@ static int log_csums(struct btrfs_trans_handle *trans,
static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *dst_path,
struct btrfs_path *src_path, u64 *last_extent,
struct btrfs_path *src_path,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
@ -3946,7 +3933,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item *extent;
struct btrfs_inode_item *inode_item;
struct extent_buffer *src = src_path->nodes[0];
struct btrfs_key first_key, last_key, key;
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@ -3954,9 +3940,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int i;
struct list_head ordered_sums;
int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
bool has_extents = false;
bool need_find_last_extent = true;
bool done = false;
INIT_LIST_HEAD(&ordered_sums);
@ -3965,8 +3948,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
if (!ins_data)
return -ENOMEM;
first_key.objectid = (u64)-1;
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
@ -3987,9 +3968,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset = btrfs_item_ptr_offset(src, start_slot + i);
if (i == nr - 1)
last_key = ins_keys[i];
if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
inode_item = btrfs_item_ptr(dst_path->nodes[0],
dst_path->slots[0],
@ -4003,20 +3981,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset, ins_sizes[i]);
}
/*
* We set need_find_last_extent here in case we know we were
* processing other items and then walk into the first extent in
* the inode. If we don't hit an extent then nothing changes,
* we'll do the last search the next time around.
*/
if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
has_extents = true;
if (first_key.objectid == (u64)-1)
first_key = ins_keys[i];
} else {
need_find_last_extent = false;
}
/* take a reference on file data extents so that truncates
* or deletes of this inode don't have to relog the inode
* again
@ -4082,167 +4046,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
kfree(sums);
}
if (!has_extents)
return ret;
if (need_find_last_extent && *last_extent == first_key.offset) {
/*
* We don't have any leafs between our current one and the one
* we processed before that can have file extent items for our
* inode (and have a generation number smaller than our current
* transaction id).
*/
need_find_last_extent = false;
}
/*
* Because we use btrfs_search_forward we could skip leaves that were
* not modified and then assume *last_extent is valid when it really
* isn't. So back up to the previous leaf and read the end of the last
* extent before we go and fill in holes.
*/
if (need_find_last_extent) {
u64 len;
ret = btrfs_prev_leaf(inode->root, src_path);
if (ret < 0)
return ret;
if (ret)
goto fill_holes;
if (src_path->slots[0])
src_path->slots[0]--;
src = src_path->nodes[0];
btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY)
goto fill_holes;
extent = btrfs_item_ptr(src, src_path->slots[0],
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(src, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
len = btrfs_file_extent_ram_bytes(src, extent);
*last_extent = ALIGN(key.offset + len,
fs_info->sectorsize);
} else {
len = btrfs_file_extent_num_bytes(src, extent);
*last_extent = key.offset + len;
}
}
fill_holes:
/* So we did prev_leaf, now we need to move to the next leaf, but a few
* things could have happened
*
* 1) A merge could have happened, so we could currently be on a leaf
* that holds what we were copying in the first place.
* 2) A split could have happened, and now not all of the items we want
* are on the same leaf.
*
* So we need to adjust how we search for holes, we need to drop the
* path and re-search for the first extent key we found, and then walk
* forward until we hit the last one we copied.
*/
if (need_find_last_extent) {
/* btrfs_prev_leaf could return 1 without releasing the path */
btrfs_release_path(src_path);
ret = btrfs_search_slot(NULL, inode->root, &first_key,
src_path, 0, 0);
if (ret < 0)
return ret;
ASSERT(ret == 0);
src = src_path->nodes[0];
i = src_path->slots[0];
} else {
i = start_slot;
}
/*
* Ok so here we need to go through and fill in any holes we may have
* to make sure that holes are punched for those areas in case they had
* extents previously.
*/
while (!done) {
u64 offset, len;
u64 extent_end;
if (i >= btrfs_header_nritems(src_path->nodes[0])) {
ret = btrfs_next_leaf(inode->root, src_path);
if (ret < 0)
return ret;
ASSERT(ret == 0);
src = src_path->nodes[0];
i = 0;
need_find_last_extent = true;
}
btrfs_item_key_to_cpu(src, &key, i);
if (!btrfs_comp_cpu_keys(&key, &last_key))
done = true;
if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
i++;
continue;
}
extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(src, extent) ==
BTRFS_FILE_EXTENT_INLINE) {
len = btrfs_file_extent_ram_bytes(src, extent);
extent_end = ALIGN(key.offset + len,
fs_info->sectorsize);
} else {
len = btrfs_file_extent_num_bytes(src, extent);
extent_end = key.offset + len;
}
i++;
if (*last_extent == key.offset) {
*last_extent = extent_end;
continue;
}
offset = *last_extent;
len = key.offset - *last_extent;
ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
offset, 0, 0, len, 0, len, 0, 0, 0);
if (ret)
break;
*last_extent = extent_end;
}
/*
* Check if there is a hole between the last extent found in our leaf
* and the first extent in the next leaf. If there is one, we need to
* log an explicit hole so that at replay time we can punch the hole.
*/
if (ret == 0 &&
key.objectid == btrfs_ino(inode) &&
key.type == BTRFS_EXTENT_DATA_KEY &&
i == btrfs_header_nritems(src_path->nodes[0])) {
ret = btrfs_next_leaf(inode->root, src_path);
need_find_last_extent = true;
if (ret > 0) {
ret = 0;
} else if (ret == 0) {
btrfs_item_key_to_cpu(src_path->nodes[0], &key,
src_path->slots[0]);
if (key.objectid == btrfs_ino(inode) &&
key.type == BTRFS_EXTENT_DATA_KEY &&
*last_extent < key.offset) {
const u64 len = key.offset - *last_extent;
ret = btrfs_insert_file_extent(trans, log,
btrfs_ino(inode),
*last_extent, 0,
0, len, 0, len,
0, 0, 0);
*last_extent += len;
}
}
}
/*
* Need to let the callers know we dropped the path so they should
* re-search.
*/
if (!ret && need_find_last_extent)
ret = 1;
return ret;
}
@ -4407,7 +4210,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
struct btrfs_path *dst_path = NULL;
u64 last_extent = (u64)-1;
bool dropped_extents = false;
int ins_nr = 0;
int start_slot;
int ret;
@ -4429,8 +4232,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
&last_extent, start_slot,
ins_nr, 1, 0);
start_slot, ins_nr, 1, 0);
if (ret < 0)
goto out;
ins_nr = 0;
@ -4454,8 +4256,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
path->slots[0]++;
continue;
}
if (last_extent == (u64)-1) {
last_extent = key.offset;
if (!dropped_extents) {
/*
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
@ -4469,6 +4270,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
} while (ret == -EAGAIN);
if (ret)
goto out;
dropped_extents = true;
}
if (ins_nr == 0)
start_slot = slot;
@ -4483,7 +4285,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
}
}
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path, &last_extent,
ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
if (ret > 0)
ret = 0;
@ -4670,13 +4472,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
if (slot >= nritems) {
if (ins_nr > 0) {
u64 last_extent = 0;
ret = copy_items(trans, inode, dst_path, path,
&last_extent, start_slot,
ins_nr, 1, 0);
/* can't be 1, extent items aren't processed */
ASSERT(ret <= 0);
start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
ins_nr = 0;
@ -4700,13 +4497,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
cond_resched();
}
if (ins_nr > 0) {
u64 last_extent = 0;
ret = copy_items(trans, inode, dst_path, path,
&last_extent, start_slot,
ins_nr, 1, 0);
/* can't be 1, extent items aren't processed */
ASSERT(ret <= 0);
start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
}
@ -4715,100 +4507,119 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
}
/*
* If the no holes feature is enabled we need to make sure any hole between the
* last extent and the i_size of our inode is explicitly marked in the log. This
* is to make sure that doing something like:
*
* 1) create file with 128Kb of data
* 2) truncate file to 64Kb
* 3) truncate file to 256Kb
* 4) fsync file
* 5) <crash/power failure>
* 6) mount fs and trigger log replay
*
* Will give us a file with a size of 256Kb, the first 64Kb of data match what
* the file had in its first 64Kb of data at step 1 and the last 192Kb of the
* file correspond to a hole. The presence of explicit holes in a log tree is
* what guarantees that log replay will remove/adjust file extent items in the
* fs/subvol tree.
*
* Here we do not need to care about holes between extents, that is already done
* by copy_items(). We also only need to do this in the full sync path, where we
* lookup for extents from the fs/subvol tree only. In the fast path case, we
* lookup the list of modified extent maps and if any represents a hole, we
* insert a corresponding extent representing a hole in the log tree.
* When using the NO_HOLES feature if we punched a hole that causes the
* deletion of entire leafs or all the extent items of the first leaf (the one
* that contains the inode item and references) we may end up not processing
* any extents, because there are no leafs with a generation matching the
* current transaction that have extent items for our inode. So we need to find
* if any holes exist and then log them. We also need to log holes after any
* truncate operation that changes the inode's size.
*/
static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
struct btrfs_key key;
u64 hole_start;
u64 hole_size;
struct extent_buffer *leaf;
struct btrfs_root *log = root->log_root;
const u64 ino = btrfs_ino(inode);
const u64 i_size = i_size_read(&inode->vfs_inode);
u64 prev_extent_end = 0;
int ret;
if (!btrfs_fs_incompat(fs_info, NO_HOLES))
if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
return 0;
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = (u64)-1;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
ASSERT(ret != 0);
if (ret < 0)
return ret;
ASSERT(path->slots[0] > 0);
path->slots[0]--;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
/* inode does not have any extents */
hole_start = 0;
hole_size = i_size;
} else {
while (true) {
struct btrfs_file_extent_item *extent;
struct extent_buffer *leaf = path->nodes[0];
u64 len;
/*
* If there's an extent beyond i_size, an explicit hole was
* already inserted by copy_items().
*/
if (key.offset >= i_size)
return 0;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
if (ret > 0) {
ret = 0;
break;
}
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
break;
/* We have a hole, log it. */
if (prev_extent_end < key.offset) {
const u64 hole_len = key.offset - prev_extent_end;
/*
* Release the path to avoid deadlocks with other code
* paths that search the root while holding locks on
* leafs from the log root.
*/
btrfs_release_path(path);
ret = btrfs_insert_file_extent(trans, root->log_root,
ino, prev_extent_end, 0,
0, hole_len, 0, hole_len,
0, 0, 0);
if (ret < 0)
return ret;
/*
* Search for the same key again in the root. Since it's
* an extent item and we are holding the inode lock, the
* key must still exist. If it doesn't just emit warning
* and return an error to fall back to a transaction
* commit.
*/
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
if (WARN_ON(ret > 0))
return -ENOENT;
leaf = path->nodes[0];
}
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, extent) ==
BTRFS_FILE_EXTENT_INLINE)
return 0;
BTRFS_FILE_EXTENT_INLINE) {
len = btrfs_file_extent_ram_bytes(leaf, extent);
prev_extent_end = ALIGN(key.offset + len,
fs_info->sectorsize);
} else {
len = btrfs_file_extent_num_bytes(leaf, extent);
prev_extent_end = key.offset + len;
}
len = btrfs_file_extent_num_bytes(leaf, extent);
/* Last extent goes beyond i_size, no need to log a hole. */
if (key.offset + len > i_size)
return 0;
hole_start = key.offset + len;
hole_size = i_size - hole_start;
path->slots[0]++;
cond_resched();
}
btrfs_release_path(path);
/* Last extent ends at i_size. */
if (hole_size == 0)
return 0;
if (prev_extent_end < i_size) {
u64 hole_len;
hole_size = ALIGN(hole_size, fs_info->sectorsize);
ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
hole_size, 0, hole_size, 0, 0, 0);
return ret;
btrfs_release_path(path);
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
ret = btrfs_insert_file_extent(trans, root->log_root,
ino, prev_extent_end, 0, 0,
hole_len, 0, hole_len,
0, 0, 0);
if (ret < 0)
return ret;
}
return 0;
}
/*
@ -5011,6 +4822,50 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
}
continue;
}
/*
* If the inode was already logged skip it - otherwise we can
* hit an infinite loop. Example:
*
* From the commit root (previous transaction) we have the
* following inodes:
*
* inode 257 a directory
* inode 258 with references "zz" and "zz_link" on inode 257
* inode 259 with reference "a" on inode 257
*
* And in the current (uncommitted) transaction we have:
*
* inode 257 a directory, unchanged
* inode 258 with references "a" and "a2" on inode 257
* inode 259 with reference "zz_link" on inode 257
* inode 261 with reference "zz" on inode 257
*
* When logging inode 261 the following infinite loop could
* happen if we don't skip already logged inodes:
*
* - we detect inode 258 as a conflicting inode, with inode 261
* on reference "zz", and log it;
*
* - we detect inode 259 as a conflicting inode, with inode 258
* on reference "a", and log it;
*
* - we detect inode 258 as a conflicting inode, with inode 259
* on reference "zz_link", and log it - again! After this we
* repeat the above steps forever.
*/
spin_lock(&BTRFS_I(inode)->lock);
/*
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists and
* it has the full sync bit set (see btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_add_delayed_iput(inode);
continue;
}
spin_unlock(&BTRFS_I(inode)->lock);
/*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
@ -5110,7 +4965,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
u64 last_extent = 0;
int err = 0;
int ret;
int nritems;
@ -5288,7 +5142,7 @@ again:
ins_start_slot = path->slots[0];
}
ret = copy_items(trans, inode, dst_path, path,
&last_extent, ins_start_slot,
ins_start_slot,
ins_nr, inode_only,
logged_isize);
if (ret < 0) {
@ -5311,17 +5165,13 @@ again:
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
&last_extent, ins_start_slot,
ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ins_nr = 0;
if (ret) {
btrfs_release_path(path);
continue;
}
goto next_slot;
}
@ -5334,18 +5184,13 @@ again:
goto next_slot;
}
ret = copy_items(trans, inode, dst_path, path, &last_extent,
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
if (ret) {
ins_nr = 0;
btrfs_release_path(path);
continue;
}
ins_nr = 1;
ins_start_slot = path->slots[0];
next_slot:
@ -5359,13 +5204,12 @@ next_slot:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
&last_extent, ins_start_slot,
ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ret = 0;
ins_nr = 0;
}
btrfs_release_path(path);
@ -5380,14 +5224,13 @@ next_key:
}
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, &last_extent,
ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ret = 0;
ins_nr = 0;
}
@ -5400,7 +5243,7 @@ next_key:
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
err = btrfs_log_trailing_hole(trans, root, inode, path);
err = btrfs_log_holes(trans, root, inode, path);
if (err)
goto out_unlock;
}

View file

@ -30,6 +30,7 @@
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
@ -66,6 +67,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 2,
.devs_increment = 3,
.ncopies = 3,
.nparity = 0,
.raid_name = "raid1c3",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
@ -78,6 +80,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 3,
.devs_increment = 4,
.ncopies = 4,
.nparity = 0,
.raid_name = "raid1c4",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
@ -438,39 +441,6 @@ static noinline struct btrfs_fs_devices *find_fsid(
ASSERT(fsid);
if (metadata_fsid) {
/*
* Handle scanned device having completed its fsid change but
* belonging to a fs_devices that was created by first scanning
* a device which didn't have its fsid/metadata_uuid changed
* at all and the CHANGING_FSID_V2 flag set.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (fs_devices->fsid_change &&
memcmp(metadata_fsid, fs_devices->fsid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) == 0) {
return fs_devices;
}
}
/*
* Handle scanned device having completed its fsid change but
* belonging to a fs_devices that was created by a device that
* has an outdated pair of fsid/metadata_uuid and
* CHANGING_FSID_V2 flag set.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (fs_devices->fsid_change &&
memcmp(fs_devices->metadata_uuid,
fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
memcmp(metadata_fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) == 0) {
return fs_devices;
}
}
}
/* Handle non-split brain cases */
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (metadata_fsid) {
@ -486,6 +456,47 @@ static noinline struct btrfs_fs_devices *find_fsid(
return NULL;
}
/*
 * Look up the fs_devices matching a scanned super block that carries a
 * metadata_uuid.
 *
 * Two passes over fs_uuids look for an fs_devices with fsid_change set that
 * the scanned device belongs to; if neither pass finds one, the lookup falls
 * back to find_fsid() with the fsid and metadata_uuid of @disk_super.
 */
static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *cur;

	/*
	 * First pass: the scanned device has completed its fsid change, but
	 * the fs_devices was created by first scanning a device that did not
	 * have its fsid/metadata_uuid changed at all and had the
	 * CHANGING_FSID_V2 flag set. Such an fs_devices has identical fsid
	 * and metadata_uuid, and they equal the metadata_uuid on disk.
	 */
	list_for_each_entry(cur, &fs_uuids, fs_list) {
		if (!cur->fsid_change)
			continue;
		if (memcmp(disk_super->metadata_uuid, cur->fsid,
			   BTRFS_FSID_SIZE) != 0)
			continue;
		if (memcmp(cur->fsid, cur->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0)
			continue;
		return cur;
	}

	/*
	 * Second pass: the scanned device has completed its fsid change, but
	 * the fs_devices was created by a device holding an outdated pair of
	 * fsid/metadata_uuid with the CHANGING_FSID_V2 flag set. Here the
	 * fs_devices has differing fsid and metadata_uuid, and its
	 * metadata_uuid equals the one on disk.
	 */
	list_for_each_entry(cur, &fs_uuids, fs_list) {
		if (!cur->fsid_change)
			continue;
		if (memcmp(cur->metadata_uuid, cur->fsid,
			   BTRFS_FSID_SIZE) == 0)
			continue;
		if (memcmp(disk_super->metadata_uuid, cur->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0)
			continue;
		return cur;
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
int flush, struct block_device **bdev,
@ -669,7 +680,9 @@ error_brelse:
/*
* Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
* being created with a disk that has already completed its fsid change.
* being created with a disk that has already completed its fsid change. Such
* disk can belong to an fs which has its FSID changed or to one which doesn't.
* Handle both cases here.
*/
static struct btrfs_fs_devices *find_fsid_inprogress(
struct btrfs_super_block *disk_super)
@ -685,7 +698,7 @@ static struct btrfs_fs_devices *find_fsid_inprogress(
}
}
return NULL;
return find_fsid(disk_super->fsid, NULL);
}
@ -697,17 +710,54 @@ static struct btrfs_fs_devices *find_fsid_changed(
/*
* Handles the case where scanned device is part of an fs that had
* multiple successful changes of FSID but currently device didn't
* observe it. Meaning our fsid will be different than theirs.
* observe it. Meaning our fsid will be different than theirs. We need
* to handle two subcases :
* 1 - The fs still continues to have different METADATA/FSID uuids.
* 2 - The fs is switched back to its original FSID (METADATA/FSID
* are equal).
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
/* Changed UUIDs */
if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE) != 0) {
BTRFS_FSID_SIZE) != 0)
return fs_devices;
/* Unchanged UUIDs */
if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, disk_super->metadata_uuid,
BTRFS_FSID_SIZE) == 0)
return fs_devices;
}
return NULL;
}
static struct btrfs_fs_devices *find_fsid_reverted_metadata(
struct btrfs_super_block *disk_super)
{
struct btrfs_fs_devices *fs_devices;
/*
* Handle the case where the scanned device is part of an fs whose last
* metadata UUID change reverted it to the original FSID. At the same
* time fs_devices was first created by another constituent device
* which didn't fully observe the operation. This results in an
* btrfs_fs_devices created with metadata/fsid different AND
* btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
* fs_devices equal to the FSID of the disk.
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->fsid,
BTRFS_FSID_SIZE) == 0 &&
fs_devices->fsid_change)
return fs_devices;
}
}
return NULL;
@ -734,24 +784,16 @@ static noinline struct btrfs_device *device_list_add(const char *path,
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
if (fsid_change_in_progress) {
if (!has_metadata_uuid) {
/*
* When we have an image which has CHANGING_FSID_V2 set
* it might belong to either a filesystem which has
* disks with completed fsid change or it might belong
* to fs with no UUID changes in effect, handle both.
*/
if (!has_metadata_uuid)
fs_devices = find_fsid_inprogress(disk_super);
if (!fs_devices)
fs_devices = find_fsid(disk_super->fsid, NULL);
} else {
else
fs_devices = find_fsid_changed(disk_super);
}
} else if (has_metadata_uuid) {
fs_devices = find_fsid(disk_super->fsid,
disk_super->metadata_uuid);
fs_devices = find_fsid_with_metadata_uuid(disk_super);
} else {
fs_devices = find_fsid(disk_super->fsid, NULL);
fs_devices = find_fsid_reverted_metadata(disk_super);
if (!fs_devices)
fs_devices = find_fsid(disk_super->fsid, NULL);
}
@ -781,12 +823,18 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* a device which had the CHANGING_FSID_V2 flag then replace the
* metadata_uuid/fsid values of the fs_devices.
*/
if (has_metadata_uuid && fs_devices->fsid_change &&
if (fs_devices->fsid_change &&
found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
memcpy(fs_devices->metadata_uuid,
disk_super->metadata_uuid, BTRFS_FSID_SIZE);
if (has_metadata_uuid)
memcpy(fs_devices->metadata_uuid,
disk_super->metadata_uuid,
BTRFS_FSID_SIZE);
else
memcpy(fs_devices->metadata_uuid,
disk_super->fsid, BTRFS_FSID_SIZE);
fs_devices->fsid_change = false;
}
@ -1064,11 +1112,6 @@ static void btrfs_close_bdev(struct btrfs_device *device)
static void btrfs_close_one_device(struct btrfs_device *device)
{
struct btrfs_fs_devices *fs_devices = device->fs_devices;
struct btrfs_device *new_device;
struct rcu_string *name;
if (device->bdev)
fs_devices->open_devices--;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@ -1080,23 +1123,22 @@ static void btrfs_close_one_device(struct btrfs_device *device)
fs_devices->missing_devices--;
btrfs_close_bdev(device);
new_device = btrfs_alloc_device(NULL, &device->devid,
device->uuid);
BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
/* Safe because we are under uuid_mutex */
if (device->name) {
name = rcu_string_strdup(device->name->str, GFP_NOFS);
BUG_ON(!name); /* -ENOMEM */
rcu_assign_pointer(new_device->name, name);
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
list_replace_rcu(&device->dev_list, &new_device->dev_list);
new_device->fs_devices = device->fs_devices;
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
extent_io_tree_release(&device->alloc_state);
synchronize_rcu();
btrfs_free_device(device);
/* Verify the device is back in a pristine state */
ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
ASSERT(list_empty(&device->dev_alloc_list));
ASSERT(list_empty(&device->post_commit_list));
ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
@ -2130,7 +2172,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
WARN_ON(!tgtdev);
mutex_lock(&fs_devices->device_list_mutex);
btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
@ -2875,6 +2916,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *block_group;
int ret;
/*
@ -2898,6 +2940,12 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (ret)
return ret;
block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!block_group)
return -ENOENT;
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
btrfs_put_block_group(block_group);
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@ -6111,75 +6159,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
/*
 * Reverse-map a physical offset to the logical addresses that reference it
 * within the chunk starting at @chunk_start.
 *
 * For every stripe of the chunk whose range
 * [stripe physical, stripe physical + per-stripe length) contains @physical,
 * the address chunk_start + stripe_nr * rmap_len is computed and stored,
 * without duplicates, in a freshly allocated array.
 *
 * @fs_info:     filesystem the chunk map is looked up in
 * @chunk_start: logical start of the chunk
 * @physical:    physical offset to map back
 * @logical:     on success, set to the allocated array of logical addresses
 *               (allocated with kcalloc(); presumably the caller is expected
 *               to kfree() it - confirm against callers)
 * @naddrs:      on success, set to the number of valid entries in @logical
 * @stripe_len:  on success, set to the length used for the mapping
 *               (stripe_len, or stripe_len * nr_data_stripes for RAID5/6)
 *
 * Returns 0 on success, -EIO if the chunk map cannot be obtained, or -ENOMEM
 * if the result array cannot be allocated. The output parameters are left
 * untouched on failure.
 */
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 length;
	u64 stripe_nr;
	u64 rmap_len;
	int i, j, nr = 0;
	int ret = 0;

	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	length = em->len;
	rmap_len = map->stripe_len;

	/* Reduce the chunk length to the length covered by a single stripe. */
	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		length = div_u64(length, map->num_stripes / map->sub_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
		length = div_u64(length, map->num_stripes);
	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		length = div_u64(length, nr_data_stripes(map));
		rmap_len = map->stripe_len * nr_data_stripes(map);
	}

	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	if (!buf) {
		/*
		 * An allocation failure is a recoverable condition: report it
		 * to the caller instead of crashing, and drop the reference
		 * on the extent map taken above.
		 */
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		/* Skip stripes whose range does not contain @physical. */
		if (map->stripes[i].physical > physical ||
		    map->stripes[i].physical + length <= physical)
			continue;

		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64(stripe_nr, map->stripe_len);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		} /* else if RAID[56], multiply by nr_data_stripes().
		   * Alternatively, just use rmap_len below instead of
		   * map->stripe_len */

		bytenr = chunk_start + stripe_nr * rmap_len;
		WARN_ON(nr >= map->num_stripes);

		/* Only record each logical address once. */
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr)
				break;
		}
		if (j == nr) {
			WARN_ON(nr >= map->num_stripes);
			buf[nr++] = bytenr;
		}
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = rmap_len;

out:
	free_extent_map(em);
	return ret;
}
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
bio->bi_private = bbio->private;
@ -6480,19 +6459,14 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
int index = btrfs_bg_flags_to_raid_index(type);
int ncopies = btrfs_raid_array[index].ncopies;
const int nparity = btrfs_raid_array[index].nparity;
int data_stripes;
switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case BTRFS_BLOCK_GROUP_RAID5:
data_stripes = num_stripes - 1;
break;
case BTRFS_BLOCK_GROUP_RAID6:
data_stripes = num_stripes - 2;
break;
default:
if (nparity)
data_stripes = num_stripes - nparity;
else
data_stripes = num_stripes / ncopies;
break;
}
return div_u64(chunk_len, data_stripes);
}
@ -7331,6 +7305,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
else
btrfs_dev_stat_set(dev, i, 0);
}
btrfs_info(fs_info, "device stats zeroed by %s (%d)",
current->comm, task_pid_nr(current));
} else {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
if (stats->nr_items > i)

View file

@ -120,8 +120,6 @@ struct btrfs_device {
/* per-device scrub information */
struct scrub_ctx *scrub_ctx;
struct btrfs_work work;
/* readahead state */
atomic_t reada_in_flight;
u64 reada_next;
@ -138,6 +136,10 @@ struct btrfs_device {
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
struct extent_io_tree alloc_state;
struct completion kobj_unregister;
/* For sysfs/FSID/devinfo/devid/ */
struct kobject devid_kobj;
};
/*
@ -255,7 +257,7 @@ struct btrfs_fs_devices {
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
struct kobject fsid_kobj;
struct kobject *device_dir_kobj;
struct kobject *devices_kobj;
struct completion kobj_unregister;
};
@ -417,8 +419,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_bio **bbio_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);

View file

@ -456,6 +456,41 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen,
return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits);
}
/*
 * Advance *rs and *re to describe the next clear region of @bitmap.
 *
 * *rs is replaced by the result of find_next_zero_bit() searching from the
 * incoming *rs up to @end; *re is then set to the result of find_next_bit()
 * searching from one past the new *rs up to @end.
 */
static inline void bitmap_next_clear_region(unsigned long *bitmap,
					    unsigned int *rs, unsigned int *re,
					    unsigned int end)
{
	unsigned int region_start = find_next_zero_bit(bitmap, end, *rs);

	*rs = region_start;
	*re = find_next_bit(bitmap, end, region_start + 1);
}
/*
 * Advance *rs and *re to describe the next set region of @bitmap.
 *
 * *rs is replaced by the result of find_next_bit() searching from the
 * incoming *rs up to @end; *re is then set to the result of
 * find_next_zero_bit() searching from one past the new *rs up to @end.
 */
static inline void bitmap_next_set_region(unsigned long *bitmap,
					  unsigned int *rs, unsigned int *re,
					  unsigned int end)
{
	unsigned int region_start = find_next_bit(bitmap, end, *rs);

	*rs = region_start;
	*re = find_next_zero_bit(bitmap, end, region_start + 1);
}
/*
 * Bitmap region iterators. Iterates over the bitmap between [@start, @end).
 *
 * @rs and @re must be lvalues: their addresses are handed to
 * bitmap_next_clear_region() / bitmap_next_set_region(), which take
 * unsigned int pointers. On each iteration they are set to the start and
 * end index of the current clear or set region.
 *
 * Each macro expands to a for statement that seeds @rs with @start, runs
 * while @rs < @re, and resumes the search at @re + 1 after every pass.
 * The arguments are evaluated more than once, so they should be free of
 * side effects.
 *
 * NOTE(review): @re looks like an exclusive end index (callers compute
 * re - rs as a length) - confirm against find_next_bit() semantics.
 */
#define bitmap_for_each_clear_region(bitmap, rs, re, start, end) \
for ((rs) = (start), \
bitmap_next_clear_region((bitmap), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, \
bitmap_next_clear_region((bitmap), &(rs), &(re), (end)))
#define bitmap_for_each_set_region(bitmap, rs, re, start, end) \
for ((rs) = (start), \
bitmap_next_set_region((bitmap), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, \
bitmap_next_set_region((bitmap), &(rs), &(re), (end)))
/**
* BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
* @n: u64 value

View file

@ -496,9 +496,9 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent,
TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
__entry->ino = btrfs_ino(BTRFS_I(inode));
__entry->file_offset = ordered->file_offset;
__entry->start = ordered->start;
__entry->len = ordered->len;
__entry->disk_len = ordered->disk_len;
__entry->start = ordered->disk_bytenr;
__entry->len = ordered->num_bytes;
__entry->disk_len = ordered->disk_num_bytes;
__entry->bytes_left = ordered->bytes_left;
__entry->flags = ordered->flags;
__entry->compress_type = ordered->compress_type;

View file

@ -270,33 +270,6 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
pcpu_unit_page_offset(cpu, page_idx);
}
/*
 * Advance *rs and *re to the next run of zero bits in @bitmap.
 *
 * *rs becomes the result of find_next_zero_bit() searching from the incoming
 * *rs up to @end; *re becomes the result of find_next_bit() searching from
 * one past the new *rs up to @end.
 */
static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
{
	int region_start = find_next_zero_bit(bitmap, end, *rs);

	*rs = region_start;
	*re = find_next_bit(bitmap, end, region_start + 1);
}
/*
 * Advance *rs and *re to the next run of set bits in @bitmap.
 *
 * *rs becomes the result of find_next_bit() searching from the incoming *rs
 * up to @end; *re becomes the result of find_next_zero_bit() searching from
 * one past the new *rs up to @end.
 */
static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
{
	int region_start = find_next_bit(bitmap, end, *rs);

	*rs = region_start;
	*re = find_next_zero_bit(bitmap, end, region_start + 1);
}
/*
 * Bitmap region iterators. Iterates over @bitmap between [@start, @end).
 *
 * @rs and @re should be int variables (their addresses are passed to
 * pcpu_next_unpop() / pcpu_next_pop(), which take int pointers) and will
 * be set to the start and end index of the current region: a run of zero
 * bits for pcpu_for_each_unpop_region(), a run of set bits for
 * pcpu_for_each_pop_region().
 *
 * Each macro expands to a for statement that seeds @rs with @start, runs
 * while @rs < @re, and resumes the search at @re + 1 after every pass.
 * The arguments are evaluated more than once, so they should be free of
 * side effects.
 */
#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \
for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \
for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
/*
* The following are helper functions to help access bitmaps and convert
* between bitmap offsets to address offsets.
@ -732,9 +705,8 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
}
bits = 0;
pcpu_for_each_md_free_region(chunk, bit_off, bits) {
pcpu_for_each_md_free_region(chunk, bit_off, bits)
pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}
}
/**
@ -749,7 +721,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
struct pcpu_block_md *block = chunk->md_blocks + index;
unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
int rs, re, start; /* region start, region end */
unsigned int rs, re, start; /* region start, region end */
/* promote scan_hint to contig_hint */
if (block->scan_hint) {
@ -765,10 +737,9 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
block->right_free = 0;
/* iterate over free areas and update the contig hints */
pcpu_for_each_unpop_region(alloc_map, rs, re, start,
PCPU_BITMAP_BLOCK_BITS) {
bitmap_for_each_clear_region(alloc_map, rs, re, start,
PCPU_BITMAP_BLOCK_BITS)
pcpu_block_update(block, rs, re);
}
}
/**
@ -1041,13 +1012,13 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
int *next_off)
{
int page_start, page_end, rs, re;
unsigned int page_start, page_end, rs, re;
page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
rs = page_start;
pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
if (rs >= page_end)
return true;
@ -1702,13 +1673,13 @@ area_found:
/* populate if not all pages are already there */
if (!is_atomic) {
int page_start, page_end, rs, re;
unsigned int page_start, page_end, rs, re;
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
pcpu_for_each_unpop_region(chunk->populated, rs, re,
page_start, page_end) {
bitmap_for_each_clear_region(chunk->populated, rs, re,
page_start, page_end) {
WARN_ON(chunk->immutable);
ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
@ -1858,10 +1829,10 @@ static void pcpu_balance_workfn(struct work_struct *work)
spin_unlock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, &to_free, list) {
int rs, re;
unsigned int rs, re;
pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
chunk->nr_pages) {
bitmap_for_each_set_region(chunk->populated, rs, re, 0,
chunk->nr_pages) {
pcpu_depopulate_chunk(chunk, rs, re);
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, rs, re);
@ -1893,7 +1864,7 @@ retry_pop:
}
for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
int nr_unpop = 0, rs, re;
unsigned int nr_unpop = 0, rs, re;
if (!nr_to_pop)
break;
@ -1910,9 +1881,9 @@ retry_pop:
continue;
/* @chunk can't go away while pcpu_alloc_mutex is held */
pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
chunk->nr_pages) {
int nr = min(re - rs, nr_to_pop);
bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
chunk->nr_pages) {
int nr = min_t(int, re - rs, nr_to_pop);
ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
if (!ret) {