From d352ac68148b69937d39ca5d48bcc4478e118dbf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 29 Sep 2008 15:18:18 -0400 Subject: [PATCH] Btrfs: add and improve comments This improves the comments at the top of many functions. It didn't dive into the guts of functions because I was trying to avoid merging problems with the new allocator and back reference work. extent-tree.c and volumes.c were both skipped, and there is definitely more work todo in cleaning and commenting the code. Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 2 +- fs/btrfs/TODO | 20 ----- fs/btrfs/async-thread.c | 10 ++- fs/btrfs/async-thread.h | 7 +- fs/btrfs/bit-radix.c | 130 --------------------------- fs/btrfs/bit-radix.h | 33 ------- fs/btrfs/btrfs_inode.h | 54 +++++++++++- fs/btrfs/crc32c.h | 18 ++++ fs/btrfs/ctree.c | 127 ++++++++++++++++++++++++--- fs/btrfs/ctree.h | 1 - fs/btrfs/dir-item.c | 41 +++++++++ fs/btrfs/disk-io.c | 33 ++++++- fs/btrfs/extent_io.c | 34 +++++++- fs/btrfs/extent_map.c | 10 +++ fs/btrfs/file.c | 44 +++++++++- fs/btrfs/inode.c | 189 ++++++++++++++++++++++++++-------------- fs/btrfs/locking.c | 13 +++ fs/btrfs/ordered-data.c | 19 +++- fs/btrfs/ref-cache.c | 26 ++++++ fs/btrfs/ref-cache.h | 3 + fs/btrfs/root-tree.c | 21 ++++- fs/btrfs/struct-funcs.c | 21 +++++ fs/btrfs/super.c | 3 + fs/btrfs/transaction.c | 67 +++++++++++++- fs/btrfs/tree-defrag.c | 4 + 25 files changed, 653 insertions(+), 277 deletions(-) delete mode 100644 fs/btrfs/TODO delete mode 100644 fs/btrfs/bit-radix.c delete mode 100644 fs/btrfs/bit-radix.h diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index d5c28557fba9..48b7909ca8d1 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),) obj-m := btrfs.o btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ - transaction.o bit-radix.o inode.o file.o tree-defrag.o \ + transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO deleted file mode 100644 index d9b6d38c603a..000000000000 --- a/fs/btrfs/TODO +++ /dev/null @@ -1,20 +0,0 @@ -* cleanup, add more error checking, get rid of BUG_ONs -* Fix ENOSPC handling -* Make allocator smarter -* add a block group to struct inode -* Do actual block accounting -* Check compat and incompat flags on the inode -* Get rid of struct ctree_path, limiting tree levels held at one time -* Add generation number to key pointer in nodes -* Add generation number to inode -* forbid cross subvolume renames and hardlinks -* Release -* Do real tree locking -* Add extent mirroring (backup copies of blocks) -* Add fancy interface to get access to incremental backups -* Add fancy striped extents to make big reads faster -* Use relocation to try and fix write errors -* Make allocator much smarter -* xattrs (directory streams for regular files) -* Scrub & defrag - diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 4e780b279de6..04fb9702d14c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -231,17 +231,25 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) /* * if we pick a busy task, move the task to the end of the list. - * hopefully this will keep things somewhat evenly balanced + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. */ next = workers->worker_list.next; worker = list_entry(next, struct btrfs_worker_thread, worker_list); atomic_inc(&worker->num_pending); worker->sequence++; + if (worker->sequence % workers->idle_thresh == 0) list_move_tail(next, &workers->worker_list); return worker; } +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. + */ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 43e44d115dd1..4ec9a2ee0f9d 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -63,14 +63,17 @@ struct btrfs_workers { /* once a worker has this many requests or fewer, it is idle */ int idle_thresh; - /* list with all the work threads */ + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. + */ struct list_head worker_list; struct list_head idle_list; /* lock for finding the next worker thread to queue on */ spinlock_t lock; - /* extra name for this worker */ + /* extra name for this worker, used for current->name */ char *name; }; diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c deleted file mode 100644 index e8bf876db393..000000000000 --- a/fs/btrfs/bit-radix.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include "bit-radix.h" - -#define BIT_ARRAY_BYTES 256 -#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8) - -extern struct kmem_cache *btrfs_bit_radix_cachep; -int set_radix_bit(struct radix_tree_root *radix, unsigned long bit) -{ - unsigned long *bits; - unsigned long slot; - int bit_slot; - int ret; - - slot = bit / BIT_RADIX_BITS_PER_ARRAY; - bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; - - bits = radix_tree_lookup(radix, slot); - if (!bits) { - bits = kmem_cache_alloc(btrfs_bit_radix_cachep, GFP_NOFS); - if (!bits) - return -ENOMEM; - memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long)); - bits[0] = slot; - ret = radix_tree_insert(radix, slot, bits); - if (ret) - return ret; - } - ret = test_and_set_bit(bit_slot, bits + 1); - if (ret < 0) - ret = 1; - return ret; -} - -int test_radix_bit(struct radix_tree_root *radix, unsigned long bit) -{ - unsigned long *bits; - unsigned long slot; - int bit_slot; - - slot = bit / BIT_RADIX_BITS_PER_ARRAY; - bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; - - bits = radix_tree_lookup(radix, slot); - if (!bits) - return 0; - return test_bit(bit_slot, bits + 1); -} - -int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit) -{ - unsigned long *bits; - unsigned long slot; - int bit_slot; - int i; - int empty = 1; - - slot = bit / BIT_RADIX_BITS_PER_ARRAY; - bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY; - - bits = radix_tree_lookup(radix, slot); - if (!bits) - return 0; - clear_bit(bit_slot, bits + 1); - for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) { - if (bits[i]) { - empty = 0; - break; - } - } - if (empty) { - bits = radix_tree_delete(radix, slot); - BUG_ON(!bits); - kmem_cache_free(btrfs_bit_radix_cachep, bits); - } - return 0; -} - -int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits, - unsigned long start, int nr) -{ - unsigned long *bits; - unsigned long *gang[4]; - int found; - int ret; - int i; - int total_found = 0; - unsigned long slot; - - slot = start / BIT_RADIX_BITS_PER_ARRAY; - ret = radix_tree_gang_lookup(radix, (void **)gang, slot, - ARRAY_SIZE(gang)); - found = start % BIT_RADIX_BITS_PER_ARRAY; - for (i = 0; i < ret && nr > 0; i++) { - bits = gang[i]; - while(nr > 0) { - found = find_next_bit(bits + 1, - BIT_RADIX_BITS_PER_ARRAY, - found); - if (found < BIT_RADIX_BITS_PER_ARRAY) { - *retbits = bits[0] * - BIT_RADIX_BITS_PER_ARRAY + found; - retbits++; - nr--; - total_found++; - found++; - } else - break; - } - found = 0; - } - return total_found; -} diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h deleted file mode 100644 index c100f54d5c32..000000000000 --- a/fs/btrfs/bit-radix.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BIT_RADIX__ -#define __BIT_RADIX__ -#include - -int set_radix_bit(struct radix_tree_root *radix, unsigned long bit); -int test_radix_bit(struct radix_tree_root *radix, unsigned long bit); -int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit); -int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits, - unsigned long start, int nr); - -static inline void init_bit_radix(struct radix_tree_root *radix) -{ - INIT_RADIX_TREE(radix, GFP_NOFS); -} -#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0577fda2168a..0b2e623cf421 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -25,27 +25,58 @@ /* in memory btrfs inode */ struct btrfs_inode { + /* which subvolume this inode belongs to */ struct btrfs_root *root; + + /* the block group preferred for allocations. This pointer is buggy + * and needs to be replaced with a bytenr instead + */ struct btrfs_block_group_cache *block_group; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ struct extent_io_tree io_failure_tree; + + /* held while inserting checksums to avoid races */ struct mutex csum_mutex; + + /* held while inesrting or deleting extents from files */ struct mutex extent_mutex; + + /* held while logging the inode in tree-log.c */ struct mutex log_mutex; - struct inode vfs_inode; + + /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; + /* standard acl pointers */ struct posix_acl *i_acl; struct posix_acl *i_default_acl; /* for keeping track of orphaned inodes */ struct list_head i_orphan; + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ struct list_head delalloc_inodes; - /* full 64 bit generation number */ + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ u64 generation; /* @@ -57,10 +88,25 @@ struct btrfs_inode { */ u64 logged_trans; - /* trans that last made a change that should be fully fsync'd */ + /* + * trans that last made a change that should be fully fsync'd. This + * gets reset to zero each time the inode is logged + */ u64 log_dirty_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ u64 delalloc_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ u64 disk_i_size; + + /* flags field from the on disk inode */ u32 flags; /* @@ -68,6 +114,8 @@ struct btrfs_inode { * number for new files that are created */ u64 index_cnt; + + struct inode vfs_inode; }; static inline struct btrfs_inode *BTRFS_I(struct inode *inode) diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h index 4f0fefed132a..1eaf11d334fd 100644 --- a/fs/btrfs/crc32c.h +++ b/fs/btrfs/crc32c.h @@ -1,3 +1,21 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + #ifndef __BTRFS_CRC32C__ #define __BTRFS_CRC32C__ #include diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 50e81f43e6d4..ff3261ff2e19 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2007,2008 Oracle. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -54,12 +54,19 @@ struct btrfs_path *btrfs_alloc_path(void) return path; } +/* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { btrfs_release_path(NULL, p); kmem_cache_free(btrfs_path_cachep, p); } +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) { int i; @@ -77,6 +84,16 @@ void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) } } +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; @@ -87,6 +104,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) return eb; } +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; @@ -108,6 +129,10 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) return eb; } +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ static void add_root_to_dirty_list(struct btrfs_root *root) { if (root->track_dirty && list_empty(&root->dirty_list)) { @@ -116,6 +141,11 @@ static void add_root_to_dirty_list(struct btrfs_root *root) } } +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -167,6 +197,22 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } +/* + * does the dirty work in cow of a single block. The parent block + * (if supplied) is updated to point to the new cow copy. The new + * buffer is marked dirty and returned locked. If you modify the block + * it needs to be marked dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in bytes + * the allocator should try to find free next to the block it returns. This is + * just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one. btrfs_alloc_reserved_extent + * is used to finish the allocation. + */ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -311,6 +357,11 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } +/* + * cows a single block, see __btrfs_cow_block for the real work. + * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, @@ -347,6 +398,10 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ static int close_blocks(u64 blocknr, u64 other, u32 blocksize) { if (blocknr < other && other - (blocknr + blocksize) < 32768) @@ -381,6 +436,11 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) } +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, int cache_only, u64 *last_ret, @@ -521,6 +581,10 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root, return btrfs_item_offset_nr(leaf, nr - 1); } +/* + * extra debugging checks to make sure all the items in a key are + * well formed and in the proper order + */ static int check_node(struct btrfs_root *root, struct btrfs_path *path, int level) { @@ -561,6 +625,10 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path, return 0; } +/* + * extra checking to make sure all the items in a leaf are + * well formed and in the proper order + */ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, int level) { @@ -782,6 +850,10 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, return -1; } +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) { @@ -798,6 +870,11 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, btrfs_node_ptr_generation(parent, slot)); } +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. + */ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) @@ -1024,7 +1101,10 @@ enospc: return ret; } -/* returns zero if the push worked, non-zero otherwise */ +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. + */ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) @@ -1150,7 +1230,8 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans, } /* - * readahead one full node of leaves + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. */ static noinline void reada_for_search(struct btrfs_root *root, struct btrfs_path *path, @@ -1226,6 +1307,19 @@ static noinline void reada_for_search(struct btrfs_root *root, } } +/* + * when we walk down the tree, it is usually safe to unlock the higher layers in + * the tree. The exceptions are when our path goes through slot 0, because operations + * on the tree might require changing key pointers higher up in the tree. + * + * callers might also have set path->keep_locks, which tells this code to + * keep the lock if the path points to the last slot in the block. This is + * part of walking through the tree, and selecting the next slot in the higher + * block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. + * so if lowest_unlock is 1, level 0 won't be unlocked + */ static noinline void unlock_up(struct btrfs_path *path, int level, int lowest_unlock) { @@ -2705,6 +2799,12 @@ again: return ret; } +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */ int btrfs_truncate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2818,6 +2918,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, return ret; } +/* + * make the item pointed to by the path bigger, data_size is the new size. + */ int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u32 data_size) @@ -2897,7 +3000,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, } /* - * Given a key and some data, insert an item into the tree. + * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -3046,9 +3149,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root /* * delete the pointer from a given node. * - * If the delete empties a node, the node is removed from the tree, - * continuing all the way the root if required. The root is converted into - * a leaf if all the nodes are emptied. + * the tree should have been previously balanced so the deletion does not + * empty a node. */ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot) @@ -3233,6 +3335,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * search the tree again to find a leaf with lesser keys * returns 0 if it found something or 1 if there are no lesser leaves. * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. */ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { @@ -3265,9 +3370,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) /* * A helper function to walk down the tree starting at min_key, and looking * for nodes or leaves that are either in cache or have a minimum - * transaction id. This is used by the btree defrag code, but could - * also be used to search for blocks that have changed since a given - * transaction id. + * transaction id. This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the @@ -3279,6 +3382,10 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) * This honors path->lowest_level to prevent descent past a given level * of the tree. * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * * returns zero if something useful was found, < 0 on error and 1 if there * was nothing in the tree that matched the search criteria. */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0079b60b18f3..ded1643c0273 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -27,7 +27,6 @@ #include #include #include -#include "bit-radix.h" #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index e4f30090d640..5040b71f1900 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,6 +21,14 @@ #include "hash.h" #include "transaction.h" +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -55,6 +63,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle return (struct btrfs_dir_item *)ptr; } +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, u16 name_len, const void *data, u16 data_len, @@ -109,6 +121,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, return ret; } +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 dir, struct btrfs_key *location, u8 type, u64 index) @@ -184,6 +203,11 @@ out: return 0; } +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, @@ -222,6 +246,14 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -282,6 +314,11 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len) @@ -313,6 +350,10 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, return NULL; } +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. + */ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 45b4f7285275..5ee10d3136f5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,6 +55,11 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf) static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete. This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ struct end_io_wq { struct bio *bio; bio_end_io_t *end_io; @@ -66,6 +71,11 @@ struct end_io_wq { struct btrfs_work work; }; +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ struct async_submit_bio { struct inode *inode; struct bio *bio; @@ -76,6 +86,10 @@ struct async_submit_bio { struct btrfs_work work; }; +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ struct extent_map *btree_get_extent(struct inode *inode, struct page *page, size_t page_offset, u64 start, u64 len, int create) @@ -151,6 +165,10 @@ void btrfs_csum_final(u32 crc, char *result) *(__le32 *)result = ~cpu_to_le32(crc); } +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. + */ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { @@ -204,6 +222,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, return 0; } +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ static int verify_parent_transid(struct extent_io_tree *io_tree, struct extent_buffer *eb, u64 parent_transid) { @@ -228,9 +252,12 @@ out: unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); return ret; - } +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + */ static int btree_read_extent_buffer_pages(struct btrfs_root *root, struct extent_buffer *eb, u64 start, u64 parent_transid) @@ -260,6 +287,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror return -EIO; } +/* + * checksum a dirty tree block before IO. This has extra checks to make + * sure we only fill in the checksum field in the first page of a multi-page block + */ int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { struct extent_io_tree *tree; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8bd1b402f3fd..563b2d12f4f2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -914,6 +914,10 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(wait_on_extent_writeback); +/* + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. + */ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { int err; @@ -982,6 +986,13 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(set_range_writeback); +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, int bits) { @@ -1017,6 +1028,10 @@ out: } EXPORT_SYMBOL(find_first_extent_bit); +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' + */ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, int bits) { @@ -1046,8 +1061,14 @@ out: } EXPORT_SYMBOL(find_first_extent_bit_state); -u64 find_lock_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes) +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) { struct rb_node *node; struct extent_state *state; @@ -1130,6 +1151,11 @@ out: return found; } +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, unsigned long bits) @@ -1245,6 +1271,10 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) } EXPORT_SYMBOL(unlock_range); +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing. + */ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) { struct rb_node *node; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 78ced11d18c7..74b2a29880d3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -114,6 +114,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, return NULL; } +/* + * search through the tree for an extent_map with a given offset. If + * it can't be found, try to find some neighboring extents + */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node **prev_ret, struct rb_node **next_ret) @@ -160,6 +164,10 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } +/* + * look for an offset in the tree, and if it can't be found, return + * the first offset we can find smaller than 'offset'. + */ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) { struct rb_node *prev; @@ -170,6 +178,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) return ret; } +/* check to see if two extent_map structs are adjacent and safe to merge */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) @@ -250,6 +259,7 @@ out: } EXPORT_SYMBOL(add_extent_mapping); +/* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) { if (start + len < start) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1b7e51a9db0f..3088a1184483 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -41,6 +41,9 @@ #include "compat.h" +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, struct page **prepared_pages, @@ -72,12 +75,19 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages, return page_fault ? -EFAULT : 0; } +/* + * unlocks pages after btrfs_file_write is done with them + */ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages) { size_t i; for (i = 0; i < num_pages; i++) { if (!pages[i]) break; + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here + */ ClearPageChecked(pages[i]); unlock_page(pages[i]); mark_page_accessed(pages[i]); @@ -85,6 +95,10 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages) } } +/* this does all the hard work for inserting an inline extent into + * the btree. Any existing inline extent is extended as required to make room, + * otherwise things are inserted as required into the btree + */ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 offset, size_t size, @@ -228,6 +242,14 @@ fail: return err; } +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. + */ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct file *file, @@ -362,6 +384,10 @@ out_unlock: return err; } +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned) { @@ -536,6 +562,9 @@ out: * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. + * + * inline_limit is used to tell this code which offsets in the file to keep + * if they contain inline extents. */ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, @@ -796,7 +825,9 @@ out: } /* - * this gets pages into the page cache and locks them down + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified. */ static int noinline prepare_pages(struct btrfs_root *root, struct file *file, struct page **pages, size_t num_pages, @@ -1034,6 +1065,17 @@ int btrfs_release_file(struct inode * inode, struct file * filp) return 0; } +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 404704d26822..f3abecc2d14c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -83,6 +83,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { static void btrfs_truncate(struct inode *inode); +/* + * a very lame attempt at stopping writes when the FS is 85% full. There + * are countless ways this is incorrect, but it is better than nothing. + */ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, int for_del) { @@ -108,6 +112,12 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, return ret; } +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + */ static int cow_file_range(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -185,6 +195,13 @@ out: return ret; } +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. + * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) { u64 extent_start; @@ -291,6 +308,9 @@ out: return err; } +/* + * extent_io.c call back to do delayed allocation processing + */ static int run_delalloc_range(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -305,6 +325,11 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end) return ret; } +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. + */ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits) { @@ -323,6 +348,9 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, return 0; } +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits) { @@ -349,6 +377,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, return 0; } +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio) { @@ -371,6 +403,14 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, return 0; } +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num) { @@ -383,6 +423,10 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(root, rw, bio, mirror_num, 1); } +/* + * extent_io.c submission hook. This does the right thing for csum calculation on write, + * or reading the csums from the tree before a read + */ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int mirror_num) { @@ -408,6 +452,10 @@ mapit: return btrfs_map_bio(root, rw, bio, mirror_num, 0); } +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. + */ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_offset, struct list_head *list) @@ -430,12 +478,12 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) GFP_NOFS); } +/* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; struct btrfs_work work; }; -/* see btrfs_writepage_start_hook for details on why this is required */ void btrfs_writepage_fixup_worker(struct btrfs_work *work) { struct btrfs_writepage_fixup *fixup; @@ -522,6 +570,10 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) return -EAGAIN; } +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. + */ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -631,6 +683,14 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, return btrfs_finish_ordered_io(page->mapping->host, start, end); } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ struct io_failure_record { struct page *page; u64 start; @@ -725,6 +785,10 @@ int btrfs_io_failed_hook(struct bio *failed_bio, return 0; } +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ int btrfs_clean_io_failures(struct inode *inode, u64 start) { u64 private; @@ -753,6 +817,11 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start) return 0; } +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, we go through + * the io_failure_record routines to find good copies + */ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -990,6 +1059,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) btrfs_free_path(path); } +/* + * read an inode from the btree into the in-memory inode + */ void btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; @@ -1083,6 +1155,9 @@ make_bad: make_bad_inode(inode); } +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, @@ -1118,6 +1193,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, BTRFS_I(inode)->block_group->key.objectid); } +/* + * copy everything in the in-memory inode into the btree. + */ int noinline btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) @@ -1151,6 +1229,11 @@ failed: } +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, struct inode *inode, @@ -1309,7 +1392,7 @@ fail: /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find - * any higher than i_size. + * any higher than new_size * * csum items that cross the new i_size are truncated to the new size * as well. @@ -2123,6 +2206,11 @@ void btrfs_dirty_inode(struct inode *inode) btrfs_end_transaction(trans, root); } +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ static int btrfs_set_inode_index_count(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2175,6 +2263,10 @@ out: return ret; } +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ static int btrfs_set_inode_index(struct inode *dir, struct inode *inode, u64 *index) { @@ -2305,6 +2397,12 @@ static inline u8 btrfs_inode_type(struct inode *inode) return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; } +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct inode *parent_inode, struct inode *inode, const char *name, int name_len, int add_backref, u64 index) @@ -2611,6 +2709,10 @@ out_unlock: return err; } +/* helper for btfs_get_extent. Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree. + */ static int merge_extent_mapping(struct extent_map_tree *em_tree, struct extent_map *existing, struct extent_map *em, @@ -2627,6 +2729,14 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, return add_extent_mapping(em_tree, em); } +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the + * in-ram representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. + */ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) @@ -2869,76 +2979,11 @@ out: return em; } -#if 0 /* waiting for O_DIRECT reads */ -static int btrfs_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - struct extent_map *em; - u64 start = (u64)iblock << inode->i_blkbits; - struct btrfs_multi_bio *multi = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 len; - u64 logical; - u64 map_length; - int ret = 0; - - em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0); - - if (!em || IS_ERR(em)) - goto out; - - if (em->start > start || em->start + em->len <= start) { - goto out; - } - - if (em->block_start == EXTENT_MAP_INLINE) { - ret = -EINVAL; - goto out; - } - - len = em->start + em->len - start; - len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size))); - - if (em->block_start == EXTENT_MAP_HOLE || - em->block_start == EXTENT_MAP_DELALLOC) { - bh_result->b_size = len; - goto out; - } - - logical = start - em->start; - logical = em->block_start + logical; - - map_length = len; - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - logical, &map_length, &multi, 0); - BUG_ON(ret); - bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits; - bh_result->b_size = min(map_length, len); - - bh_result->b_bdev = multi->stripes[0].dev->bdev; - set_buffer_mapped(bh_result); - kfree(multi); -out: - free_extent_map(em); - return ret; -} -#endif - static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { return -EINVAL; -#if 0 - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - - if (rw == WRITE) - return -EINVAL; - - return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, btrfs_get_block, NULL); -#endif } static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) @@ -3202,6 +3247,9 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name, } } +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ int btrfs_create_subvol_root(struct btrfs_root *new_root, struct btrfs_trans_handle *trans, u64 new_dirid, struct btrfs_block_group_cache *block_group) @@ -3223,6 +3271,9 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root, return btrfs_update_inode(trans, new_root, inode); } +/* helper function for file defrag and space balancing. This + * forces readahead on a given range of bytes in an inode + */ unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index) @@ -3424,6 +3475,10 @@ out_unlock: return ret; } +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. + */ int btrfs_start_delalloc_inodes(struct btrfs_root *root) { struct list_head *head = &root->fs_info->delalloc_inodes; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0cc314c10d66..e30aa6e2958f 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -25,6 +25,15 @@ #include "extent_io.h" #include "locking.h" +/* + * locks the per buffer mutex in an extent buffer. This uses adaptive locks + * and the spin is not tuned very extensively. The spinning does make a big + * difference in almost every workload, but spinning for the right amount of + * time needs some help. + * + * In general, we want to spin as long as the lock holder is doing btree searches, + * and we should give up if they are in more expensive code. + */ int btrfs_tree_lock(struct extent_buffer *eb) { int i; @@ -57,6 +66,10 @@ int btrfs_tree_locked(struct extent_buffer *eb) return mutex_is_locked(&eb->mutex); } +/* + * btrfs_search_slot uses this to decide if it should drop its locks + * before doing something expensive like allocating free blocks for cow. + */ int btrfs_path_lock_waiting(struct btrfs_path *path, int level) { int i; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 951eacff2420..dcc1730dd837 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -26,7 +26,6 @@ #include "btrfs_inode.h" #include "extent_io.h" - static u64 entry_end(struct btrfs_ordered_extent *entry) { if (entry->file_offset + entry->len < entry->file_offset) @@ -34,6 +33,9 @@ static u64 entry_end(struct btrfs_ordered_extent *entry) return entry->file_offset + entry->len; } +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, struct rb_node *node) { @@ -58,6 +60,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, struct rb_node **prev_ret) { @@ -108,6 +114,9 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, return NULL; } +/* + * helper to check if a given offset is inside a given entry + */ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) { if (file_offset < entry->file_offset || @@ -116,6 +125,10 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) return 1; } +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, u64 file_offset) { @@ -305,6 +318,10 @@ int btrfs_remove_ordered_extent(struct inode *inode, return 0; } +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) { struct list_head splice; diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c index 30fcb7aea5b5..a50ebb67055d 100644 --- a/fs/btrfs/ref-cache.c +++ b/fs/btrfs/ref-cache.c @@ -21,6 +21,16 @@ #include "ref-cache.h" #include "transaction.h" +/* + * leaf refs are used to cache the information about which extents + * a given leaf has references on. This allows us to process that leaf + * in btrfs_drop_snapshot without needing to read it back from disk. + */ + +/* + * kmalloc a leaf reference struct and update the counters for the + * total ref cache size + */ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, int nr_extents) { @@ -40,6 +50,10 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, return ref; } +/* + * free a leaf reference struct and update the counters for the + * total ref cache size + */ void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) { if (!ref) @@ -135,6 +149,10 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, return 0; } +/* + * find the leaf ref for a given extent. This returns the ref struct with + * a usage reference incremented + */ struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, u64 bytenr) { @@ -160,6 +178,10 @@ again: return NULL; } +/* + * add a fully filled in leaf ref struct + * remove all the refs older than a given root generation + */ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, int shared) { @@ -184,6 +206,10 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, return ret; } +/* + * remove a single leaf ref from the tree. This drops the ref held by the tree + * only + */ int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) { struct btrfs_leaf_ref_tree *tree; diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h index 617564787f52..16f3183d7c59 100644 --- a/fs/btrfs/ref-cache.h +++ b/fs/btrfs/ref-cache.h @@ -19,8 +19,11 @@ #define __REFCACHE__ struct btrfs_extent_info { + /* bytenr and num_bytes find the extent in the extent allocation tree */ u64 bytenr; u64 num_bytes; + + /* objectid and offset find the back reference for the file */ u64 objectid; u64 offset; }; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 0091c01abb06..eb7f7655e9d5 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -22,8 +22,10 @@ #include "print-tree.h" /* - * returns 0 on finding something, 1 if no more roots are there - * and < 0 on error + * search forward for a root, starting with objectid 'search_start' + * if a root key is found, the objectid we find is filled into 'found_objectid' + * and 0 is returned. < 0 is returned on error, 1 if there is nothing + * left in the tree. */ int btrfs_search_root(struct btrfs_root *root, u64 search_start, u64 *found_objectid) @@ -66,6 +68,11 @@ out: return ret; } +/* + * lookup the root with the highest offset for a given objectid. The key we do + * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 + * on error. + */ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key) { @@ -104,6 +111,9 @@ out: return ret; } +/* + * copy the data in 'item' into the btree + */ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item) @@ -147,6 +157,12 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed. This is any root item with an offset + * lower than the latest root. They need to be queued for deletion to finish + * what was happening when we crashed. + */ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, struct btrfs_root *latest) { @@ -227,6 +243,7 @@ err: return ret; } +/* drop the root item for 'key' from 'root' */ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key) { diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index ad03a32d1116..cdedbe144d45 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -17,6 +17,27 @@ */ #include + +/* this is some deeply nasty code. ctree.h has a different + * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions + * + * This file declares the macros and then #includes ctree.h, which results + * in cpp creating the function here based on the template below. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. + * + * The extent buffer api is used to do all the kmapping and page + * spanning work required to get extent buffers in highmem and have + * a metadata blocksize different from the page size. + */ + #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ u##bits btrfs_##name(struct extent_buffer *eb, \ type *s) \ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8399d6d05d63..2e6039825b7b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -519,6 +519,9 @@ static struct file_system_type btrfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; +/* + * used by btrfsctl to scan devices when no FS is mounted + */ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 444abe0796ae..11266d68a6c9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -46,6 +46,9 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) } } +/* + * either allocate a new transaction or hop into the existing one + */ static noinline int join_transaction(struct btrfs_root *root) { struct btrfs_transaction *cur_trans; @@ -85,6 +88,12 @@ static noinline int join_transaction(struct btrfs_root *root) return 0; } +/* + * this does all the record keeping required to make sure that a + * reference counted root is properly recorded in a given transaction. + * This is required to make sure the old root from before we joined the transaction + * is deleted when the transaction commits + */ noinline int btrfs_record_root_in_trans(struct btrfs_root *root) { struct btrfs_dirty_root *dirty; @@ -127,6 +136,10 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root) return 0; } +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ static void wait_current_trans(struct btrfs_root *root) { struct btrfs_transaction *cur_trans; @@ -198,7 +211,7 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, return start_transaction(r, num_blocks, 2); } - +/* wait for a transaction commit to be fully complete */ static noinline int wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { @@ -218,6 +231,10 @@ static noinline int wait_for_commit(struct btrfs_root *root, return 0; } +/* + * rate limit against the drop_snapshot code. This helps to slow down new operations + * if the drop_snapshot code isn't able to keep up. + */ static void throttle_on_drops(struct btrfs_root *root) { struct btrfs_fs_info *info = root->fs_info; @@ -302,7 +319,11 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, return __btrfs_end_transaction(trans, root, 1); } - +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages) { @@ -393,6 +414,16 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, &trans->transaction->dirty_pages); } +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. + * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. + */ static int update_cowonly_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -418,6 +449,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return 0; } +/* + * update all the cowonly tree roots on disk + */ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -433,6 +467,11 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, return 0; } +/* + * dead roots are old snapshots that need to be deleted. This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) { struct btrfs_dirty_root *dirty; @@ -449,6 +488,12 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) return 0; } +/* + * at transaction commit time we need to schedule the old roots for + * deletion via btrfs_drop_snapshot. This runs through all the + * reference counted roots that were modified in the current + * transaction and puts them into the drop list + */ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, struct radix_tree_root *radix, struct list_head *list) @@ -541,6 +586,10 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, return err; } +/* + * defrag a given btree. If cacheonly == 1, this won't read from the disk, + * otherwise every leaf in the btree is read and defragged. + */ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) { struct btrfs_fs_info *info = root->fs_info; @@ -570,6 +619,10 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) return 0; } +/* + * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on + * all of them + */ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, struct list_head *list) { @@ -664,6 +717,10 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, return ret; } +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. This does the actual creation + */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_pending_snapshot *pending) @@ -734,6 +791,9 @@ fail: return ret; } +/* + * create all the snapshots we've scheduled for creation + */ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -944,6 +1004,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } +/* + * interface function to delete all the snapshots we have scheduled for deletion + */ int btrfs_clean_old_snapshots(struct btrfs_root *root) { struct list_head dirty_roots; diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b3bb5bbad76e..6f57d0889b1e 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -23,6 +23,10 @@ #include "transaction.h" #include "locking.h" +/* defrag all the leaves in a given btree. If cache_only == 1, don't read things + * from disk, otherwise read all the leaves and try to get key order to + * better reflect disk order + */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root, int cache_only) {