diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0c4fa2befae7..0bc4d3a10a5f 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o + reada.o backref.o ulist.o qgroup.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a383c18e74e8..7d80ddd8f544 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -773,9 +773,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist *refs, struct ulist *roots, - const u64 *extent_item_pos) + u64 time_seq, struct ulist *refs, + struct ulist *roots, const u64 *extent_item_pos) { struct btrfs_key key; struct btrfs_path *path; @@ -837,7 +836,7 @@ again: btrfs_put_delayed_ref(&head->node); goto again; } - ret = __add_delayed_refs(head, delayed_ref_seq, + ret = __add_delayed_refs(head, time_seq, &prefs_delayed); mutex_unlock(&head->mutex); if (ret) { @@ -981,8 +980,7 @@ static void free_leaf_list(struct ulist *blocks) */ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist **leafs, + u64 time_seq, struct ulist **leafs, const u64 *extent_item_pos) { struct ulist *tmp; @@ -997,7 +995,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; } - ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, *leafs, tmp, extent_item_pos); ulist_free(tmp); @@ -1024,8 +1022,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist **roots) + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1043,7 +1040,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, + ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, tmp, *roots, NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); @@ -1376,11 +1373,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list seq_elem = {}; struct seq_list tree_mod_seq_elem = {}; struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; - struct btrfs_delayed_ref_root *delayed_refs = NULL; pr_debug("resolving all inodes for extent %llu\n", extent_item_objectid); @@ -1391,16 +1386,11 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - btrfs_get_delayed_seq(delayed_refs, &seq_elem); - spin_unlock(&delayed_refs->lock); btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, tree_mod_seq_elem.seq, &refs, + tree_mod_seq_elem.seq, &refs, &extent_item_pos); if (ret) goto out; @@ -1408,8 +1398,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, - seq_elem.seq, - tree_mod_seq_elem.seq, &roots); + tree_mod_seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1431,7 +1420,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); - btrfs_put_delayed_seq(delayed_refs, &seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index c18d8ac7b795..3a1ad3e2dcb0 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -58,8 +58,7 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 delayed_ref_seq, u64 time_seq, - struct ulist **roots); + u64 time_seq, struct ulist **roots); struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 67fe46fdee6f..fb21431fe4e0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -321,7 +321,7 @@ struct tree_mod_root { struct tree_mod_elem { struct rb_node node; u64 index; /* shifted logical */ - struct seq_list elem; + u64 seq; enum mod_log_op op; /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ @@ -341,20 +341,50 @@ struct tree_mod_elem { struct tree_mod_root old_root; }; -static inline void -__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) +static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) { - elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); - list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + read_lock(&fs_info->tree_mod_log_lock); } -void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) +static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) { - elem->flags = 1; + read_unlock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) +{ + write_lock(&fs_info->tree_mod_log_lock); +} + +static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) +{ + write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. + */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem) +{ + u64 seq; + + tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); - __get_tree_mod_seq(fs_info, elem); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + } + seq = btrfs_inc_tree_mod_seq(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); + tree_mod_log_write_unlock(fs_info); + + return seq; } void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -371,41 +401,46 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - BUG_ON(!(elem->flags & 1)); spin_lock(&fs_info->tree_mod_seq_lock); list_del(&elem->list); + elem->seq = 0; list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { - if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { + if (cur_elem->seq < min_seq) { if (seq_putting > cur_elem->seq) { /* * blocker with lower sequence number exists, we * cannot remove anything from the log */ - goto out; + spin_unlock(&fs_info->tree_mod_seq_lock); + return; } min_seq = cur_elem->seq; } } + spin_unlock(&fs_info->tree_mod_seq_lock); + + /* + * we removed the lowest blocker from the blocker list, so there may be + * more processible delayed refs. + */ + wake_up(&fs_info->tree_mod_seq_wait); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - write_lock(&fs_info->tree_mod_log_lock); + tree_mod_log_write_lock(fs_info); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = container_of(node, struct tree_mod_elem, node); - if (tm->elem.seq > min_seq) + if (tm->seq > min_seq) continue; rb_erase(node, tm_root); - list_del(&tm->elem.list); kfree(tm); } - write_unlock(&fs_info->tree_mod_log_lock); -out: - spin_unlock(&fs_info->tree_mod_seq_lock); + tree_mod_log_write_unlock(fs_info); } /* @@ -423,11 +458,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; - int ret = 0; - BUG_ON(!tm || !tm->elem.seq); + BUG_ON(!tm || !tm->seq); - write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; while (*new) { @@ -437,88 +470,81 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) new = &((*new)->rb_left); else if (cur->index > tm->index) new = &((*new)->rb_right); - else if (cur->elem.seq < tm->elem.seq) + else if (cur->seq < tm->seq) new = &((*new)->rb_left); - else if (cur->elem.seq > tm->elem.seq) + else if (cur->seq > tm->seq) new = &((*new)->rb_right); else { kfree(tm); - ret = -EEXIST; - goto unlock; + return -EEXIST; } } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); -unlock: - write_unlock(&fs_info->tree_mod_log_lock); - return ret; + return 0; } +/* + * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it + * returns zero with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * call tree_mod_log_write_unlock() to release. + */ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { smp_mb(); if (list_empty(&(fs_info)->tree_mod_seq_list)) return 1; - if (!eb) - return 0; - if (btrfs_header_level(eb) == 0) + if (eb && btrfs_header_level(eb) == 0) return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&fs_info->tree_mod_seq_list)) { + /* + * someone emptied the list while we were waiting for the lock. + * we must not add to the list when no blocker exists. + */ + tree_mod_log_write_unlock(fs_info); + return 1; + } + return 0; } /* - * This allocates memory and gets a tree modification sequence number when - * needed. + * This allocates memory and gets a tree modification sequence number. * - * Returns 0 when no sequence number is needed, < 0 on error. - * Returns 1 when a sequence number was added. In this case, - * fs_info->tree_mod_seq_lock was acquired and must be released by the caller - * after inserting into the rb tree. + * Returns <0 on error. + * Returns >0 (the added sequence number) on success. */ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, struct tree_mod_elem **tm_ret) { struct tree_mod_elem *tm; - int seq; - if (tree_mod_dont_log(fs_info, NULL)) - return 0; - - tm = *tm_ret = kzalloc(sizeof(*tm), flags); + /* + * once we switch from spin locks to something different, we should + * honor the flags parameter here. + */ + tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC); if (!tm) return -ENOMEM; - tm->elem.flags = 0; - spin_lock(&fs_info->tree_mod_seq_lock); - if (list_empty(&fs_info->tree_mod_seq_list)) { - /* - * someone emptied the list while we were waiting for the lock. - * we must not add to the list, because no blocker exists. items - * are removed from the list only when the existing blocker is - * removed from the list. - */ - kfree(tm); - seq = 0; - spin_unlock(&fs_info->tree_mod_seq_lock); - } else { - __get_tree_mod_seq(fs_info, &tm->elem); - seq = tm->elem.seq; - } - - return seq; + tm->seq = btrfs_inc_tree_mod_seq(fs_info); + return tm->seq; } -static noinline int -tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +static inline int +__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { - struct tree_mod_elem *tm; int ret; + struct tree_mod_elem *tm; ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) + if (ret < 0) return ret; tm->index = eb->start >> PAGE_CACHE_SHIFT; @@ -530,8 +556,22 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); - ret = __tree_mod_log_insert(fs_info, tm); - spin_unlock(&fs_info->tree_mod_seq_lock); + return __tree_mod_log_insert(fs_info, tm); +} + +static noinline int +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) +{ + int ret; + + if (tree_mod_dont_log(fs_info, eb)) + return 0; + + ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + + tree_mod_log_write_unlock(fs_info); return ret; } @@ -542,6 +582,14 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); } +static noinline int +tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op) +{ + return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS); +} + static noinline int tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, @@ -555,14 +603,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, return 0; for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, + ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot, MOD_LOG_KEY_REMOVE_WHILE_MOVING); BUG_ON(ret < 0); } ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) - return ret; + if (ret < 0) + goto out; tm->index = eb->start >> PAGE_CACHE_SHIFT; tm->slot = src_slot; @@ -571,10 +619,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, tm->op = MOD_LOG_MOVE_KEYS; ret = __tree_mod_log_insert(fs_info, tm); - spin_unlock(&fs_info->tree_mod_seq_lock); +out: + tree_mod_log_write_unlock(fs_info); return ret; } +static inline void +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +{ + int i; + u32 nritems; + int ret; + + nritems = btrfs_header_nritems(eb); + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert_key_locked(fs_info, eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING); + BUG_ON(ret < 0); + } +} + static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, @@ -583,9 +647,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm; int ret; + if (tree_mod_dont_log(fs_info, NULL)) + return 0; + + __tree_mod_log_free_eb(fs_info, old_root); + ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret <= 0) - return ret; + if (ret < 0) + goto out; tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -594,7 +663,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->op = MOD_LOG_ROOT_REPLACE; ret = __tree_mod_log_insert(fs_info, tm); - spin_unlock(&fs_info->tree_mod_seq_lock); +out: + tree_mod_log_write_unlock(fs_info); return ret; } @@ -608,7 +678,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, struct tree_mod_elem *found = NULL; u64 index = start >> PAGE_CACHE_SHIFT; - read_lock(&fs_info->tree_mod_log_lock); + tree_mod_log_read_lock(fs_info); tm_root = &fs_info->tree_mod_log; node = tm_root->rb_node; while (node) { @@ -617,18 +687,18 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, node = node->rb_left; } else if (cur->index > index) { node = node->rb_right; - } else if (cur->elem.seq < min_seq) { + } else if (cur->seq < min_seq) { node = node->rb_left; } else if (!smallest) { /* we want the node with the highest seq */ if (found) - BUG_ON(found->elem.seq > cur->elem.seq); + BUG_ON(found->seq > cur->seq); found = cur; node = node->rb_left; - } else if (cur->elem.seq > min_seq) { + } else if (cur->seq > min_seq) { /* we want the node with the smallest seq */ if (found) - BUG_ON(found->elem.seq < cur->elem.seq); + BUG_ON(found->seq < cur->seq); found = cur; node = node->rb_right; } else { @@ -636,7 +706,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, break; } } - read_unlock(&fs_info->tree_mod_log_lock); + tree_mod_log_read_unlock(fs_info); return found; } @@ -664,7 +734,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static inline void +static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) @@ -675,18 +745,23 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, if (tree_mod_dont_log(fs_info, NULL)) return; - if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) { + tree_mod_log_write_unlock(fs_info); return; + } - /* speed this up by single seq for all operations? */ for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, - MOD_LOG_KEY_REMOVE); + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); - ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, - MOD_LOG_KEY_ADD); + ret = tree_mod_log_insert_key_locked(fs_info, dst, + i + dst_offset, + MOD_LOG_KEY_ADD); BUG_ON(ret < 0); } + + tree_mod_log_write_unlock(fs_info); } static inline void @@ -699,7 +774,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, BUG_ON(ret < 0); } -static inline void +static noinline void tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int slot, int atomic) @@ -712,30 +787,22 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, BUG_ON(ret < 0); } -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) +static noinline void +tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { - int i; - int ret; - u32 nritems; - if (tree_mod_dont_log(fs_info, eb)) return; - nritems = btrfs_header_nritems(eb); - for (i = nritems - 1; i >= 0; i--) { - ret = tree_mod_log_insert_key(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING); - BUG_ON(ret < 0); - } + __tree_mod_log_free_eb(fs_info, eb); + + tree_mod_log_write_unlock(fs_info); } -static inline void +static noinline void tree_mod_log_set_root_pointer(struct btrfs_root *root, struct extent_buffer *new_root_node) { int ret; - tree_mod_log_free_eb(root->fs_info, root->node); ret = tree_mod_log_insert_root(root->fs_info, root->node, new_root_node, GFP_NOFS); BUG_ON(ret < 0); @@ -1069,7 +1136,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); - while (tm && tm->elem.seq >= time_seq) { + while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for * the modification. as we're going backwards, we do the @@ -2721,6 +2788,78 @@ done: return ret; } +/* + * helper to use instead of search slot if no exact match is needed but + * instead the next or previous item should be returned. + * When find_higher is true, the next higher item is returned, the next lower + * otherwise. + * When return_any and find_higher are both true, and no higher item is found, + * return the next lower instead. + * When return_any is true and find_higher is false, and no lower item is found, + * return the next higher instead. + * It returns 0 if any item is found, 1 if none is found (tree empty), and + * < 0 on error + */ +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any) +{ + int ret; + struct extent_buffer *leaf; + +again: + ret = btrfs_search_slot(NULL, root, key, p, 0, 0); + if (ret <= 0) + return ret; + /* + * a return value of 1 means the path is at the position where the + * item should be inserted. Normally this is the next bigger item, + * but in case the previous item is the last in a leaf, path points + * to the first free slot in the previous leaf, i.e. at an invalid + * item. + */ + leaf = p->nodes[0]; + + if (find_higher) { + if (p->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, p); + if (ret <= 0) + return ret; + if (!return_any) + return 1; + /* + * no higher item found, return the next + * lower instead + */ + return_any = 0; + find_higher = 0; + btrfs_release_path(p); + goto again; + } + } else { + if (p->slots[0] >= btrfs_header_nritems(leaf)) { + /* we're sitting on an invalid slot */ + if (p->slots[0] == 0) { + ret = btrfs_prev_leaf(root, p); + if (ret <= 0) + return ret; + if (!return_any) + return 1; + /* + * no lower item found, return the next + * higher instead + */ + return_any = 0; + find_higher = 1; + btrfs_release_path(p); + goto again; + } + --p->slots[0]; + } + } + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a0ee2f8e0566..00f9a50f986d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -91,6 +91,9 @@ struct btrfs_ordered_sum; /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL +/* holds quota configuration and tracking */ +#define BTRFS_QUOTA_TREE_OBJECTID 8ULL + /* orhpan objectid for tracking unlinked/truncated files */ #define BTRFS_ORPHAN_OBJECTID -5ULL @@ -883,6 +886,72 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +/* + * is subvolume quota turned on? + */ +#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) +/* + * SCANNING is set during the initialization phase + */ +#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1) +/* + * Some qgroup entries are known to be out of date, + * either because the configuration has changed in a way that + * makes a rescan necessary, or because the fs has been mounted + * with a non-qgroup-aware version. + * Turning qouta off and on again makes it inconsistent, too. + */ +#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) + +#define BTRFS_QGROUP_STATUS_VERSION 1 + +struct btrfs_qgroup_status_item { + __le64 version; + /* + * the generation is updated during every commit. As older + * versions of btrfs are not aware of qgroups, it will be + * possible to detect inconsistencies by checking the + * generation on mount time + */ + __le64 generation; + + /* flag definitions see above */ + __le64 flags; + + /* + * only used during scanning to record the progress + * of the scan. It contains a logical address + */ + __le64 scan; +} __attribute__ ((__packed__)); + +struct btrfs_qgroup_info_item { + __le64 generation; + __le64 rfer; + __le64 rfer_cmpr; + __le64 excl; + __le64 excl_cmpr; +} __attribute__ ((__packed__)); + +/* flags definition for qgroup limits */ +#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) +#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) +#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) +#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) + +struct btrfs_qgroup_limit_item { + /* + * only updated when any of the other values change + */ + __le64 flags; + __le64 max_rfer; + __le64 max_excl; + __le64 rsv_rfer; + __le64 rsv_excl; +} __attribute__ ((__packed__)); + struct btrfs_space_info { u64 flags; @@ -1030,6 +1099,13 @@ struct btrfs_block_group_cache { struct list_head cluster_list; }; +/* delayed seq elem */ +struct seq_list { + struct list_head list; + u64 seq; +}; + +/* fs_info */ struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; @@ -1044,6 +1120,7 @@ struct btrfs_fs_info { struct btrfs_root *dev_root; struct btrfs_root *fs_root; struct btrfs_root *csum_root; + struct btrfs_root *quota_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1144,6 +1221,8 @@ struct btrfs_fs_info { spinlock_t tree_mod_seq_lock; atomic_t tree_mod_seq; struct list_head tree_mod_seq_list; + struct seq_list tree_mod_seq_elem; + wait_queue_head_t tree_mod_seq_wait; /* this protects tree_mod_log */ rwlock_t tree_mod_log_lock; @@ -1298,6 +1377,29 @@ struct btrfs_fs_info { #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; #endif + /* + * quota information + */ + unsigned int quota_enabled:1; + + /* + * quota_enabled only changes state after a commit. This holds the + * next state. + */ + unsigned int pending_quota_state:1; + + /* is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* holds configuration and tracking. Protected by qgroup_lock */ + struct rb_root qgroup_tree; + spinlock_t qgroup_lock; + + /* list of dirty qgroups to be written at next commit */ + struct list_head dirty_qgroups; + + /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + u64 qgroup_seq; /* filesystem state */ u64 fs_state; @@ -1527,6 +1629,30 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 +/* + * Records the overall state of the qgroups. + * There's only one instance of this key present, + * (0, BTRFS_QGROUP_STATUS_KEY, 0) + */ +#define BTRFS_QGROUP_STATUS_KEY 240 +/* + * Records the currently used space of the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). + */ +#define BTRFS_QGROUP_INFO_KEY 242 +/* + * Contains the user configured limits for the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). + */ +#define BTRFS_QGROUP_LIMIT_KEY 244 +/* + * Records the child-parent relationship of qgroups. For + * each relation, 2 keys are present: + * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) + * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) + */ +#define BTRFS_QGROUP_RELATION_KEY 246 + #define BTRFS_BALANCE_ITEM_KEY 248 /* @@ -2508,6 +2634,49 @@ static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item, + scan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); + static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; @@ -2703,6 +2872,8 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2753,6 +2924,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root ins_len, int cow); int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, u64 time_seq); +int btrfs_search_slot_for_read(struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, int cache_only, u64 *last_ret, @@ -2835,11 +3009,22 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->chunk_root); kfree(fs_info->dev_root); kfree(fs_info->csum_root); + kfree(fs_info->quota_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kfree(fs_info); } +/* tree mod log functions from ctree.c */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct seq_list *elem); +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +{ + return atomic_inc_return(&fs_info->tree_mod_seq); +} + /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, struct btrfs_path *path, @@ -3198,17 +3383,49 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); -/* delayed seq elem */ -struct seq_list { +/* qgroup.c */ +struct qgroup_update { struct list_head list; - u64 seq; - u32 flags; + struct btrfs_delayed_ref_node *node; + struct btrfs_delayed_extent_op *extent_op; }; -void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_rescan(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + char *name); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 13ae7b04790e..da7419ed01bb 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -233,22 +233,26 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, u64 seq) { struct seq_list *elem; + int ret = 0; - assert_spin_locked(&delayed_refs->lock); - if (list_empty(&delayed_refs->seq_head)) - return 0; - - elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); - if (seq >= elem->seq) { - pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", - seq, elem->seq, delayed_refs); - return 1; + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + if (seq >= elem->seq) { + pr_debug("holding back delayed_ref %llu, lowest is " + "%llu (%p)\n", seq, elem->seq, delayed_refs); + ret = 1; + } } - return 0; + + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, @@ -525,8 +529,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (is_fstree(ref_root)) - seq = inc_delayed_seq(delayed_refs); + if (need_ref_seq(for_cow, ref_root)) + seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -584,8 +588,8 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; - if (is_fstree(ref_root)) - seq = inc_delayed_seq(delayed_refs); + if (need_ref_seq(for_cow, ref_root)) + seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -658,10 +662,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, for_cow); - if (!is_fstree(ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&fs_info->tree_mod_seq_wait)) + wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); + if (need_ref_seq(for_cow, ref_root)) + btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -707,10 +713,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, action, for_cow); - if (!is_fstree(ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + if (!need_ref_seq(for_cow, ref_root) && + waitqueue_active(&fs_info->tree_mod_seq_wait)) + wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); + if (need_ref_seq(for_cow, ref_root)) + btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -736,8 +744,8 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, num_bytes, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + if (waitqueue_active(&fs_info->tree_mod_seq_wait)) + wake_up(&fs_info->tree_mod_seq_wait); spin_unlock(&delayed_refs->lock); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 413927fb9957..0d7c90c366b6 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -139,26 +139,6 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; - - /* - * seq number of delayed refs. We need to know if a backref was being - * added before the currently processed ref or afterwards. - */ - u64 seq; - - /* - * seq_list holds a list of all seq numbers that are currently being - * added to the list. While walking backrefs (btrfs_find_all_roots, - * qgroups), which might take some time, no newer ref must be processed, - * as it might influence the outcome of the walk. - */ - struct list_head seq_head; - - /* - * when the only refs we have in the list must not be processed, we want - * to wait for more refs to show up or for the end of backref walking. - */ - wait_queue_head_t seq_wait; }; static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) @@ -195,35 +175,29 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); -static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) -{ - assert_spin_locked(&delayed_refs->lock); - ++delayed_refs->seq; - return delayed_refs->seq; -} - -static inline void -btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) -{ - assert_spin_locked(&delayed_refs->lock); - elem->seq = delayed_refs->seq; - list_add_tail(&elem->list, &delayed_refs->seq_head); -} - -static inline void -btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) -{ - spin_lock(&delayed_refs->lock); - list_del(&elem->list); - wake_up(&delayed_refs->seq_wait); - spin_unlock(&delayed_refs->lock); -} - -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, +int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, u64 seq); +/* + * delayed refs with a ref_seq > 0 must be held back during backref walking. + * this only applies to items in one of the fs-trees. for_cow items never need + * to be held back, so they won't get a ref_seq number. + */ +static inline int need_ref_seq(int for_cow, u64 rootid) +{ + if (for_cow) + return 0; + + if (rootid == BTRFS_FS_TREE_OBJECTID) + return 1; + + if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + return 1; + + return 0; +} + /* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1a4a2a975926..05f4fb6e0607 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1225,6 +1225,82 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) return root; } +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid) +{ + struct extent_buffer *leaf; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root; + struct btrfs_key key; + int ret = 0; + u64 bytenr; + + root = btrfs_alloc_root(fs_info); + if (!root) + return ERR_PTR(-ENOMEM); + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + root->root_key.objectid = objectid; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = 0; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + bytenr = leaf->start; + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, objectid); + root->node = leaf; + + write_extent_buffer(leaf, fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + write_extent_buffer(leaf, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + root->commit_root = btrfs_root_node(root); + root->track_dirty = 1; + + + root->root_item.flags = 0; + root->root_item.byte_limit = 0; + btrfs_set_root_bytenr(&root->root_item, leaf->start); + btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_set_root_level(&root->root_item, 0); + btrfs_set_root_refs(&root->root_item, 1); + btrfs_set_root_used(&root->root_item, leaf->len); + btrfs_set_root_last_snapshot(&root->root_item, 0); + btrfs_set_root_dirid(&root->root_item, 0); + root->root_item.drop_level = 0; + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); + if (ret) + goto fail; + + btrfs_tree_unlock(leaf); + +fail: + if (ret) + return ERR_PTR(ret); + + return root; +} + static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1396,6 +1472,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->dev_root; if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) return fs_info->csum_root; + if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) + return fs_info->quota_root ? fs_info->quota_root : + ERR_PTR(-ENOENT); again: spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, @@ -1823,6 +1902,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_extent_buffer(info->extent_root->commit_root); free_extent_buffer(info->csum_root->node); free_extent_buffer(info->csum_root->commit_root); + if (info->quota_root) { + free_extent_buffer(info->quota_root->node); + free_extent_buffer(info->quota_root->commit_root); + } info->tree_root->node = NULL; info->tree_root->commit_root = NULL; @@ -1832,6 +1915,10 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) info->extent_root->commit_root = NULL; info->csum_root->node = NULL; info->csum_root->commit_root = NULL; + if (info->quota_root) { + info->quota_root->node = NULL; + info->quota_root->commit_root = NULL; + } if (chunk_root) { free_extent_buffer(info->chunk_root->node); @@ -1862,6 +1949,7 @@ int open_ctree(struct super_block *sb, struct btrfs_root *csum_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; + struct btrfs_root *quota_root; struct btrfs_root *log_tree_root; int ret; int err = -EINVAL; @@ -1873,9 +1961,10 @@ int open_ctree(struct super_block *sb, csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); + quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info); if (!tree_root || !extent_root || !csum_root || - !chunk_root || !dev_root) { + !chunk_root || !dev_root || !quota_root) { err = -ENOMEM; goto fail; } @@ -1944,6 +2033,8 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; + init_waitqueue_head(&fs_info->tree_mod_seq_wait); + /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); @@ -2032,6 +2123,13 @@ int open_ctree(struct super_block *sb, init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); + spin_lock_init(&fs_info->qgroup_lock); + fs_info->qgroup_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -2356,6 +2454,17 @@ retry_root_backup: goto recovery_tree_root; csum_root->track_dirty = 1; + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_QUOTA_TREE_OBJECTID, quota_root); + if (ret) { + kfree(quota_root); + quota_root = fs_info->quota_root = NULL; + } else { + quota_root->track_dirty = 1; + fs_info->quota_enabled = 1; + fs_info->pending_quota_state = 1; + } + fs_info->generation = generation; fs_info->last_trans_committed = generation; @@ -2415,6 +2524,9 @@ retry_root_backup: " integrity check module %s\n", sb->s_id); } #endif + ret = btrfs_read_qgroup_config(fs_info); + if (ret) + goto fail_trans_kthread; /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0 && @@ -2425,7 +2537,7 @@ retry_root_backup: printk(KERN_WARNING "Btrfs log replay required " "on RO media\n"); err = -EIO; - goto fail_trans_kthread; + goto fail_qgroup; } blocksize = btrfs_level_size(tree_root, @@ -2434,7 +2546,7 @@ retry_root_backup: log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { err = -ENOMEM; - goto fail_trans_kthread; + goto fail_qgroup; } __setup_root(nodesize, leafsize, sectorsize, stripesize, @@ -2474,7 +2586,7 @@ retry_root_backup: printk(KERN_WARNING "btrfs: failed to recover relocation\n"); err = -EINVAL; - goto fail_trans_kthread; + goto fail_qgroup; } } @@ -2484,10 +2596,10 @@ retry_root_backup: fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (!fs_info->fs_root) - goto fail_trans_kthread; + goto fail_qgroup; if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); - goto fail_trans_kthread; + goto fail_qgroup; } if (sb->s_flags & MS_RDONLY) @@ -2511,6 +2623,8 @@ retry_root_backup: return 0; +fail_qgroup: + btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); fail_cleaner: @@ -3109,6 +3223,8 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 2; smp_mb(); + btrfs_free_qgroup_config(root->fs_info); + if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); @@ -3128,6 +3244,10 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(fs_info->dev_root->commit_root); free_extent_buffer(fs_info->csum_root->node); free_extent_buffer(fs_info->csum_root->commit_root); + if (fs_info->quota_root) { + free_extent_buffer(fs_info->quota_root->node); + free_extent_buffer(fs_info->quota_root->commit_root); + } btrfs_free_block_groups(fs_info); @@ -3258,7 +3378,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); } -static int btree_lock_page_hook(struct page *page, void *data, +int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)) { struct inode *inode = page->mapping->host; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 05b3fab39f7e..95e147eea239 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -89,6 +89,12 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, int btrfs_cleanup_transaction(struct btrfs_root *root); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_root *root); +void btrfs_abort_devices(struct btrfs_root *root); +struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 objectid); +int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71b2d1c7da69..44f06201f376 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,6 +34,8 @@ #include "locking.h" #include "free-space-cache.h" +#undef SCRAMBLE_DELAYED_REFS + /* * control flags for do_chunk_alloc's force field * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk @@ -2217,6 +2219,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; + struct btrfs_fs_info *fs_info = root->fs_info; int ret; int count = 0; int must_insert_reserved = 0; @@ -2255,7 +2258,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref = select_delayed_ref(locked_ref); if (ref && ref->seq && - btrfs_check_delayed_seq(delayed_refs, ref->seq)) { + btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { /* * there are still refs with lower seq numbers in the * process of being added. Don't run this ref yet. @@ -2337,7 +2340,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } next: - do_chunk_alloc(trans, root->fs_info->extent_root, + do_chunk_alloc(trans, fs_info->extent_root, 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); @@ -2347,21 +2350,99 @@ next: return count; } -static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, +static void wait_for_more_refs(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, unsigned long num_refs, struct list_head *first_seq) { spin_unlock(&delayed_refs->lock); pr_debug("waiting for more refs (num %ld, first %p)\n", num_refs, first_seq); - wait_event(delayed_refs->seq_wait, + wait_event(fs_info->tree_mod_seq_wait, num_refs != delayed_refs->num_entries || - delayed_refs->seq_head.next != first_seq); + fs_info->tree_mod_seq_list.next != first_seq); pr_debug("done waiting for more refs (num %ld, first %p)\n", - delayed_refs->num_entries, delayed_refs->seq_head.next); + delayed_refs->num_entries, fs_info->tree_mod_seq_list.next); spin_lock(&delayed_refs->lock); } +#ifdef SCRAMBLE_DELAYED_REFS +/* + * Normally delayed refs get processed in ascending bytenr order. This + * correlates in most cases to the order added. To expose dependencies on this + * order, we start to process the tree in the middle instead of the beginning + */ +static u64 find_middle(struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + struct btrfs_delayed_ref_node *entry; + int alt = 1; + u64 middle; + u64 first = 0, last = 0; + + n = rb_first(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + first = entry->bytenr; + } + n = rb_last(root); + if (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + last = entry->bytenr; + } + n = root->rb_node; + + while (n) { + entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); + WARN_ON(!entry->in_tree); + + middle = entry->bytenr; + + if (alt) + n = n->rb_left; + else + n = n->rb_right; + + alt = 1 - alt; + } + return middle; +} +#endif + +int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct qgroup_update *qgroup_update; + int ret = 0; + + if (list_empty(&trans->qgroup_ref_list) != + !trans->delayed_ref_elem.seq) { + /* list without seq or seq without list */ + printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n", + list_empty(&trans->qgroup_ref_list) ? "" : " not", + trans->delayed_ref_elem.seq); + BUG(); + } + + if (!trans->delayed_ref_elem.seq) + return 0; + + while (!list_empty(&trans->qgroup_ref_list)) { + qgroup_update = list_first_entry(&trans->qgroup_ref_list, + struct qgroup_update, list); + list_del(&qgroup_update->list); + if (!ret) + ret = btrfs_qgroup_account_ref( + trans, fs_info, qgroup_update->node, + qgroup_update->extent_op); + kfree(qgroup_update); + } + + btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + + return ret; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2398,11 +2479,18 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); again: consider_waiting = 0; spin_lock(&delayed_refs->lock); + +#ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); +#endif + if (count == 0) { count = delayed_refs->num_entries * 2; run_most = 1; @@ -2437,7 +2525,7 @@ again: num_refs = delayed_refs->num_entries; first_seq = root->fs_info->tree_mod_seq_list.next; } else { - wait_for_more_refs(delayed_refs, + wait_for_more_refs(root->fs_info, delayed_refs, num_refs, first_seq); /* * after waiting, things have changed. we @@ -2502,6 +2590,7 @@ again: } out: spin_unlock(&delayed_refs->lock); + assert_qgroups_uptodate(trans); return 0; } @@ -4479,6 +4568,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, num_bytes + + nr_extents * root->leafsize); + if (ret) + return ret; + } + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; @@ -4557,6 +4653,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); + if (root->fs_info->quota_enabled) { + btrfs_qgroup_free(root, num_bytes + + dropped * root->leafsize); + } + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); } @@ -5193,8 +5294,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, rb_erase(&head->node.rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); + if (waitqueue_active(&root->fs_info->tree_mod_seq_wait)) + wake_up(&root->fs_info->tree_mod_seq_wait); /* * we don't take a ref on the node because we're removing it from the diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 17facea6a51c..e54b663fd3aa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -336,7 +336,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, - u64 *async_transid) + u64 *async_transid, + struct btrfs_qgroup_inherit **inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; @@ -368,6 +369,11 @@ static noinline int create_subvol(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); + ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, + inherit ? *inherit : NULL); + if (ret) + goto fail; + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { @@ -484,7 +490,7 @@ fail: static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - bool readonly) + bool readonly, struct btrfs_qgroup_inherit **inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -502,6 +508,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; + if (inherit) { + pending_snapshot->inherit = *inherit; + *inherit = NULL; /* take responsibility to free it */ + } trans = btrfs_start_transaction(root->fs_info->extent_root, 5); if (IS_ERR(trans)) { @@ -635,7 +645,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid, bool readonly) + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit **inherit) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -662,11 +673,11 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, - name, namelen, async_transid, readonly); + error = create_snapshot(snap_src, dentry, name, namelen, + async_transid, readonly, inherit); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen, async_transid); + name, namelen, async_transid, inherit); } if (!error) fsnotify_mkdir(dir, dentry); @@ -1375,11 +1386,9 @@ out: } static noinline int btrfs_ioctl_snap_create_transid(struct file *file, - char *name, - unsigned long fd, - int subvol, - u64 *transid, - bool readonly) + char *name, unsigned long fd, int subvol, + u64 *transid, bool readonly, + struct btrfs_qgroup_inherit **inherit) { struct file *src_file; int namelen; @@ -1403,7 +1412,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid, readonly); + NULL, transid, readonly, inherit); } else { struct inode *src_inode; src_file = fget(fd); @@ -1422,7 +1431,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid, readonly); + transid, readonly, inherit); fput(src_file); } out_drop_write: @@ -1444,7 +1453,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, vol_args->fd, subvol, - NULL, false); + NULL, false, NULL); kfree(vol_args); return ret; @@ -1458,6 +1467,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, u64 transid = 0; u64 *ptr = NULL; bool readonly = false; + struct btrfs_qgroup_inherit *inherit = NULL; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) @@ -1465,7 +1475,8 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; if (vol_args->flags & - ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | + BTRFS_SUBVOL_QGROUP_INHERIT)) { ret = -EOPNOTSUPP; goto out; } @@ -1474,10 +1485,21 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ptr = &transid; if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; + if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { + if (vol_args->size > PAGE_CACHE_SIZE) { + ret = -EINVAL; + goto out; + } + inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); + if (IS_ERR(inherit)) { + ret = PTR_ERR(inherit); + goto out; + } + } ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, - ptr, readonly); + vol_args->fd, subvol, ptr, + readonly, &inherit); if (ret == 0 && ptr && copy_to_user(arg + @@ -1486,6 +1508,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = -EFAULT; out: kfree(vol_args); + kfree(inherit); return ret; } @@ -3401,6 +3424,183 @@ out: return ret; } +static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_quota_ctl_args *sa; + struct btrfs_trans_handle *trans = NULL; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + } + + switch (sa->cmd) { + case BTRFS_QUOTA_CTL_ENABLE: + ret = btrfs_quota_enable(trans, root->fs_info); + break; + case BTRFS_QUOTA_CTL_DISABLE: + ret = btrfs_quota_disable(trans, root->fs_info); + break; + case BTRFS_QUOTA_CTL_RESCAN: + ret = btrfs_quota_rescan(root->fs_info); + break; + default: + ret = -EINVAL; + break; + } + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + if (trans) { + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + } + +out: + kfree(sa); + return ret; +} + +static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_qgroup_assign_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* FIXME: check if the IDs really exist */ + if (sa->assign) { + ret = btrfs_add_qgroup_relation(trans, root->fs_info, + sa->src, sa->dst); + } else { + ret = btrfs_del_qgroup_relation(trans, root->fs_info, + sa->src, sa->dst); + } + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); + return ret; +} + +static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_qgroup_create_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* FIXME: check if the IDs really exist */ + if (sa->create) { + ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid, + NULL); + } else { + ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); + } + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); + return ret; +} + +static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_qgroup_limit_args *sa; + struct btrfs_trans_handle *trans; + int ret; + int err; + u64 qgroupid; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + qgroupid = sa->qgroupid; + if (!qgroupid) { + /* take the current subvol as qgroup */ + qgroupid = root->root_key.objectid; + } + + /* FIXME: check if the IDs really exist */ + ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim); + + err = btrfs_end_transaction(trans, root); + if (err && !ret) + ret = err; + +out: + kfree(sa); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3422,6 +3622,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SUBVOL_CREATE_V2: + return btrfs_ioctl_snap_create_v2(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); case BTRFS_IOC_SUBVOL_GETFLAGS: @@ -3485,6 +3687,14 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance_progress(root, argp); case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(root, argp); + case BTRFS_IOC_QUOTA_CTL: + return btrfs_ioctl_quota_ctl(root, argp); + case BTRFS_IOC_QGROUP_ASSIGN: + return btrfs_ioctl_qgroup_assign(root, argp); + case BTRFS_IOC_QGROUP_CREATE: + return btrfs_ioctl_qgroup_create(root, argp); + case BTRFS_IOC_QGROUP_LIMIT: + return btrfs_ioctl_qgroup_limit(root, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 4e3e5d342a2b..3f9701d571ea 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -32,15 +32,46 @@ struct btrfs_ioctl_vol_args { #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) #define BTRFS_SUBVOL_RDONLY (1ULL << 1) +#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) #define BTRFS_FSID_SIZE 16 #define BTRFS_UUID_SIZE 16 +#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) + +struct btrfs_qgroup_limit { + __u64 flags; + __u64 max_rfer; + __u64 max_excl; + __u64 rsv_rfer; + __u64 rsv_excl; +}; + +struct btrfs_qgroup_inherit { + __u64 flags; + __u64 num_qgroups; + __u64 num_ref_copies; + __u64 num_excl_copies; + struct btrfs_qgroup_limit lim; + __u64 qgroups[0]; +}; + +struct btrfs_ioctl_qgroup_limit_args { + __u64 qgroupid; + struct btrfs_qgroup_limit lim; +}; + #define BTRFS_SUBVOL_NAME_MAX 4039 struct btrfs_ioctl_vol_args_v2 { __s64 fd; __u64 transid; __u64 flags; - __u64 unused[4]; + union { + struct { + __u64 size; + struct btrfs_qgroup_inherit __user *qgroup_inherit; + }; + __u64 unused[4]; + }; char name[BTRFS_SUBVOL_NAME_MAX + 1]; }; @@ -299,6 +330,25 @@ struct btrfs_ioctl_get_dev_stats { __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ }; +#define BTRFS_QUOTA_CTL_ENABLE 1 +#define BTRFS_QUOTA_CTL_DISABLE 2 +#define BTRFS_QUOTA_CTL_RESCAN 3 +struct btrfs_ioctl_quota_ctl_args { + __u64 cmd; + __u64 status; +}; + +struct btrfs_ioctl_qgroup_assign_args { + __u64 assign; + __u64 src; + __u64 dst; +}; + +struct btrfs_ioctl_qgroup_create_args { + __u64 create; + __u64 qgroupid; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -343,6 +393,8 @@ struct btrfs_ioctl_get_dev_stats { #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \ + struct btrfs_ioctl_vol_args_v2) #define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ @@ -365,6 +417,14 @@ struct btrfs_ioctl_get_dev_stats { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \ + struct btrfs_ioctl_quota_ctl_args) +#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \ + struct btrfs_ioctl_qgroup_assign_args) +#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \ + struct btrfs_ioctl_qgroup_create_args) +#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ + struct btrfs_ioctl_qgroup_limit_args) #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c new file mode 100644 index 000000000000..bc424ae5a81a --- /dev/null +++ b/fs/btrfs/qgroup.c @@ -0,0 +1,1571 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "ulist.h" +#include "ioctl.h" +#include "backref.h" + +/* TODO XXX FIXME + * - subvol delete -> delete when ref goes to 0? delete limits also? + * - reorganize keys + * - compressed + * - sync + * - rescan + * - copy also limits on subvol creation + * - limit + * - caches fuer ulists + * - performance benchmarks + * - check all ioctl parameters + */ + +/* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + u64 reserved; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + */ + u64 tag; + u64 refcnt; +}; + +/* + * glue structure to represent the relations between qgroups. + */ +struct btrfs_qgroup_list { + struct list_head next_group; + struct list_head next_member; + struct btrfs_qgroup *group; + struct btrfs_qgroup *member; +}; + +/* must be called with qgroup_lock held */ +static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node *n = fs_info->qgroup_tree.rb_node; + struct btrfs_qgroup *qgroup; + + while (n) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + if (qgroup->qgroupid < qgroupid) + n = n->rb_left; + else if (qgroup->qgroupid > qgroupid) + n = n->rb_right; + else + return qgroup; + } + return NULL; +} + +/* must be called with qgroup_lock held */ +static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, + u64 qgroupid) +{ + struct rb_node **p = &fs_info->qgroup_tree.rb_node; + struct rb_node *parent = NULL; + struct btrfs_qgroup *qgroup; + + while (*p) { + parent = *p; + qgroup = rb_entry(parent, struct btrfs_qgroup, node); + + if (qgroup->qgroupid < qgroupid) + p = &(*p)->rb_left; + else if (qgroup->qgroupid > qgroupid) + p = &(*p)->rb_right; + else + return qgroup; + } + + qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); + if (!qgroup) + return ERR_PTR(-ENOMEM); + + qgroup->qgroupid = qgroupid; + INIT_LIST_HEAD(&qgroup->groups); + INIT_LIST_HEAD(&qgroup->members); + INIT_LIST_HEAD(&qgroup->dirty); + + rb_link_node(&qgroup->node, parent, p); + rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); + + return qgroup; +} + +/* must be called with qgroup_lock held */ +static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); + struct btrfs_qgroup_list *list; + + if (!qgroup) + return -ENOENT; + + rb_erase(&qgroup->node, &fs_info->qgroup_tree); + list_del(&qgroup->dirty); + + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + + while (!list_empty(&qgroup->members)) { + list = list_first_entry(&qgroup->members, + struct btrfs_qgroup_list, next_member); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + kfree(qgroup); + + return 0; +} + +/* must be called with qgroup_lock held */ +static int add_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + + list = kzalloc(sizeof(*list), GFP_ATOMIC); + if (!list) + return -ENOMEM; + + list->group = parent; + list->member = member; + list_add_tail(&list->next_group, &member->groups); + list_add_tail(&list->next_member, &parent->members); + + return 0; +} + +/* must be called with qgroup_lock held */ +static int del_relation_rb(struct btrfs_fs_info *fs_info, + u64 memberid, u64 parentid) +{ + struct btrfs_qgroup *member; + struct btrfs_qgroup *parent; + struct btrfs_qgroup_list *list; + + member = find_qgroup_rb(fs_info, memberid); + parent = find_qgroup_rb(fs_info, parentid); + if (!member || !parent) + return -ENOENT; + + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + return 0; + } + } + return -ENOENT; +} + +/* + * The full config is read in one go, only called from open_ctree() + * It doesn't use any locking, as at this point we're still single-threaded + */ +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_path *path = NULL; + struct extent_buffer *l; + int slot; + int ret = 0; + u64 flags = 0; + + if (!fs_info->quota_enabled) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + /* default this to quota off, in case no status key is found */ + fs_info->qgroup_flags = 0; + + /* + * pass 1: read status, all qgroup infos and limits + */ + key.objectid = 0; + key.type = 0; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); + if (ret) + goto out; + + while (1) { + struct btrfs_qgroup *qgroup; + + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { + struct btrfs_qgroup_status_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_status_item); + + if (btrfs_qgroup_status_version(l, ptr) != + BTRFS_QGROUP_STATUS_VERSION) { + printk(KERN_ERR + "btrfs: old qgroup version, quota disabled\n"); + goto out; + } + if (btrfs_qgroup_status_generation(l, ptr) != + fs_info->generation) { + flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + printk(KERN_ERR + "btrfs: qgroup generation mismatch, " + "marked as inconsistent\n"); + } + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, + ptr); + /* FIXME read scan element */ + goto next1; + } + + if (found_key.type != BTRFS_QGROUP_INFO_KEY && + found_key.type != BTRFS_QGROUP_LIMIT_KEY) + goto next1; + + qgroup = find_qgroup_rb(fs_info, found_key.offset); + if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || + (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { + printk(KERN_ERR "btrfs: inconsitent qgroup config\n"); + flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + if (!qgroup) { + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out; + } + } + switch (found_key.type) { + case BTRFS_QGROUP_INFO_KEY: { + struct btrfs_qgroup_info_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_info_item); + qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); + qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); + qgroup->excl = btrfs_qgroup_info_excl(l, ptr); + qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); + /* generation currently unused */ + break; + } + case BTRFS_QGROUP_LIMIT_KEY: { + struct btrfs_qgroup_limit_item *ptr; + + ptr = btrfs_item_ptr(l, slot, + struct btrfs_qgroup_limit_item); + qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); + qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); + qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); + qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); + qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); + break; + } + } +next1: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } + btrfs_release_path(path); + + /* + * pass 2: read all qgroup relations + */ + key.objectid = 0; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = 0; + ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); + if (ret) + goto out; + while (1) { + slot = path->slots[0]; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + + if (found_key.type != BTRFS_QGROUP_RELATION_KEY) + goto next2; + + if (found_key.objectid > found_key.offset) { + /* parent <- member, not needed to build config */ + /* FIXME should we omit the key completely? */ + goto next2; + } + + ret = add_relation_rb(fs_info, found_key.objectid, + found_key.offset); + if (ret) + goto out; +next2: + ret = btrfs_next_item(quota_root, path); + if (ret < 0) + goto out; + if (ret) + break; + } +out: + fs_info->qgroup_flags |= flags; + if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + } + btrfs_free_path(path); + + return ret < 0 ? ret : 0; +} + +/* + * This is only called from close_ctree() or open_ctree(), both in single- + * treaded paths. Clean up the in-memory structures. No locking needed. + */ +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) +{ + struct rb_node *n; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *list; + + while ((n = rb_first(&fs_info->qgroup_tree))) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + rb_erase(n, &fs_info->qgroup_tree); + + WARN_ON(!list_empty(&qgroup->dirty)); + + while (!list_empty(&qgroup->groups)) { + list = list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, + next_group); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + + while (!list_empty(&qgroup->members)) { + list = list_first_entry(&qgroup->members, + struct btrfs_qgroup_list, + next_member); + list_del(&list->next_group); + list_del(&list->next_member); + kfree(list); + } + kfree(qgroup); + } +} + +static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, + u64 src, u64 dst) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, + u64 src, u64 dst) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = src; + key.type = BTRFS_QGROUP_RELATION_KEY; + key.offset = dst; + + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); +out: + btrfs_free_path(path); + return ret; +} + +static int add_qgroup_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, u64 qgroupid) +{ + int ret; + struct btrfs_path *path; + struct btrfs_qgroup_info_item *qgroup_info; + struct btrfs_qgroup_limit_item *qgroup_limit; + struct extent_buffer *leaf; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_info)); + if (ret) + goto out; + + leaf = path->nodes[0]; + qgroup_info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); + btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); + + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*qgroup_limit)); + if (ret) + goto out; + + leaf = path->nodes[0]; + qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); + btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); + + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int del_qgroup_item(struct btrfs_trans_handle *trans, + struct btrfs_root *quota_root, u64 qgroupid) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroupid; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + + key.type = BTRFS_QGROUP_LIMIT_KEY; + ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, quota_root, path); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 qgroupid, + u64 flags, u64 max_rfer, u64 max_excl, + u64 rsv_rfer, u64 rsv_excl) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_limit_item *qgroup_limit; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_LIMIT_KEY; + key.offset = qgroupid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_limit = btrfs_item_ptr(l, path->slots[0], + struct btrfs_qgroup_limit_item); + btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); + btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); + btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); + btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer); + btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_info_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_qgroup *qgroup) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_info_item *qgroup_info; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_INFO_KEY; + key.offset = qgroup->qgroupid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + qgroup_info = btrfs_item_ptr(l, path->slots[0], + struct btrfs_qgroup_info_item); + btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); + btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); + btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); + btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); + btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +static int update_qgroup_status_item(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_qgroup_status_item *ptr; + int ret; + int slot; + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret > 0) + ret = -ENOENT; + + if (ret) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_generation(l, ptr, trans->transid); + /* XXX scan */ + + btrfs_mark_buffer_dirty(l); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called with qgroup_lock held + */ +static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + if (!root) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = 0; + key.offset = 0; + key.type = 0; + + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + break; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + btrfs_release_path(path); + } + ret = 0; +out: + root->fs_info->pending_quota_state = 0; + btrfs_free_path(path); + return ret; +} + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root; + struct btrfs_path *path = NULL; + struct btrfs_qgroup_status_item *ptr; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret = 0; + + spin_lock(&fs_info->qgroup_lock); + if (fs_info->quota_root) { + fs_info->pending_quota_state = 1; + spin_unlock(&fs_info->qgroup_lock); + goto out; + } + spin_unlock(&fs_info->qgroup_lock); + + /* + * initially create the quota tree + */ + quota_root = btrfs_create_tree(trans, fs_info, + BTRFS_QUOTA_TREE_OBJECTID); + if (IS_ERR(quota_root)) { + ret = PTR_ERR(quota_root); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = 0; + key.type = BTRFS_QGROUP_STATUS_KEY; + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, + sizeof(*ptr)); + if (ret) + goto out; + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_qgroup_status_item); + btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); + btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); + btrfs_set_qgroup_status_scan(leaf, ptr, 0); + + btrfs_mark_buffer_dirty(leaf); + + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + fs_info->pending_quota_state = 1; + spin_unlock(&fs_info->qgroup_lock); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *quota_root; + int ret = 0; + + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + quota_root = fs_info->quota_root; + fs_info->quota_root = NULL; + btrfs_free_qgroup_config(fs_info); + spin_unlock(&fs_info->qgroup_lock); + + if (!quota_root) + return -EINVAL; + + ret = btrfs_clean_quota_tree(trans, quota_root); + if (ret) + goto out; + + ret = btrfs_del_root(trans, tree_root, "a_root->root_key); + if (ret) + goto out; + + list_del("a_root->dirty_list); + + btrfs_tree_lock(quota_root->node); + clean_tree_block(trans, tree_root, quota_root->node); + btrfs_tree_unlock(quota_root->node); + btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); + + free_extent_buffer(quota_root->node); + free_extent_buffer(quota_root->commit_root); + kfree(quota_root); +out: + return ret; +} + +int btrfs_quota_rescan(struct btrfs_fs_info *fs_info) +{ + /* FIXME */ + return 0; +} + +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + struct btrfs_root *quota_root; + int ret = 0; + + quota_root = fs_info->quota_root; + if (!quota_root) + return -EINVAL; + + ret = add_qgroup_relation_item(trans, quota_root, src, dst); + if (ret) + return ret; + + ret = add_qgroup_relation_item(trans, quota_root, dst, src); + if (ret) { + del_qgroup_relation_item(trans, quota_root, src, dst); + return ret; + } + + spin_lock(&fs_info->qgroup_lock); + ret = add_relation_rb(quota_root->fs_info, src, dst); + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} + +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + struct btrfs_root *quota_root; + int ret = 0; + int err; + + quota_root = fs_info->quota_root; + if (!quota_root) + return -EINVAL; + + ret = del_qgroup_relation_item(trans, quota_root, src, dst); + err = del_qgroup_relation_item(trans, quota_root, dst, src); + if (err && !ret) + ret = err; + + spin_lock(&fs_info->qgroup_lock); + del_relation_rb(fs_info, src, dst); + + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} + +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, char *name) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + quota_root = fs_info->quota_root; + if (!quota_root) + return -EINVAL; + + ret = add_qgroup_item(trans, quota_root, qgroupid); + + spin_lock(&fs_info->qgroup_lock); + qgroup = add_qgroup_rb(fs_info, qgroupid); + spin_unlock(&fs_info->qgroup_lock); + + if (IS_ERR(qgroup)) + ret = PTR_ERR(qgroup); + + return ret; +} + +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_root *quota_root; + int ret = 0; + + quota_root = fs_info->quota_root; + if (!quota_root) + return -EINVAL; + + ret = del_qgroup_item(trans, quota_root, qgroupid); + + spin_lock(&fs_info->qgroup_lock); + del_qgroup_rb(quota_root->fs_info, qgroupid); + + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} + +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit) +{ + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_qgroup *qgroup; + int ret = 0; + + if (!quota_root) + return -EINVAL; + + ret = update_qgroup_limit_item(trans, quota_root, qgroupid, + limit->flags, limit->max_rfer, + limit->max_excl, limit->rsv_rfer, + limit->rsv_excl); + if (ret) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + printk(KERN_INFO "unable to update quota limit for %llu\n", + (unsigned long long)qgroupid); + } + + spin_lock(&fs_info->qgroup_lock); + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto unlock; + } + qgroup->lim_flags = limit->flags; + qgroup->max_rfer = limit->max_rfer; + qgroup->max_excl = limit->max_excl; + qgroup->rsv_rfer = limit->rsv_rfer; + qgroup->rsv_excl = limit->rsv_excl; + +unlock: + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} + +static void qgroup_dirty(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + if (list_empty(&qgroup->dirty)) + list_add(&qgroup->dirty, &fs_info->dirty_qgroups); +} + +/* + * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts + * the modification into a list that's later used by btrfs_end_transaction to + * pass the recorded modifications on to btrfs_qgroup_account_ref. + */ +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct qgroup_update *u; + + BUG_ON(!trans->delayed_ref_elem.seq); + u = kmalloc(sizeof(*u), GFP_NOFS); + if (!u) + return -ENOMEM; + + u->node = node; + u->extent_op = extent_op; + list_add_tail(&u->list, &trans->qgroup_ref_list); + + return 0; +} + +/* + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted + * from the fs. First, all roots referencing the extent are searched, and + * then the space is accounted accordingly to the different roots. The + * accounting algorithm works in 3 steps documented inline. + */ +int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_key ins; + struct btrfs_root *quota_root; + u64 ref_root; + struct btrfs_qgroup *qgroup; + struct ulist_node *unode; + struct ulist *roots = NULL; + struct ulist *tmp = NULL; + struct ulist_iterator uiter; + u64 seq; + int ret = 0; + int sgn; + + if (!fs_info->quota_enabled) + return 0; + + BUG_ON(!fs_info->quota_root); + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) { + struct btrfs_delayed_tree_ref *ref; + ref = btrfs_delayed_node_to_tree_ref(node); + ref_root = ref->root; + } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_delayed_data_ref *ref; + ref = btrfs_delayed_node_to_data_ref(node); + ref_root = ref->root; + } else { + BUG(); + } + + if (!is_fstree(ref_root)) { + /* + * non-fs-trees are not being accounted + */ + return 0; + } + + switch (node->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + sgn = 1; + break; + case BTRFS_DROP_DELAYED_REF: + sgn = -1; + break; + case BTRFS_UPDATE_DELAYED_HEAD: + return 0; + default: + BUG(); + } + + /* + * the delayed ref sequence number we pass depends on the direction of + * the operation. for add operations, we pass (node->seq - 1) to skip + * the delayed ref's current sequence number, because we need the state + * of the tree before the add operation. for delete operations, we pass + * (node->seq) to include the delayed ref's current sequence number, + * because we need the state of the tree after the delete operation. + */ + ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, + sgn > 0 ? node->seq - 1 : node->seq, &roots); + if (ret < 0) + goto out; + + spin_lock(&fs_info->qgroup_lock); + quota_root = fs_info->quota_root; + if (!quota_root) + goto unlock; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto unlock; + + /* + * step 1: for each old ref, visit all nodes once and inc refcnt + */ + tmp = ulist_alloc(GFP_ATOMIC); + if (!tmp) { + ret = -ENOMEM; + goto unlock; + } + seq = fs_info->qgroup_seq; + fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + struct btrfs_qgroup *qg; + + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ulist_reinit(tmp); + /* XXX id not needed */ + ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)tmp_unode->aux; + if (qg->refcnt < seq) + qg->refcnt = seq + 1; + else + ++qg->refcnt; + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(tmp, glist->group->qgroupid, + (unsigned long)glist->group, + GFP_ATOMIC); + } + } + } + + /* + * step 2: walk from the new root + */ + ulist_reinit(tmp); + ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)unode->aux; + if (qg->refcnt < seq) { + /* not visited by step 1 */ + qg->rfer += sgn * node->num_bytes; + qg->rfer_cmpr += sgn * node->num_bytes; + if (roots->nnodes == 0) { + qg->excl += sgn * node->num_bytes; + qg->excl_cmpr += sgn * node->num_bytes; + } + qgroup_dirty(fs_info, qg); + } + WARN_ON(qg->tag >= seq); + qg->tag = seq; + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(tmp, glist->group->qgroupid, + (unsigned long)glist->group, GFP_ATOMIC); + } + } + + /* + * step 3: walk again from old refs + */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + struct btrfs_qgroup *qg; + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ulist_reinit(tmp); + ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC); + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)tmp_unode->aux; + if (qg->tag == seq) + continue; + + if (qg->refcnt - seq == roots->nnodes) { + qg->excl -= sgn * node->num_bytes; + qg->excl_cmpr -= sgn * node->num_bytes; + qgroup_dirty(fs_info, qg); + } + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(tmp, glist->group->qgroupid, + (unsigned long)glist->group, + GFP_ATOMIC); + } + } + } + ret = 0; +unlock: + spin_unlock(&fs_info->qgroup_lock); +out: + ulist_free(roots); + ulist_free(tmp); + + return ret; +} + +/* + * called from commit_transaction. Writes all changed qgroups to disk. + */ +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *quota_root = fs_info->quota_root; + int ret = 0; + + if (!quota_root) + goto out; + + fs_info->quota_enabled = fs_info->pending_quota_state; + + spin_lock(&fs_info->qgroup_lock); + while (!list_empty(&fs_info->dirty_qgroups)) { + struct btrfs_qgroup *qgroup; + qgroup = list_first_entry(&fs_info->dirty_qgroups, + struct btrfs_qgroup, dirty); + list_del_init(&qgroup->dirty); + spin_unlock(&fs_info->qgroup_lock); + ret = update_qgroup_info_item(trans, quota_root, qgroup); + if (ret) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + spin_lock(&fs_info->qgroup_lock); + } + if (fs_info->quota_enabled) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; + else + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + spin_unlock(&fs_info->qgroup_lock); + + ret = update_qgroup_status_item(trans, fs_info, quota_root); + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + +out: + + return ret; +} + +/* + * copy the acounting information between qgroups. This is necessary when a + * snapshot or a subvolume is created + */ +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit) +{ + int ret = 0; + int i; + u64 *i_qgroups; + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_qgroup *srcgroup; + struct btrfs_qgroup *dstgroup; + u32 level_size = 0; + + if (!fs_info->quota_enabled) + return 0; + + if (!quota_root) + return -EINVAL; + + /* + * create a tracking group for the subvol itself + */ + ret = add_qgroup_item(trans, quota_root, objectid); + if (ret) + goto out; + + if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { + ret = update_qgroup_limit_item(trans, quota_root, objectid, + inherit->lim.flags, + inherit->lim.max_rfer, + inherit->lim.max_excl, + inherit->lim.rsv_rfer, + inherit->lim.rsv_excl); + if (ret) + goto out; + } + + if (srcid) { + struct btrfs_root *srcroot; + struct btrfs_key srckey; + int srcroot_level; + + srckey.objectid = srcid; + srckey.type = BTRFS_ROOT_ITEM_KEY; + srckey.offset = (u64)-1; + srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey); + if (IS_ERR(srcroot)) { + ret = PTR_ERR(srcroot); + goto out; + } + + rcu_read_lock(); + srcroot_level = btrfs_header_level(srcroot->node); + level_size = btrfs_level_size(srcroot, srcroot_level); + rcu_read_unlock(); + } + + /* + * add qgroup to all inherited groups + */ + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i) { + ret = add_qgroup_relation_item(trans, quota_root, + objectid, *i_qgroups); + if (ret) + goto out; + ret = add_qgroup_relation_item(trans, quota_root, + *i_qgroups, objectid); + if (ret) + goto out; + ++i_qgroups; + } + } + + + spin_lock(&fs_info->qgroup_lock); + + dstgroup = add_qgroup_rb(fs_info, objectid); + if (!dstgroup) + goto unlock; + + if (srcid) { + srcgroup = find_qgroup_rb(fs_info, srcid); + if (!srcgroup) + goto unlock; + dstgroup->rfer = srcgroup->rfer - level_size; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; + srcgroup->excl = level_size; + srcgroup->excl_cmpr = level_size; + qgroup_dirty(fs_info, dstgroup); + qgroup_dirty(fs_info, srcgroup); + } + + if (!inherit) + goto unlock; + + i_qgroups = (u64 *)(inherit + 1); + for (i = 0; i < inherit->num_qgroups; ++i) { + ret = add_relation_rb(quota_root->fs_info, objectid, + *i_qgroups); + if (ret) + goto unlock; + ++i_qgroups; + } + + for (i = 0; i < inherit->num_ref_copies; ++i) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->rfer = src->rfer - level_size; + dst->rfer_cmpr = src->rfer_cmpr - level_size; + i_qgroups += 2; + } + for (i = 0; i < inherit->num_excl_copies; ++i) { + struct btrfs_qgroup *src; + struct btrfs_qgroup *dst; + + src = find_qgroup_rb(fs_info, i_qgroups[0]); + dst = find_qgroup_rb(fs_info, i_qgroups[1]); + + if (!src || !dst) { + ret = -EINVAL; + goto unlock; + } + + dst->excl = src->excl + level_size; + dst->excl_cmpr = src->excl_cmpr + level_size; + i_qgroups += 2; + } + +unlock: + spin_unlock(&fs_info->qgroup_lock); +out: + return ret; +} + +/* + * reserve some space for a qgroup and all its parents. The reservation takes + * place with start_transaction or dealloc_reserve, similar to ENOSPC + * accounting. If not enough space is available, EDQUOT is returned. + * We assume that the requested space is new for all qgroups. + */ +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 ref_root = root->root_key.objectid; + int ret = 0; + struct ulist *ulist = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + + if (!is_fstree(ref_root)) + return 0; + + if (num_bytes == 0) + return 0; + + spin_lock(&fs_info->qgroup_lock); + quota_root = fs_info->quota_root; + if (!quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + /* + * in a first step, we check all affected qgroups if any limits would + * be exceeded + */ + ulist = ulist_alloc(GFP_ATOMIC); + ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)unode->aux; + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && + qg->reserved + qg->rfer + num_bytes > + qg->max_rfer) + ret = -EDQUOT; + + if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && + qg->reserved + qg->excl + num_bytes > + qg->max_excl) + ret = -EDQUOT; + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(ulist, glist->group->qgroupid, + (unsigned long)glist->group, GFP_ATOMIC); + } + } + if (ret) + goto out; + + /* + * no limits exceeded, now record the reservation into all qgroups + */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(ulist, &uiter))) { + struct btrfs_qgroup *qg; + + qg = (struct btrfs_qgroup *)unode->aux; + + qg->reserved += num_bytes; + } + +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(ulist); + + return ret; +} + +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; + struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *ulist = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + u64 ref_root = root->root_key.objectid; + + if (!is_fstree(ref_root)) + return; + + if (num_bytes == 0) + return; + + spin_lock(&fs_info->qgroup_lock); + + quota_root = fs_info->quota_root; + if (!quota_root) + goto out; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + ulist = ulist_alloc(GFP_ATOMIC); + ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)unode->aux; + + qg->reserved -= num_bytes; + + list_for_each_entry(glist, &qg->groups, next_group) { + ulist_add(ulist, glist->group->qgroupid, + (unsigned long)glist->group, GFP_ATOMIC); + } + } + +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(ulist); +} + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) +{ + if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) + return; + printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n", + trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", + trans->delayed_ref_elem.seq); + BUG(); +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 328b95f67660..cc20e95ea289 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -38,7 +38,6 @@ void put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); - WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -126,7 +125,6 @@ loop: cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.seq = 1; /* * although the tree mod log is per file system and not per transaction, @@ -145,10 +143,8 @@ loop: } atomic_set(&fs_info->tree_mod_seq, 0); - init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); - INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); INIT_LIST_HEAD(&cur_trans->pending_snapshots); list_add_tail(&cur_trans->list, &fs_info->trans_list); @@ -299,6 +295,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, struct btrfs_transaction *cur_trans; u64 num_bytes = 0; int ret; + u64 qgroup_reserved = 0; if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) return ERR_PTR(-EROFS); @@ -317,6 +314,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, * the appropriate flushing if need be. */ if (num_items > 0 && root != root->fs_info->chunk_root) { + if (root->fs_info->quota_enabled && + is_fstree(root->root_key.objectid)) { + qgroup_reserved = num_items * root->leafsize; + ret = btrfs_qgroup_reserve(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); + } + num_bytes = btrfs_calc_trans_metadata_size(root, num_items); ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, @@ -349,12 +354,16 @@ again: h->transaction = cur_trans; h->blocks_used = 0; h->bytes_reserved = 0; + h->root = root; h->delayed_ref_updates = 0; h->use_count = 1; h->adding_csums = 0; h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; + h->qgroup_reserved = qgroup_reserved; + h->delayed_ref_elem.seq = 0; + INIT_LIST_HEAD(&h->qgroup_ref_list); smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -505,6 +514,24 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + /* + * do the qgroup accounting as early as possible + */ + err = btrfs_delayed_refs_qgroup_accounting(trans, info); + + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + /* + * the same root has to be passed to start_transaction and + * end_transaction. Subvolume quota depends on this. + */ + WARN_ON(trans->root != root); + + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } + while (count < 2) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -559,6 +586,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { err = -EIO; } + assert_qgroups_uptodate(trans); memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -777,6 +805,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = btrfs_run_dev_stats(trans, root->fs_info); BUG_ON(ret); + ret = btrfs_run_qgroups(trans, root->fs_info); + BUG_ON(ret); + + /* run_qgroups might have added some more refs */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + BUG_ON(ret); + while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); @@ -949,6 +984,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } + ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid, + objectid, pending->inherit); + kfree(pending->inherit); + if (ret) { + pending->error = ret; + goto fail; + } + key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; @@ -1344,6 +1387,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; + /* + * running the delayed items may have added new refs. account + * them now so that they hinder processing of more delayed refs + * as little as possible. + */ + btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); + /* * rename don't use btrfs_join_transaction, so, once we * set the transaction to blocked above, we aren't going @@ -1456,6 +1506,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, root->fs_info->chunk_root->node); switch_commit_root(root->fs_info->chunk_root); + assert_qgroups_uptodate(trans); update_super_roots(root); if (!root->fs_info->log_root_recovering) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d314a74b4968..e8b8416c688b 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -20,6 +20,7 @@ #define __BTRFS_TRANSACTION__ #include "btrfs_inode.h" #include "delayed-ref.h" +#include "ctree.h" struct btrfs_transaction { u64 transid; @@ -49,6 +50,7 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; @@ -58,12 +60,21 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *orig_rsv; int aborted; int adding_csums; + /* + * this root is only needed to validate that the root passed to + * start_transaction is the same as the one passed to end_transaction. + * Subvolume quota depends on this + */ + struct btrfs_root *root; + struct seq_list delayed_ref_elem; + struct list_head qgroup_ref_list; }; struct btrfs_pending_snapshot { struct dentry *dentry; struct btrfs_root *root; struct btrfs_root *snap; + struct btrfs_qgroup_inherit *inherit; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; /* extra metadata reseration for relocation */