diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3b6e35aafc9e..3af777357acb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -54,6 +54,31 @@ struct btrfs_path *btrfs_alloc_path(void) return path; } +/* + * set all locked nodes in the path to blocking locks. This should + * be done before scheduling + */ +noinline void btrfs_set_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (p->nodes[i] && p->locks[i]) + btrfs_set_lock_blocking(p->nodes[i]); + } +} + +/* + * reset all the locked nodes in the patch to spinning locks. + */ +noinline void btrfs_clear_path_blocking(struct btrfs_path *p) +{ + int i; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (p->nodes[i] && p->locks[i]) + btrfs_clear_lock_blocking(p->nodes[i]); + } +} + /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { @@ -272,6 +297,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (IS_ERR(cow)) return PTR_ERR(cow); + /* cow is set to blocking by btrfs_init_new_buffer */ + copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); @@ -397,6 +424,11 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, } search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + + if (parent) + btrfs_set_lock_blocking(parent); + btrfs_set_lock_blocking(buf); + ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0, prealloc_dest); @@ -502,6 +534,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (parent_nritems == 1) return 0; + btrfs_set_lock_blocking(parent); + for (i = start_slot; i < end_slot; i++) { int close = 1; @@ -562,6 +596,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, search_start = last_block; btrfs_tree_lock(cur); + btrfs_set_lock_blocking(cur); err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, @@ -860,6 +895,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; mid = path->nodes[level]; + WARN_ON(!path->locks[level]); WARN_ON(btrfs_header_generation(mid) != trans->transid); @@ -882,6 +918,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); btrfs_tree_lock(child); + btrfs_set_lock_blocking(child); BUG_ON(!child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); @@ -898,6 +935,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); btrfs_tree_unlock(child); + path->locks[level] = 0; path->nodes[level] = NULL; clean_tree_block(trans, root, mid); @@ -922,6 +960,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = read_node_slot(root, parent, pslot - 1); if (left) { btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, 0); if (wret) { @@ -932,6 +971,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = read_node_slot(root, parent, pslot + 1); if (right) { btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, 0); if (wret) { @@ -1107,6 +1147,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 left_nr; btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -1153,7 +1195,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, */ if (right) { u32 right_nr; + btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { wret = 1; @@ -1264,6 +1309,68 @@ static noinline void reada_for_search(struct btrfs_root *root, } } +/* + * returns -EAGAIN if it had to drop the path, or zero if everything was in + * cache + */ +static noinline int reada_for_balance(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + int slot; + int nritems; + struct extent_buffer *parent; + struct extent_buffer *eb; + u64 gen; + u64 block1 = 0; + u64 block2 = 0; + int ret = 0; + int blocksize; + + parent = path->nodes[level - 1]; + if (!parent) + return 0; + + nritems = btrfs_header_nritems(parent); + slot = path->slots[level]; + blocksize = btrfs_level_size(root, level); + + if (slot > 0) { + block1 = btrfs_node_blockptr(parent, slot - 1); + gen = btrfs_node_ptr_generation(parent, slot - 1); + eb = btrfs_find_tree_block(root, block1, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block1 = 0; + free_extent_buffer(eb); + } + if (slot < nritems) { + block2 = btrfs_node_blockptr(parent, slot + 1); + gen = btrfs_node_ptr_generation(parent, slot + 1); + eb = btrfs_find_tree_block(root, block2, blocksize); + if (eb && btrfs_buffer_uptodate(eb, gen)) + block2 = 0; + free_extent_buffer(eb); + } + if (block1 || block2) { + ret = -EAGAIN; + btrfs_release_path(root, path); + if (block1) + readahead_tree_block(root, block1, blocksize, 0); + if (block2) + readahead_tree_block(root, block2, blocksize, 0); + + if (block1) { + eb = read_tree_block(root, block1, blocksize, 0); + free_extent_buffer(eb); + } + if (block1) { + eb = read_tree_block(root, block2, blocksize, 0); + free_extent_buffer(eb); + } + } + return ret; +} + + /* * when we walk down the tree, it is usually safe to unlock the higher layers * in the tree. The exceptions are when our path goes through slot 0, because @@ -1314,6 +1421,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } } +/* + * This releases any locks held in the path starting at level and + * going all the way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few + * corner cases, such as COW of the block at slot zero in the node. This + * ignores those rules, and it should only be called when there are no + * more updates to be done higher up in the tree. + */ +noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks || path->lowest_level) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + btrfs_tree_unlock(path->nodes[i]); + path->locks[i] = 0; + } +} + /* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -1385,6 +1518,7 @@ again: */ if (prealloc_block.objectid && prealloc_block.offset != b->len) { + btrfs_set_path_blocking(p); btrfs_free_reserved_extent(root, prealloc_block.objectid, prealloc_block.offset); @@ -1409,6 +1543,8 @@ again: goto again; } + btrfs_set_path_blocking(p); + wret = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], @@ -1430,6 +1566,22 @@ cow_done: if (!p->skip_locking) p->locks[level] = 1; + btrfs_clear_path_blocking(p); + + /* + * we have a lock on b and as long as we aren't changing + * the tree, there is no way to for the items in b to change. + * It is safe to drop the lock on our parent before we + * go through the expensive btree search on b. + * + * If cow is true, then we might be changing slot zero, + * which may require changing the parent. So, we can't + * drop the lock until after we know which slot we're + * operating on. + */ + if (!cow) + btrfs_unlock_up_safe(p, level + 1); + ret = check_block(root, p, level); if (ret) { ret = -1; @@ -1437,6 +1589,7 @@ cow_done: } ret = bin_search(b, key, level, &slot); + if (level != 0) { if (ret && slot > 0) slot -= 1; @@ -1444,7 +1597,16 @@ cow_done: if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret = split_node(trans, root, p, level); + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p); + BUG_ON(sret > 0); if (sret) { ret = sret; @@ -1453,8 +1615,16 @@ cow_done: b = p->nodes[level]; slot = p->slots[level]; } else if (ins_len < 0) { - int sret = balance_level(trans, root, p, - level); + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p); + if (sret) { ret = sret; goto done; @@ -1488,7 +1658,7 @@ cow_done: * of the btree by dropping locks before * we read. */ - if (level > 1) { + if (level > 0) { btrfs_release_path(NULL, p); if (tmp) free_extent_buffer(tmp); @@ -1503,6 +1673,7 @@ cow_done: free_extent_buffer(tmp); goto again; } else { + btrfs_set_path_blocking(p); if (tmp) free_extent_buffer(tmp); if (should_reada) @@ -1512,14 +1683,29 @@ cow_done: b = read_node_slot(root, b, slot); } } - if (!p->skip_locking) - btrfs_tree_lock(b); + if (!p->skip_locking) { + int lret; + + btrfs_clear_path_blocking(p); + lret = btrfs_try_spin_lock(b); + + if (!lret) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + btrfs_clear_path_blocking(p); + } + } } else { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { - int sret = split_leaf(trans, root, key, + int sret; + + btrfs_set_path_blocking(p); + sret = split_leaf(trans, root, key, p, ins_len, ret == 0); + btrfs_clear_path_blocking(p); + BUG_ON(sret > 0); if (sret) { ret = sret; @@ -1533,12 +1719,16 @@ cow_done: } ret = 1; done: + /* + * we don't really know what they plan on doing with the path + * from here on, so for now just mark it as blocking + */ + btrfs_set_path_blocking(p); if (prealloc_block.objectid) { btrfs_free_reserved_extent(root, prealloc_block.objectid, prealloc_block.offset); } - return ret; } @@ -1562,6 +1752,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); BUG_ON(ret); + btrfs_set_lock_blocking(eb); + parent = eb; while (1) { level = btrfs_header_level(parent); @@ -1586,6 +1778,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, eb = read_tree_block(root, bytenr, blocksize, generation); btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); } /* @@ -1610,6 +1803,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans, eb = read_tree_block(root, bytenr, blocksize, generation); btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); } ret = btrfs_cow_block(trans, root, eb, parent, slot, @@ -2156,6 +2350,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root right = read_node_slot(root, upper, slot + 1); btrfs_tree_lock(right); + btrfs_set_lock_blocking(right); + free_space = btrfs_leaf_free_space(root, right); if (free_space < data_size) goto out_unlock; @@ -2351,6 +2547,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root left = read_node_slot(root, path->nodes[1], slot - 1); btrfs_tree_lock(left); + btrfs_set_lock_blocking(left); + free_space = btrfs_leaf_free_space(root, left); if (free_space < data_size) { ret = 1; @@ -2809,6 +3007,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, path->keep_locks = 0; BUG_ON(ret); + /* + * make sure any changes to the path from split_leaf leave it + * in a blocking state + */ + btrfs_set_path_blocking(path); + leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); @@ -3338,6 +3542,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, BUG(); } out: + btrfs_unlock_up_safe(path, 1); return ret; } @@ -3705,12 +3910,14 @@ find_next_key: */ if (slot >= nritems) { path->slots[level] = slot; + btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, cache_only, min_trans); if (sret == 0) { btrfs_release_path(root, path); goto again; } else { + btrfs_clear_path_blocking(path); goto out; } } @@ -3722,16 +3929,20 @@ find_next_key: unlock_up(path, level, 1); goto out; } + btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); btrfs_tree_lock(cur); + path->locks[level - 1] = 1; path->nodes[level - 1] = cur; unlock_up(path, level, 1); + btrfs_clear_path_blocking(path); } out: if (ret == 0) memcpy(min_key, &found_key, sizeof(found_key)); + btrfs_set_path_blocking(path); return ret; } @@ -3827,6 +4038,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) if (ret < 0) return ret; + btrfs_set_path_blocking(path); nritems = btrfs_header_nritems(path->nodes[0]); /* * by releasing the path above we dropped all our locks. A balance @@ -3857,6 +4069,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) free_extent_buffer(next); } + /* the path was set to blocking above */ if (level == 1 && (path->locks[1] || path->skip_locking) && path->reada) reada_for_search(root, path, level, slot, 0); @@ -3865,6 +4078,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) if (!path->skip_locking) { WARN_ON(!btrfs_tree_locked(c)); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); } break; } @@ -3881,12 +4095,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) path->locks[level] = 1; if (!level) break; + + btrfs_set_path_blocking(path); if (level == 1 && path->locks[1] && path->reada) reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, next, 0); if (!path->skip_locking) { WARN_ON(!btrfs_tree_locked(path->nodes[level])); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); } } done: @@ -3911,6 +4128,7 @@ int btrfs_previous_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { + btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f2b8d26b0472..531db112c8bd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1835,6 +1835,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_init_path(struct btrfs_path *p); +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_clear_path_blocking(struct btrfs_path *p); +void btrfs_unlock_up_safe(struct btrfs_path *p, int level); + int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); int btrfs_del_leaf(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 549271607c17..5aebddd71193 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -799,7 +799,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret == 0) - buf->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); else WARN_ON(1); return buf; @@ -813,6 +813,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { WARN_ON(!btrfs_tree_locked(buf)); + + /* ugh, clear_extent_buffer_dirty can be expensive */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); } @@ -2311,6 +2315,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) u64 transid = btrfs_header_generation(buf); struct inode *btree_inode = root->fs_info->btree_inode; + btrfs_set_lock_blocking(buf); + WARN_ON(!btrfs_tree_locked(buf)); if (transid != root->fs_info->generation) { printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " @@ -2353,7 +2359,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) int ret; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret == 0) - buf->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags); return ret; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7a22f2e6ec47..ed1e25d72483 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3407,7 +3407,10 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_header_generation(buf, trans->transid); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + + btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { set_extent_dirty(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); @@ -3416,6 +3419,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, buf->start + buf->len - 1, GFP_NOFS); } trans->blocks_used++; + /* this returns a buffer locked for blocking */ return buf; } @@ -3752,6 +3756,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, next = read_tree_block(root, bytenr, blocksize, ptr_gen); btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, &refs); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2ea7f052722c..dd5df53e045a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2990,7 +2990,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, eb = kmem_cache_zalloc(extent_buffer_cache, mask); eb->start = start; eb->len = len; - mutex_init(&eb->mutex); + spin_lock_init(&eb->lock); + init_waitqueue_head(&eb->lock_wq); + #if LEAK_DEBUG spin_lock_irqsave(&leak_lock, flags); list_add(&eb->leak_list, &buffers); @@ -3071,8 +3073,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, unlock_page(p); } if (uptodate) - eb->flags |= EXTENT_UPTODATE; - eb->flags |= EXTENT_BUFFER_FILLED; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); spin_lock(&tree->buffer_lock); exists = buffer_tree_insert(tree, start, &eb->rb_node); @@ -3226,7 +3227,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, unsigned long num_pages; num_pages = num_extent_pages(eb->start, eb->len); - eb->flags &= ~EXTENT_UPTODATE; + clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); @@ -3297,7 +3298,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, struct page *page; int pg_uptodate = 1; - if (eb->flags & EXTENT_UPTODATE) + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 1; ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, @@ -3333,7 +3334,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned long bio_flags = 0; - if (eb->flags & EXTENT_UPTODATE) + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, @@ -3364,7 +3365,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (all_uptodate) { if (start_i == 0) - eb->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); goto unlock_exit; } @@ -3400,7 +3401,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (!ret) - eb->flags |= EXTENT_UPTODATE; + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); return ret; unlock_exit: @@ -3497,7 +3498,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start, unmap_extent_buffer(eb, eb->map_token, km); eb->map_token = NULL; save = 1; - WARN_ON(!mutex_is_locked(&eb->mutex)); } err = map_private_extent_buffer(eb, start, min_len, token, map, map_start, map_len, km); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e80c6d96b318..1f9df88afbf6 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -22,6 +22,10 @@ /* flags for bio submission */ #define EXTENT_BIO_COMPRESSED 1 +/* these are bit numbers for test/set bit */ +#define EXTENT_BUFFER_UPTODATE 0 +#define EXTENT_BUFFER_BLOCKING 1 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. @@ -95,11 +99,19 @@ struct extent_buffer { unsigned long map_start; unsigned long map_len; struct page *first_page; + unsigned long bflags; atomic_t refs; - int flags; struct list_head leak_list; struct rb_node rb_node; - struct mutex mutex; + + /* the spinlock is used to protect most operations */ + spinlock_t lock; + + /* + * when we keep the lock held while blocking, waiters go onto + * the wq + */ + wait_queue_head_t lock_wq; }; struct extent_map_tree; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4a79e1c5ebd0..ebd7d6c37df8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@ #include "tree-log.h" #include "ref-cache.h" #include "compression.h" +#include "locking.h" struct btrfs_iget_args { u64 ino; @@ -2021,6 +2022,7 @@ void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); @@ -2117,6 +2119,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, goto failed; } + btrfs_unlock_up_safe(path, 1); leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 39bae7761db6..68fd9ccf1805 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -26,45 +26,215 @@ #include "locking.h" /* - * locks the per buffer mutex in an extent buffer. This uses adaptive locks - * and the spin is not tuned very extensively. The spinning does make a big - * difference in almost every workload, but spinning for the right amount of - * time needs some help. - * - * In general, we want to spin as long as the lock holder is doing btree - * searches, and we should give up if they are in more expensive code. + * btrfs_header_level() isn't free, so don't call it when lockdep isn't + * on */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void spin_nested(struct extent_buffer *eb) +{ + spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); +} +#else +static inline void spin_nested(struct extent_buffer *eb) +{ + spin_lock(&eb->lock); +} +#endif -int btrfs_tree_lock(struct extent_buffer *eb) +/* + * Setting a lock to blocking will drop the spinlock and set the + * flag that forces other procs who want the lock to wait. After + * this you can safely schedule with the lock held. + */ +void btrfs_set_lock_blocking(struct extent_buffer *eb) +{ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + spin_unlock(&eb->lock); + } + /* exit with the spin lock released and the bit set */ +} + +/* + * clearing the blocking flag will take the spinlock again. + * After this you can't safely schedule + */ +void btrfs_clear_lock_blocking(struct extent_buffer *eb) +{ + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + spin_nested(eb); + clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); + smp_mb__after_clear_bit(); + } + /* exit with the spin lock held */ +} + +/* + * unfortunately, many of the places that currently set a lock to blocking + * don't end up blocking for every long, and often they don't block + * at all. For a dbench 50 run, if we don't spin one the blocking bit + * at all, the context switch rate can jump up to 400,000/sec or more. + * + * So, we're still stuck with this crummy spin on the blocking bit, + * at least until the most common causes of the short blocks + * can be dealt with. + */ +static int btrfs_spin_on_block(struct extent_buffer *eb) { int i; - - if (mutex_trylock(&eb->mutex)) - return 0; for (i = 0; i < 512; i++) { cpu_relax(); - if (mutex_trylock(&eb->mutex)) - return 0; + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + if (need_resched()) + break; } - cpu_relax(); - mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); return 0; } +/* + * This is somewhat different from trylock. It will take the + * spinlock but if it finds the lock is set to blocking, it will + * return without the lock held. + * + * returns 1 if it was able to take the lock and zero otherwise + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_try_spin_lock(struct extent_buffer *eb) +{ + int i; + + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + + /* spin for a bit on the BLOCKING flag */ + for (i = 0; i < 2; i++) { + if (!btrfs_spin_on_block(eb)) + break; + + spin_nested(eb); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 1; + spin_unlock(&eb->lock); + } + return 0; +} + +/* + * the autoremove wake function will return 0 if it tried to wake up + * a process that was already awake, which means that process won't + * count as an exclusive wakeup. The waitq code will continue waking + * procs until it finds one that was actually sleeping. + * + * For btrfs, this isn't quite what we want. We want a single proc + * to be notified that the lock is ready for taking. If that proc + * already happen to be awake, great, it will loop around and try for + * the lock. + * + * So, btrfs_wake_function always returns 1, even when the proc that we + * tried to wake up was already awake. + */ +static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + autoremove_wake_function(wait, mode, sync, key); + return 1; +} + +/* + * returns with the extent buffer spinlocked. + * + * This will spin and/or wait as required to take the lock, and then + * return with the spinlock held. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ +int btrfs_tree_lock(struct extent_buffer *eb) +{ + DEFINE_WAIT(wait); + wait.func = btrfs_wake_function; + + while(1) { + spin_nested(eb); + + /* nobody is blocking, exit with the spinlock held */ + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + return 0; + + /* + * we have the spinlock, but the real owner is blocking. + * wait for them + */ + spin_unlock(&eb->lock); + + /* + * spin for a bit, and if the blocking flag goes away, + * loop around + */ + if (btrfs_spin_on_block(eb)) + continue; + + prepare_to_wait_exclusive(&eb->lock_wq, &wait, + TASK_UNINTERRUPTIBLE); + + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + schedule(); + + finish_wait(&eb->lock_wq, &wait); + } + return 0; +} + +/* + * Very quick trylock, this does not spin or schedule. It returns + * 1 with the spinlock held if it was able to take the lock, or it + * returns zero if it was unable to take the lock. + * + * After this call, scheduling is not safe without first calling + * btrfs_set_lock_blocking() + */ int btrfs_try_tree_lock(struct extent_buffer *eb) { - return mutex_trylock(&eb->mutex); + if (spin_trylock(&eb->lock)) { + if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { + /* + * we've got the spinlock, but the real owner is + * blocking. Drop the spinlock and return failure + */ + spin_unlock(&eb->lock); + return 0; + } + return 1; + } + /* someone else has the spinlock giveup */ + return 0; } int btrfs_tree_unlock(struct extent_buffer *eb) { - mutex_unlock(&eb->mutex); + /* + * if we were a blocking owner, we don't have the spinlock held + * just clear the bit and look for waiters + */ + if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + smp_mb__after_clear_bit(); + else + spin_unlock(&eb->lock); + + if (waitqueue_active(&eb->lock_wq)) + wake_up(&eb->lock_wq); return 0; } int btrfs_tree_locked(struct extent_buffer *eb) { - return mutex_is_locked(&eb->mutex); + return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || + spin_is_locked(&eb->lock); } /* @@ -75,12 +245,14 @@ int btrfs_path_lock_waiting(struct btrfs_path *path, int level) { int i; struct extent_buffer *eb; + for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { eb = path->nodes[i]; if (!eb) break; smp_mb(); - if (!list_empty(&eb->mutex.wait_list)) + if (spin_is_contended(&eb->lock) || + waitqueue_active(&eb->lock_wq)) return 1; } return 0; diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index bc1faef12519..d92e707f5870 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -22,6 +22,12 @@ int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); int btrfs_tree_locked(struct extent_buffer *eb); + int btrfs_try_tree_lock(struct extent_buffer *eb); +int btrfs_try_spin_lock(struct extent_buffer *eb); + int btrfs_path_lock_waiting(struct btrfs_path *path, int level); + +void btrfs_set_lock_blocking(struct extent_buffer *eb); +void btrfs_clear_lock_blocking(struct extent_buffer *eb); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3e8358c36165..98d25fa4570e 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u32 nritems; root_node = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(root_node); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4f26f3ed0c87..20794290256b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1615,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1661,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = path->nodes[*level]; btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1718,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, root, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -1790,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); clean_tree_block(trans, log, next); + btrfs_set_lock_blocking(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next);