From d3e46fea1b1e8ba97a8c9dd8f54b97d086cd25aa Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:04:19 +0200 Subject: btrfs: sink blocksize parameter to readahead_tree_block All callers pass nodesize. Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 222d6aea4a8a..c025751c20d7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7485,7 +7485,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - readahead_tree_block(root, bytenr, blocksize); + readahead_tree_block(root, bytenr); nread++; } wc->reada_slot = slot; -- cgit v1.2.3 From fe864576de7fb940b5bd1f8ab8908a08a3416ca0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:28:42 +0200 Subject: btrfs: sink blocksize parameter to btrfs_init_new_buffer Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c025751c20d7..50ebc74db508 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7215,11 +7215,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize, int level) + u64 bytenr, int level) { struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); @@ -7338,7 +7338,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (btrfs_test_is_dummy_root(root)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - 
blocksize, level); + level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -7355,8 +7355,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, - blocksize, level); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); BUG_ON(IS_ERR(buf)); /* -ENOMEM */ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { -- cgit v1.2.3 From a83fffb75d09cd3d44167b7fb9c1ab9e2269445f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 15 Jun 2014 02:39:54 +0200 Subject: btrfs: sink blocksize parameter to btrfs_find_create_tree_block Finally it's clear that the requested blocksize is always equal to nodesize, with one exception, the superblock. Superblock has fixed size regardless of the metadata block size, but uses the same helpers to initialize sys array/chunk tree and to work with the chunk items. So it pretends to be an extent_buffer for a moment, btrfs_read_sys_array is full of special cases, we're adding one more. 
Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 50ebc74db508..8ff31f81d870 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7219,7 +7219,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); @@ -7825,7 +7825,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_tree_block(root, bytenr); if (!next) { - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, -- cgit v1.2.3 From ce93ec548cfa02f9cd6b70d546d5f36f4d160f57 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 17 Nov 2014 15:45:48 -0500 Subject: Btrfs: track dirty block groups on their own list Currently any time we try to update the block groups on disk we will walk _all_ block groups and check for the ->dirty flag to see if it is set. This function can get called several times during a commit. So if you have several terabytes of data you will be a very sad panda as we will loop through _all_ of the block groups several times, which makes the commit take a while which slows down the rest of the file system operations. This patch introduces a dirty list for the block groups that we get added to when we dirty the block group for the first time. Then we simply update any block groups that have been dirtied since the last time we called btrfs_write_dirty_block_groups. This allows us to clean up how we write the free space cache out so it is much cleaner. 
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 167 ++++++++++++++++--------------------------------- 1 file changed, 53 insertions(+), 114 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 15116585e714..21c373fe256c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -74,8 +74,9 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -3315,120 +3316,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_group_cache *cache; - int err = 0; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; struct btrfs_path *path; - u64 last = 0; + + if (list_empty(&cur_trans->dirty_bgs)) + return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - err = cache_save_setup(cache, trans, path); - last = cache->key.objectid + cache->key.offset; - btrfs_put_block_group(cache); - } - - while (1) { - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) { - btrfs_put_block_group(cache); - goto again; - } - - if (cache->dirty) - break; - cache = 
next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - if (cache->disk_cache_state == BTRFS_DC_SETUP) - cache->disk_cache_state = BTRFS_DC_NEED_WRITE; - cache->dirty = 0; - last = cache->key.objectid + cache->key.offset; - - err = write_one_cache_group(trans, root, path, cache); - btrfs_put_block_group(cache); - if (err) /* File system offline */ - goto out; - } - - while (1) { - /* - * I don't think this is needed since we're just marking our - * preallocated extent as written, but just in case it can't - * hurt. - */ - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - /* - * Really this shouldn't happen, but it could if we - * couldn't write the entire preallocated extent and - * splitting the extent resulted in a new block. - */ - if (cache->dirty) { - btrfs_put_block_group(cache); - goto again; - } - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - err = btrfs_write_out_cache(root, trans, cache, path); - - /* - * If we didn't have an error then the cache state is still - * NEED_WRITE, so we can set it to WRITTEN. - */ - if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - cache->disk_cache_state = BTRFS_DC_WRITTEN; - last = cache->key.objectid + cache->key.offset; + /* + * We don't need the lock here since we are protected by the transaction + * commit. We want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. 
+ */ + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + list_del_init(&cache->dirty_list); + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, + (unsigned long) -1); + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) + btrfs_write_out_cache(root, trans, cache, path); + if (!ret) + ret = write_one_cache_group(trans, root, path, cache); btrfs_put_block_group(cache); } -out: btrfs_free_path(path); - return err; + return ret; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -5375,8 +5298,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc) +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; @@ -5414,6 +5338,14 @@ static int update_block_group(struct btrfs_root *root, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5424,7 +5356,6 @@ static int update_block_group(struct btrfs_root *root, cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; - cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { @@ -6103,7 +6034,7 @@ static int 
__btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(root, bytenr, num_bytes, 0); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -7063,7 +6994,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(root, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -7152,7 +7083,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = update_block_group(root, ins->objectid, root->nodesize, 1); + ret = update_block_group(trans, root, ins->objectid, root->nodesize, + 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -9005,6 +8937,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); @@ -9068,9 +9001,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. 
*/ - cache->disk_cache_state = BTRFS_DC_CLEAR; if (btrfs_test_opt(root, SPACE_CACHE)) - cache->dirty = 1; + cache->disk_cache_state = BTRFS_DC_CLEAR; } read_extent_buffer(leaf, &cache->item, @@ -9461,6 +9393,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); -- cgit v1.2.3 From 6219872dc6e56529159f04e73587ed0fcd63eb20 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 6 Jan 2015 20:18:45 +0000 Subject: Btrfs: lookup for block group only if needed when freeing a tree block Very often our extent buffer's header generation doesn't match the current transaction's id or it is also referenced by other trees (snapshots), so we don't need the corresponding block group cache object. Therefore only search for it if we are going to use it, so we avoid an unnecessary search in the block groups rbtree (and acquiring and releasing its spinlock). Freeing a tree block is performed when COWing or deleting a node/leaf, which implies we are holding the node/leaf's parent node lock, therefore reducing the amount of time spent when freeing a tree block helps reducing the amount of time we are holding the parent node's lock. For example, for a run of xfstests/generic/083, the block group cache object was needed only 682 times for a total of 226691 calls to free a tree block. 
Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1c591d6eae58..3af53f8313af 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6136,7 +6136,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_group_cache *cache = NULL; int pin = 1; int ret; @@ -6152,17 +6151,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (btrfs_header_generation(buf) == trans->transid) { + struct btrfs_block_group_cache *cache; + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) goto out; } + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); goto out; } @@ -6170,6 +6172,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; } @@ -6184,7 +6187,6 @@ out: * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - btrfs_put_block_group(cache); } /* Can return -ENOMEM */ -- cgit v1.2.3 From 6e9606d2a2dce098c1739fb3cd82a1c34fd73d3a Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Tue, 20 Jan 2015 15:11:34 +0800 Subject: Btrfs: add ref_count and free function for btrfs_bio 1: ref_count is simple than current RBIO_HOLD_BBIO_MAP_BIT flag to keep btrfs_bio's memory in raid56 recovery implement. 
2: A free function for bbio makes the code cleaner and more flexible, and enforces data type checking at compile time. Changelog v1->v2: Renamed the following per David Sterba's suggestion: put_btrfs_bio() -> btrfs_put_bbio() get_btrfs_bio() -> btrfs_get_bbio() bbio->ref_count -> bbio->refs Signed-off-by: Zhao Lei Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3af53f8313af..1d361ac9abc9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1926,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(bbio); + btrfs_put_bbio(bbio); } if (actual_bytes) -- cgit v1.2.3 From 26455d3318a1e2a38f783db07981e3ed67de40ed Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 17 Dec 2014 16:14:09 +0800 Subject: Btrfs: cleanup unused run_most "run_most" is not used anymore. Signed-off-by: Liu Bo Reviewed-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1d361ac9abc9..53294da0749d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2769,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; - int run_most = 0; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2779,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, root = root->fs_info->tree_root; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) { + if (count == 0) count = atomic_read(&delayed_refs->num_entries) * 2; - run_most = 1; - } again: #ifdef SCRAMBLE_DELAYED_REFS -- cgit v1.2.3 From d4b450cd4b33ce7c572e7fdccf33b59c4cdf361c Mon Sep 17
00:00:00 2001 From: Filipe Manana Date: Thu, 29 Jan 2015 19:18:25 +0000 Subject: Btrfs: fix race between transaction commit and empty block group removal Committing a transaction can race with automatic removal of empty block groups (cleaner kthread), leading to a BUG_ON() in the transaction commit code while running btrfs_finish_extent_commit(). The following sequence diagram shows how it can happen: CPU 1 CPU 2 btrfs_commit_transaction() fs_info->running_transaction = NULL btrfs_finish_extent_commit() find_first_extent_bit() -> found range for block group X in fs_info->freed_extents[] btrfs_delete_unused_bgs() -> found block group X Removed block group X's range from fs_info->freed_extents[] btrfs_remove_chunk() btrfs_remove_block_group(bg X) unpin_extent_range(bg X range) btrfs_lookup_block_group(bg X) -> returns NULL -> BUG_ON() The trace that results from the BUG_ON() is: [48665.187808] ------------[ cut here ]------------ [48665.188032] kernel BUG at fs/btrfs/extent-tree.c:5675! [48665.188032] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC [48665.188032] Modules linked in: dm_flakey dm_mod crc32c_generic btrfs xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop parport_pc evdev microcode [48665.197388] CPU: 2 PID: 31211 Comm: kworker/u32:16 Tainted: G W 3.19.0-rc5-btrfs-next-4+ #1 [48665.197388] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [48665.197388] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs] [48665.197388] task: ffff880222011810 ti: ffff8801b56a4000 task.ti: ffff8801b56a4000 [48665.197388] RIP: 0010:[] [] unpin_extent_range+0x6a/0x1ba [btrfs] [48665.197388] RSP: 0018:ffff8801b56a7b88 EFLAGS: 00010246 [48665.197388] RAX: 0000000000000000 RBX: ffff8802143a6000 RCX: ffff8802220120c8 [48665.197388] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8800a3c140b0 [48665.197388] RBP: ffff8801b56a7bd8 R08: 
0000000000000003 R09: 0000000000000000 [48665.197388] R10: 0000000000000000 R11: 000000000000bbac R12: 0000000012e8e000 [48665.197388] R13: ffff8800a3c14000 R14: 0000000000000000 R15: 0000000000000000 [48665.197388] FS: 0000000000000000(0000) GS:ffff88023ec40000(0000) knlGS:0000000000000000 [48665.197388] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [48665.197388] CR2: 00007f065e42f270 CR3: 0000000206f70000 CR4: 00000000000006e0 [48665.197388] Stack: [48665.197388] ffff8801b56a7bd8 0000000012ea0000 01ff8800a3c14138 0000000012e9ffff [48665.197388] ffff880141df3dd8 ffff8802143a6000 ffff8800a3c14138 ffff880141df3df0 [48665.197388] ffff880141df3dd8 0000000000000000 ffff8801b56a7c08 ffffffffa0354227 [48665.197388] Call Trace: [48665.197388] [] btrfs_finish_extent_commit+0xb0/0xd9 [btrfs] [48665.197388] [] btrfs_commit_transaction+0x791/0x92c [btrfs] [48665.197388] [] flush_space+0x43d/0x452 [btrfs] [48665.197388] [] ? _raw_spin_unlock+0x28/0x33 [48665.197388] [] btrfs_async_reclaim_metadata_space+0x118/0x164 [btrfs] [48665.197388] [] ? process_one_work+0x14b/0x3ab [48665.197388] [] process_one_work+0x1e0/0x3ab [48665.197388] [] ? trace_hardirqs_off+0xd/0xf [48665.197388] [] worker_thread+0x210/0x2d0 [48665.197388] [] ? rescuer_thread+0x2c3/0x2c3 [48665.197388] [] kthread+0xef/0xf7 [48665.197388] [] ? _raw_spin_unlock_irq+0x2d/0x39 [48665.197388] [] ? __kthread_parkme+0xad/0xad [48665.197388] [] ret_from_fork+0x7c/0xb0 [48665.197388] [] ? 
__kthread_parkme+0xad/0xad [48665.197388] Code: 85 f6 74 14 49 8b 06 49 03 46 09 49 39 c4 72 1d 4c 89 f7 e8 83 ec ff ff 4c 89 e6 4c 89 ef e8 1e f1 ff ff 48 85 c0 49 89 c6 75 02 <0f> 0b 49 8b 1e 49 03 5e 09 48 8b [48665.197388] RIP [] unpin_extent_range+0x6a/0x1ba [btrfs] [48665.197388] RSP [48665.272246] ---[ end trace b9c6ab9957521376 ]--- Fix this by ensuring that unpining the block group's range in btrfs_finish_extent_commit() is done in a synchronized fashion with removing the block group's range from freed_extents[] in btrfs_delete_unused_bgs() This race got introduced with the change: Btrfs: remove empty block groups automatically commit 47ab2a6c689913db23ccae38349714edf8365e0a Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53294da0749d..857a859948a3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5735,10 +5735,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, unpin = &fs_info->freed_extents[0]; while (1) { + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); - if (ret) + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; + } if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, @@ -5746,6 +5749,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, clear_extent_dirty(unpin, start, end, GFP_NOFS); unpin_extent_range(root, start, end, true); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } @@ -9561,18 +9565,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ start = block_group->key.objectid; end = start + block_group->key.offset - 1; + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). 
If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can lookup for the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_set_block_group_rw(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_set_block_group_rw(root, block_group); goto end_trans; } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* Reset pinned so btrfs_put_block_group doesn't complain */ block_group->pinned = 0; -- cgit v1.2.3 From 2f0810880f082fa8ba66ab2c33b02e4ff9770a5e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2015 10:40:15 -0800 Subject: btrfs: delete chunk allocation attemp when setting block group ro Below test will fail currently: mkfs.ext4 -F /dev/sda btrfs-convert /dev/sda mount /dev/sda /mnt btrfs device add -f /dev/sdb /mnt btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt The reason is there are some block groups with usage 0, but the whole disk hasn't free space to allocate new chunk, so we even can't set such block group readonly. This patch deletes the chunk allocation when setting block group ro. For META, we already have reserve. But for SYSTEM, we don't have, so the check_system_chunk is still required. 
Signed-off-by: Shaohua Li Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 857a859948a3..50de1fa6fc9e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8482,14 +8482,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, alloc_flags, - CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - } - ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8500,6 +8492,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, goto out; ret = set_block_group_ro(cache, 0); out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache->flags); + check_system_chunk(trans, root, alloc_flags); + } + btrfs_end_transaction(trans, root); return ret; } -- cgit v1.2.3 From dcab6a3b2ae657a2017637083c28ee303b6b1b8e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 Feb 2015 15:08:59 -0500 Subject: Btrfs: account for large extents with enospc On our gluster boxes we stream large tar balls of backups onto our fses. With 160gb of ram this means we get really large contiguous ranges of dirty data, but the way our ENOSPC stuff works is that as long as it's contiguous we only hold metadata reservation for one extent. The problem is we limit our extents to 128mb, so we'll end up with at least 800 extents so our enospc accounting is quite a bit lower than what we need. To keep track of this make sure we increase outstanding_extents for every multiple of the max extent size so we can be sure to have enough reserved metadata space. 
Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 50de1fa6fc9e..0f6737063142 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4963,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for + * @num_bytes: the number of bytes we're relaseing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of * reserved extents that need to be freed. This must be called with * BTRFS_I(inode)->lock held. */ -static unsigned drop_outstanding_extent(struct inode *inode) +static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) { unsigned drop_inode_space = 0; unsigned dropped_extents = 0; + unsigned num_extents = 0; - BUG_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; + num_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + ASSERT(num_extents); + ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); + BTRFS_I(inode)->outstanding_extents -= num_extents; if (BTRFS_I(inode)->outstanding_extents == 0 && test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, @@ -5146,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) out_fail: spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); /* * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers @@ -5225,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) num_bytes = 
ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); if (num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); -- cgit v1.2.3 From 3d84be799194147e04c0e3129ed44a948773b80a Mon Sep 17 00:00:00 2001 From: Forrest Liu Date: Wed, 11 Feb 2015 14:24:12 +0800 Subject: Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group Removing a large number of block groups in a transaction may hit the BUG_ON() in btrfs_orphan_add(). That is because btrfs_orphan_reserve_metadata() grabs its metadata reservation from the transaction handle, and btrfs_delete_unused_bgs() didn't reserve metadata for the transaction handle when deleting an unused block group. The problem can be reproduced with the following script: mntpath=/btrfs loopdev=/dev/loop0 filepath=/home/forrest/image umount $mntpath losetup -d $loopdev truncate --size 1000g $filepath losetup $loopdev $filepath mkfs.btrfs -f $loopdev mount $loopdev $mntpath for j in `seq 1 1 1000`; do fallocate -l 1g $mntpath/$j done # wait cleaner thread remove unused block group sleep 300 The call trace that results from the BUG_ON() is: [ 613.093084] ------------[ cut here ]------------ [ 613.097928] kernel BUG at fs/btrfs/inode.c:3142!
[ 613.105855] invalid opcode: 0000 [#1] SMP [ 613.112702] Modules linked in: coretemp(E) crc32_pclmul(E) ghash_clmulni_intel(E) aesni_intel(E) snd_ens1371(E) snd_ac97_codec(E) aes_x86_64(E) lrw(E) gf128mul(E) glue_helper(E) ppdev(E) ac97_bus(E) ablk_helper(E) gameport(E) cryptd(E) snd_rawmidi(E) snd_seq_device(E) snd_pcm(E) vmw_balloon(E) snd_timer(E) snd(E) soundcore(E) serio_raw(E) vmwgfx(E) ttm(E) drm_kms_helper(E) drm(E) vmw_vmci(E) parport_pc(E) shpchp(E) i2c_piix4(E) mac_hid(E) lp(E) parport(E) btrfs(E) xor(E) raid6_pq(E) hid_generic(E) usbhid(E) hid(E) psmouse(E) ahci(E) libahci(E) e1000(E) mptspi(E) mptscsih(E) mptbase(E) floppy(E) vmw_pvscsi(E) vmxnet3(E) [ 613.144196] CPU: 0 PID: 1480 Comm: btrfs-cleaner Tainted: G E 3.19.0-rc7-custom #2 [ 613.148501] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013 [ 613.152694] task: ffff880035cdb1a0 ti: ffff880039cf4000 task.ti: ffff880039cf4000 [ 613.154969] RIP: 0010:[] [] btrfs_orphan_add+0x1d2/0x1e0 [btrfs] [ 613.157780] RSP: 0018:ffff880039cf7c48 EFLAGS: 00010286 [ 613.159560] RAX: 00000000ffffffe4 RBX: ffff88003bd981a0 RCX: ffff88003c9e4000 [ 613.161904] RDX: 0000000000002244 RSI: 0000000000040000 RDI: ffff88003c9e4138 [ 613.164264] RBP: ffff880039cf7c88 R08: 000060ffc0000850 R09: 0000000000000000 [ 613.166507] R10: ffff88003bc4b7a0 R11: ffffea0000eb6740 R12: ffff88003c9c0000 [ 613.168681] R13: ffff88003c102160 R14: ffff88003c9c0458 R15: 0000000000000001 [ 613.170932] FS: 0000000000000000(0000) GS:ffff88003f600000(0000) knlGS:0000000000000000 [ 613.173316] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 613.175227] CR2: 00007f6343537000 CR3: 0000000036329000 CR4: 00000000000407f0 [ 613.177554] Stack: [ 613.178712] ffff880039cf7c88 ffffffffa0182a54 ffff88003c9e4b04 ffff88003c9c7800 [ 613.181297] ffff88003bc4b7a0 ffff88003bd981a0 ffff88003c8db200 ffff88003c2fcc60 [ 613.183782] ffff880039cf7d18 ffffffffa012da97 ffff88003bc4b7a4 ffff88003bc4b7a0 [ 
613.186171] Call Trace: [ 613.187493] [] ? lookup_free_space_inode+0x44/0x100 [btrfs] [ 613.189801] [] btrfs_remove_block_group+0x137/0x740 [btrfs] [ 613.192126] [] btrfs_remove_chunk+0x672/0x780 [btrfs] [ 613.194267] [] btrfs_delete_unused_bgs+0x25f/0x280 [btrfs] [ 613.196567] [] cleaner_kthread+0x12c/0x190 [btrfs] [ 613.198687] [] ? check_leaf+0x350/0x350 [btrfs] [ 613.200758] [] kthread+0xd2/0xf0 [ 613.202616] [] ? kthread_create_on_node+0x180/0x180 [ 613.204738] [] ret_from_fork+0x7c/0xb0 [ 613.206652] [] ? kthread_create_on_node+0x180/0x180 [ 613.208741] Code: ff ff 0f 1f 80 00 00 00 00 89 45 c8 3e 80 63 80 fd 48 89 df e8 d0 23 fe ff 8b 45 c8 e9 14 ff ff ff b8 f4 ff ff ff e9 12 ff ff ff <0f> 0b 66 66 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 55 48 [ 613.216562] RIP [] btrfs_orphan_add+0x1d2/0x1e0 [btrfs] [ 613.218828] RSP [ 613.220382] ---[ end trace 71073106deb8a457 ]--- This patch replaces btrfs_join_transaction() with btrfs_start_transaction() in btrfs_delete_unused_bgs() to prevent the BUG_ON() in btrfs_orphan_add(). Signed-off-by: Forrest Liu Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0f6737063142..28ce5c8004d4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9555,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. */ - trans = btrfs_join_transaction(root); + /* 1 for btrfs_orphan_reserve_metadata() */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_set_block_group_rw(root, block_group); ret = PTR_ERR(trans); -- cgit v1.2.3