From 062c05c46bd4358aad7a0e0cb5ffeb98ab935286 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 7 Dec 2011 19:50:42 -0500 Subject: Btrfs: try to allocate from cluster even at LOOP_NO_EMPTY_SIZE If we reach LOOP_NO_EMPTY_SIZE, we won't even try to use a cluster that others might have set up. Odds are that there won't be one, but if someone else succeeded in setting it up, we might as well use it, even if we don't try to set up a cluster again. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813c6bb96c9a..db0b23b14f20 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5285,15 +5285,10 @@ alloc: spin_unlock(&block_group->free_space_ctl->tree_lock); /* - * Ok we want to try and use the cluster allocator, so lets look - * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will - * have tried the cluster allocator plenty of times at this - * point and not have found anything, so we are likely way too - * fragmented for the clustering stuff to find anything, so lets - * just skip it and let the allocator find whatever block it can - * find + * Ok we want to try and use the cluster allocator, so + * lets look there */ - if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + if (last_ptr) { /* * the refill lock keeps out other * people trying to start a new cluster @@ -5342,6 +5337,20 @@ alloc: } spin_unlock(&last_ptr->lock); refill_cluster: + /* If we are on LOOP_NO_EMPTY_SIZE, we can't + * set up a new clusters, so lets just skip it + * and let the allocator find whatever block + * it can find. If we reach this point, we + * will have tried the cluster allocator + * plenty of times and not have found + * anything, so we are likely way too + * fragmented for the clustering stuff to find + * anything. */ + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* * this cluster didn't work out, free it and * start over @@ -5389,6 +5398,7 @@ refill_cluster: goto loop; } +unclustered_alloc: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* -- cgit v1.2.3 From 274bd4fb3ed6b72c1d77ef8850511f09fc6b8e4d Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: try cluster but don't advance in search list When we find an existing cluster, we switch to its block group as the current block group, possibly skipping multiple blocks in the process. Furthermore, under heavy contention, multiple threads may fail to allocate from a cluster and then release just-created clusters just to proceed to create new ones in a different block group. This patch tries to allocate from an existing cluster regardless of its block group, and doesn't switch to that group, instead proceeding to try to allocate a cluster from the group it was iterating before the attempt. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 74 +++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index db0b23b14f20..05e1386b8bec 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5106,11 +5106,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group_cache *used_block_group; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; struct btrfs_space_info *space_info; - int last_ptr_loop = 0; int loop = 0; int index = 0; int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? @@ -5172,6 +5172,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); + used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -5209,6 +5210,7 @@ search: u64 offset; int cached; + used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -5294,49 +5296,33 @@ alloc: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); - if (!last_ptr->block_group || - last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data)) + used_block_group = last_ptr->block_group; + if (used_block_group != block_group && + (!used_block_group || + used_block_group->ro || + !block_group_bits(used_block_group, data))) { + used_block_group = block_group; goto refill_cluster; + } + + if (used_block_group != block_group) + btrfs_get_block_group(used_block_group); - offset = btrfs_alloc_from_cluster(block_group, last_ptr, - num_bytes, search_start); + offset = btrfs_alloc_from_cluster(used_block_group, + last_ptr, num_bytes, used_block_group->key.objectid); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); goto checks; } - spin_lock(&last_ptr->lock); - /* - * whoops, this cluster doesn't actually point to - * this block group. Get a ref on the block - * group is does point to and try again - */ - if (!last_ptr_loop && last_ptr->block_group && - last_ptr->block_group != block_group && - index <= - get_block_group_index(last_ptr->block_group)) { - - btrfs_put_block_group(block_group); - block_group = last_ptr->block_group; - btrfs_get_block_group(block_group); - spin_unlock(&last_ptr->lock); - spin_unlock(&last_ptr->refill_lock); - - last_ptr_loop = 1; - search_start = block_group->key.objectid; - /* - * we know this block group is properly - * in the list because - * btrfs_remove_block_group, drops the - * cluster before it removes the block - * group from the list - */ - goto have_block_group; + WARN_ON(last_ptr->block_group != used_block_group); + if (used_block_group != block_group) { + btrfs_put_block_group(used_block_group); + used_block_group = block_group; } - spin_unlock(&last_ptr->lock); refill_cluster: + BUG_ON(used_block_group != block_group); /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -5357,8 +5343,6 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); - last_ptr_loop = 0; - /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5425,14 +5409,14 @@ checks: search_start = stripe_align(root, offset); /* move on to the next group */ if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } /* move on to the next group */ if (search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + used_block_group->key.objectid + used_block_group->key.offset) { + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5440,14 +5424,14 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, + ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5456,15 +5440,19 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); -- cgit v1.2.3 From a5d16333612718569ffd26064270e535cb9c3928 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: check if the to-be-added device is writable If we call ioctl(BTRFS_IOC_ADD_DEV) directly, we'll succeed in adding a readonly device to a btrfs filesystem, and btrfs will write to that device, emitting kernel errors: [ 3109.833692] lost page write due to I/O error on loop2 [ 3109.833720] lost page write due to I/O error on loop2 ... Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c37433d3cd82..0a8c8f8304b1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1611,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); -- cgit v1.2.3 From 1cf4ffdb3289624a6462c94f2ce05545b32ef736 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: drop spin lock when memory alloc fails Drop spin lock in convert_extent_bit() when memory alloc fails, otherwise, it will be a deadlock. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index be1bf627a14b..49f3c9dc09f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -935,8 +935,10 @@ again: node = tree_search(tree, start); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; BUG_ON(err == -EEXIST); @@ -992,8 +994,10 @@ hit_next: */ if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = split_state(tree, state, prealloc, start); BUG_ON(err == -EEXIST); prealloc = NULL; @@ -1024,8 +1028,10 @@ hit_next: this_end = last_start - 1; prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } /* * Avoid to free 'prealloc' if it can be merged with @@ -1051,8 +1057,10 @@ hit_next: */ if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); -- cgit v1.2.3