Btrfs: rework allocation clustering
Because btrfs is copy-on-write, we end up picking new locations for
blocks very often. This makes it fairly difficult to maintain perfect
read patterns over time, but we can at least do some optimizations
for writes.
This is done today by remembering the last place we allocated and
trying to find a free space hole big enough to hold more than just one
allocation. The end result is that we tend to write sequentially to
the drive.
This happens all the time for metadata and it happens for data
when mounted -o ssd. But, the way we record it is fairly racey
and it tends to fragment the free space over time because we are trying
to allocate fairly large areas at once.
This commit gets rid of the races by adding a free space cluster object
with dedicated locking to make sure that only one process at a time
is out replacing the cluster.
The free space fragmentation is somewhat solved by allowing a cluster
to be comprised of smaller free space extents. This part definitely
adds some CPU time to the cluster allocations, but it allows the allocator
to consume the small holes left behind by cow.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 15d62a9..178df4c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
#include "volumes.h"
#include "locking.h"
#include "ref-cache.h"
+#include "free-space-cache.h"
#define PENDING_EXTENT_INSERT 0
#define PENDING_EXTENT_DELETE 1
@@ -324,7 +325,7 @@
return cache;
}
-static inline void put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
if (atomic_dec_and_test(&cache->count))
kfree(cache);
@@ -397,12 +398,12 @@
div_factor(cache->key.offset, factor)) {
group_start = cache->key.objectid;
spin_unlock(&cache->lock);
- put_block_group(cache);
+ btrfs_put_block_group(cache);
goto found;
}
}
spin_unlock(&cache->lock);
- put_block_group(cache);
+ btrfs_put_block_group(cache);
cond_resched();
}
if (!wrapped) {
@@ -1592,7 +1593,7 @@
if (!block_group || block_group->ro)
readonly = 1;
if (block_group)
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
return readonly;
}
@@ -2016,7 +2017,7 @@
WARN_ON(ret);
}
}
- put_block_group(cache);
+ btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
}
@@ -2033,7 +2034,7 @@
return 0;
bytenr = cache->key.objectid;
- put_block_group(cache);
+ btrfs_put_block_group(cache);
return bytenr;
}
@@ -2077,7 +2078,7 @@
if (cache->cached)
btrfs_add_free_space(cache, bytenr, len);
}
- put_block_group(cache);
+ btrfs_put_block_group(cache);
bytenr += len;
num -= len;
}
@@ -2108,7 +2109,7 @@
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- put_block_group(cache);
+ btrfs_put_block_group(cache);
bytenr += len;
num -= len;
}
@@ -2543,12 +2544,13 @@
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
- u64 *last_ptr = NULL;
+ struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
int empty_cluster = 2 * 1024 * 1024;
int allowed_chunk_alloc = 0;
- int using_hint = 0;
struct btrfs_space_info *space_info;
+ int last_ptr_loop = 0;
+ int loop = 0;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -2561,38 +2563,39 @@
allowed_chunk_alloc = 1;
if (data & BTRFS_BLOCK_GROUP_METADATA) {
- last_ptr = &root->fs_info->last_alloc;
+ last_ptr = &root->fs_info->meta_alloc_cluster;
if (!btrfs_test_opt(root, SSD))
empty_cluster = 64 * 1024;
}
- if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
- last_ptr = &root->fs_info->last_data_alloc;
+ if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+ last_ptr = &root->fs_info->data_alloc_cluster;
+ }
if (last_ptr) {
- if (*last_ptr)
- hint_byte = *last_ptr;
- else
- empty_size += empty_cluster;
- } else {
- empty_cluster = 0;
+ spin_lock(&last_ptr->lock);
+ if (last_ptr->block_group)
+ hint_byte = last_ptr->window_start;
+ spin_unlock(&last_ptr->lock);
}
+
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
+ if (!last_ptr) {
+ empty_cluster = 0;
+ loop = 1;
+ }
+
if (search_start == hint_byte) {
- using_hint = 1;
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
if (block_group && block_group_bits(block_group, data)) {
down_read(&space_info->groups_sem);
goto have_block_group;
} else if (block_group) {
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
}
-
- empty_size += empty_cluster;
- using_hint = 0;
}
search:
@@ -2609,7 +2612,7 @@
ret = cache_block_group(root, block_group);
mutex_unlock(&block_group->cache_mutex);
if (ret) {
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
break;
}
}
@@ -2617,11 +2620,88 @@
if (unlikely(block_group->ro))
goto loop;
+ if (last_ptr) {
+ /*
+ * the refill lock keeps out other
+ * people trying to start a new cluster
+ */
+ spin_lock(&last_ptr->refill_lock);
+ offset = btrfs_alloc_from_cluster(block_group, last_ptr,
+ num_bytes, search_start);
+ if (offset) {
+ /* we have a block, we're done */
+ spin_unlock(&last_ptr->refill_lock);
+ goto checks;
+ }
+
+ spin_lock(&last_ptr->lock);
+ /*
+ * whoops, this cluster doesn't actually point to
+ * this block group. Get a ref on the block
+ * group is does point to and try again
+ */
+ if (!last_ptr_loop && last_ptr->block_group &&
+ last_ptr->block_group != block_group) {
+
+ btrfs_put_block_group(block_group);
+ block_group = last_ptr->block_group;
+ atomic_inc(&block_group->count);
+ spin_unlock(&last_ptr->lock);
+ spin_unlock(&last_ptr->refill_lock);
+
+ last_ptr_loop = 1;
+ search_start = block_group->key.objectid;
+ goto have_block_group;
+ }
+ spin_unlock(&last_ptr->lock);
+
+ /*
+ * this cluster didn't work out, free it and
+ * start over
+ */
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+ last_ptr_loop = 0;
+
+ /* allocate a cluster in this block group */
+ ret = btrfs_find_space_cluster(trans,
+ block_group, last_ptr,
+ offset, num_bytes,
+ empty_cluster + empty_size);
+ if (ret == 0) {
+ /*
+ * now pull our allocation out of this
+ * cluster
+ */
+ offset = btrfs_alloc_from_cluster(block_group,
+ last_ptr, num_bytes,
+ search_start);
+ if (offset) {
+ /* we found one, proceed */
+ spin_unlock(&last_ptr->refill_lock);
+ goto checks;
+ }
+ }
+ /*
+ * at this point we either didn't find a cluster
+ * or we weren't able to allocate a block from our
+ * cluster. Free the cluster we've been trying
+ * to use, and go to the next block group
+ */
+ if (loop < 2) {
+ btrfs_return_cluster_to_free_space(NULL,
+ last_ptr);
+ spin_unlock(&last_ptr->refill_lock);
+ goto loop;
+ }
+ spin_unlock(&last_ptr->refill_lock);
+ }
+
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
if (!offset)
goto loop;
-
+checks:
search_start = stripe_align(root, offset);
/* move on to the next group */
@@ -2637,11 +2717,6 @@
goto loop;
}
- if (using_hint && search_start > hint_byte) {
- btrfs_add_free_space(block_group, offset, num_bytes);
- goto loop;
- }
-
if (exclude_nr > 0 &&
(search_start + num_bytes > exclude_start &&
search_start < exclude_start + exclude_nr)) {
@@ -2670,33 +2745,33 @@
/* we are all good, lets return */
break;
loop:
- put_block_group(block_group);
- if (using_hint) {
- empty_size += empty_cluster;
- using_hint = 0;
- up_read(&space_info->groups_sem);
- goto search;
- }
+ btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
- if (!ins->objectid && (empty_size || allowed_chunk_alloc)) {
- int try_again = empty_size;
-
- empty_size = 0;
+ /* loop == 0, try to find a clustered alloc in every block group
+ * loop == 1, try again after forcing a chunk allocation
+ * loop == 2, set empty_size and empty_cluster to 0 and try again
+ */
+ if (!ins->objectid && loop < 3 &&
+ (empty_size || empty_cluster || allowed_chunk_alloc)) {
+ if (loop >= 2) {
+ empty_size = 0;
+ empty_cluster = 0;
+ }
if (allowed_chunk_alloc) {
ret = do_chunk_alloc(trans, root, num_bytes +
2 * 1024 * 1024, data, 1);
- if (!ret)
- try_again = 1;
allowed_chunk_alloc = 0;
} else {
space_info->force_alloc = 1;
}
- if (try_again)
+ if (loop < 3) {
+ loop++;
goto search;
+ }
ret = -ENOSPC;
} else if (!ins->objectid) {
ret = -ENOSPC;
@@ -2707,9 +2782,7 @@
if (!(data & BTRFS_BLOCK_GROUP_DATA))
trans->block_group = block_group->key.objectid;
- if (last_ptr)
- *last_ptr = ins->objectid + ins->offset;
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
ret = 0;
}
@@ -2817,7 +2890,7 @@
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
- put_block_group(cache);
+ btrfs_put_block_group(cache);
update_reserved_extents(root, start, len, 0);
return ret;
@@ -2955,7 +3028,7 @@
ret = btrfs_remove_free_space(block_group, ins->objectid,
ins->offset);
BUG_ON(ret);
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
ref_generation, owner, ins, 1);
return ret;
@@ -5644,7 +5717,7 @@
WARN_ON(block_group->reserved > 0);
WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
spin_unlock(&block_group->lock);
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
ret = 0;
out:
btrfs_free_path(path);
@@ -5774,6 +5847,7 @@
spin_lock_init(&cache->tree_lock);
mutex_init(&cache->cache_mutex);
INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(cache->item));
@@ -5830,6 +5904,7 @@
spin_lock_init(&cache->tree_lock);
mutex_init(&cache->cache_mutex);
INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
btrfs_set_block_group_used(&cache->item, bytes_used);
btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5889,8 +5964,8 @@
spin_unlock(&block_group->space_info->lock);
block_group->space_info->full = 0;
- put_block_group(block_group);
- put_block_group(block_group);
+ btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)