fs/btrfs/block-group.c - linux - Git at Google

 // SPDX-License-Identifier: GPL-2.0

 #include "ctree.h"
 #include "block-group.h"
 #include "space-info.h"
 #include "disk-io.h"
 #include "free-space-cache.h"
 #include "free-space-tree.h"
 #include "disk-io.h"
 #include "volumes.h"
 #include "transaction.h"
 #include "ref-verify.h"
 #include "sysfs.h"
 #include "tree-log.h"

 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 {
 	atomic_inc(&cache->count);
 }

 /*
  * Drop one reference on @cache.  When the last reference goes away the
  * block group and its free space control structure are freed.
  */
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
 	/* Somebody else still holds a reference, nothing to tear down. */
 	if (!atomic_dec_and_test(&cache->count))
 		return;

 	WARN_ON(cache->pinned > 0);
 	WARN_ON(cache->reserved > 0);

 	/*
 	 * A non-empty tree means a full_stripe_lock mutex is still held by
 	 * some caller, and only that caller can release it.  Releasing it
 	 * after we free the block group is a use-after-free, but there is
 	 * nothing we can do about it here other than warn.
 	 */
 	WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));

 	kfree(cache->free_space_ctl);
 	kfree(cache);
 }

 /*
  * This adds the block group to the fs_info rb tree for the block group cache
  */
 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 				struct btrfs_block_group_cache *block_group)
 {
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct btrfs_block_group_cache *cache;

 	spin_lock(&info->block_group_cache_lock);
 	p = &info->block_group_cache_tree.rb_node;

 	while (*p) {
 		parent = *p;
 		cache = rb_entry(parent, struct btrfs_block_group_cache,
 				 cache_node);
 		if (block_group->key.objectid < cache->key.objectid) {
 			p = &(*p)->rb_left;
 		} else if (block_group->key.objectid > cache->key.objectid) {
 			p = &(*p)->rb_right;
 		} else {
 			spin_unlock(&info->block_group_cache_lock);
 			return -EEXIST;
 		}
 	}

 	rb_link_node(&block_group->cache_node, parent, p);
 	rb_insert_color(&block_group->cache_node,
 			&info->block_group_cache_tree);

 	if (info->first_logical_byte > block_group->key.objectid)
 		info->first_logical_byte = block_group->key.objectid;

 	spin_unlock(&info->block_group_cache_lock);

 	return 0;
 }

 /*
  * Look up a block group in the fs_info rb tree.
  *
  * With @contains == 0 the result is the block group that starts exactly at
  * @bytenr or, failing that, the one with the lowest start above @bytenr.
  * With @contains != 0 the result is the block group whose range
  * [start, start + length - 1] covers @bytenr.
  *
  * A reference is taken on the returned block group.  Returns NULL when no
  * block group matches.
  */
 static struct btrfs_block_group_cache *block_group_cache_tree_search(
 		struct btrfs_fs_info *info, u64 bytenr, int contains)
 {
 	struct btrfs_block_group_cache *found = NULL;
 	struct rb_node *node;

 	spin_lock(&info->block_group_cache_lock);
 	node = info->block_group_cache_tree.rb_node;

 	while (node) {
 		struct btrfs_block_group_cache *entry;
 		u64 first, last;

 		entry = rb_entry(node, struct btrfs_block_group_cache,
 				 cache_node);
 		first = entry->key.objectid;
 		last = first + entry->key.offset - 1;

 		/* Exact start match, or @bytenr lies inside this group. */
 		if (bytenr == first ||
 		    (bytenr > first && contains && bytenr <= last)) {
 			found = entry;
 			break;
 		}

 		if (bytenr > first) {
 			node = node->rb_right;
 			continue;
 		}

 		/* Remember the closest group starting after @bytenr. */
 		if (!contains && (!found || first < found->key.objectid))
 			found = entry;
 		node = node->rb_left;
 	}

 	if (found) {
 		btrfs_get_block_group(found);
 		if (bytenr == 0 &&
 		    found->key.objectid < info->first_logical_byte)
 			info->first_logical_byte = found->key.objectid;
 	}
 	spin_unlock(&info->block_group_cache_lock);

 	return found;
 }

 /*
  * Find the first block group whose start is at or after @bytenr.  The
  * returned block group (if any) carries a reference for the caller.
  */
 struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
 		struct btrfs_fs_info *info, u64 bytenr)
 {
 	const int contains = 0;

 	return block_group_cache_tree_search(info, bytenr, contains);
 }

 /*
  * Find the block group whose range covers @bytenr.  The returned block
  * group (if any) carries a reference for the caller.
  */
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 		struct btrfs_fs_info *info, u64 bytenr)
 {
 	const int contains = 1;

 	return block_group_cache_tree_search(info, bytenr, contains);
 }

 /*
  * Step from @cache to the block group that follows it in the rb tree.
  *
  * The caller's reference on @cache is dropped, and the returned block group
  * (NULL if @cache was the last one) carries a new reference.
  */
 struct btrfs_block_group_cache *btrfs_next_block_group(
 		struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_block_group_cache *next = NULL;
 	struct rb_node *node;

 	spin_lock(&fs_info->block_group_cache_lock);

 	/*
 	 * @cache is no longer linked in the tree, so rb_next() cannot be
 	 * used.  Fall back to a search starting right after its range.
 	 */
 	if (RB_EMPTY_NODE(&cache->cache_node)) {
 		const u64 next_bytenr = cache->key.objectid + cache->key.offset;

 		spin_unlock(&fs_info->block_group_cache_lock);
 		btrfs_put_block_group(cache);
 		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
 	}

 	node = rb_next(&cache->cache_node);
 	btrfs_put_block_group(cache);
 	if (node) {
 		next = rb_entry(node, struct btrfs_block_group_cache,
 				cache_node);
 		btrfs_get_block_group(next);
 	}
 	spin_unlock(&fs_info->block_group_cache_lock);

 	return next;
 }

 /*
  * Register a nocow writer on the block group containing @bytenr.
  *
  * Returns false if no block group contains @bytenr or if it is read-only.
  * On success the block group reference taken by the lookup is kept and is
  * released later by btrfs_dec_nocow_writers().
  */
 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
 	struct btrfs_block_group_cache *bg;
 	bool writable;

 	bg = btrfs_lookup_block_group(fs_info, bytenr);
 	if (!bg)
 		return false;

 	spin_lock(&bg->lock);
 	writable = !bg->ro;
 	if (writable)
 		atomic_inc(&bg->nocow_writers);
 	spin_unlock(&bg->lock);

 	/* Only the failure path drops the lookup reference here. */
 	if (!writable)
 		btrfs_put_block_group(bg);

 	return writable;
 }

 /*
  * Unregister a nocow writer from the block group containing @bytenr and
  * wake up waiters once the writer count reaches zero.
  */
 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
 	struct btrfs_block_group_cache *bg;
 	atomic_t *writers;

 	bg = btrfs_lookup_block_group(fs_info, bytenr);
 	ASSERT(bg);

 	writers = &bg->nocow_writers;
 	if (atomic_dec_and_test(writers))
 		wake_up_var(writers);

 	/* Drop the reference taken by the lookup above ... */
 	btrfs_put_block_group(bg);
 	/* ... and the one kept by the earlier btrfs_inc_nocow_writers(). */
 	btrfs_put_block_group(bg);
 }

 /*
  * Sleep until the nocow writer count of @bg has dropped to zero.
  */
 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
 {
 	wait_var_event(&bg->nocow_writers,
 		       atomic_read(&bg->nocow_writers) == 0);
 }

 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 					const u64 start)
 {
 	struct btrfs_block_group_cache *bg;

 	bg = btrfs_lookup_block_group(fs_info, start);
 	ASSERT(bg);
 	if (atomic_dec_and_test(&bg->reservations))
 		wake_up_var(&bg->reservations);
 	btrfs_put_block_group(bg);
 }

 /*
  * Wait until no task holds a reservation on the read-only data block group
  * @bg.  Block groups without the DATA flag return immediately.
  */
 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 {
 	struct rw_semaphore *groups_sem = &bg->space_info->groups_sem;

 	ASSERT(bg->ro);

 	if ((bg->flags & BTRFS_BLOCK_GROUP_DATA) == 0)
 		return;

 	/*
 	 * A task may have allocated an extent from this block group just
 	 * before it was turned read-only, without having created the matching
 	 * ordered extent (and added it to a root's ordered extent list) yet.
 	 * The reservations counter is incremented while a read lock on the
 	 * groups semaphore is held, and decremented after that read lock is
 	 * released and the ordered extent is created.  Cycling the semaphore
 	 * in write mode therefore waits for any allocation in progress, after
 	 * which waiting for the counter to hit zero is sufficient.
 	 */
 	down_write(groups_sem);
 	up_write(groups_sem);

 	wait_var_event(&bg->reservations,
 		       atomic_read(&bg->reservations) == 0);
 }

 /*
  * Return the caching control currently attached to @cache with an extra
  * reference taken, or NULL if the block group has none.
  */
 struct btrfs_caching_control *btrfs_get_caching_control(
 		struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_caching_control *ctl;

 	spin_lock(&cache->lock);
 	ctl = cache->caching_ctl;
 	if (ctl)
 		refcount_inc(&ctl->count);
 	spin_unlock(&cache->lock);

 	return ctl;
 }

 /*
  * Drop a reference on @ctl and free it when that was the last one.
  */
 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
 {
 	if (!refcount_dec_and_test(&ctl->count))
 		return;

 	kfree(ctl);
 }

 /*
  * Wait for the caching of @cache to make progress.
  *
  * This is used after an allocation attempt has already failed at least
  * once, so the caller needs to sleep until caching has progressed before
  * retrying.  The wait ends when caching is done or when the block group
  * reports at least @num_bytes of free space in total.  Searching the rbtree
  * for a single free extent of the requested size would be more precise,
  * but this is a good start.
  *
  * Callers must check for cache->cached == BTRFS_CACHE_ERROR before relying
  * on any information in this block group.
  */
 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 					   u64 num_bytes)
 {
 	struct btrfs_caching_control *ctl = btrfs_get_caching_control(cache);

 	/* No caching control attached, so there is nothing to wait on. */
 	if (!ctl)
 		return;

 	wait_event(ctl->wait,
 		   btrfs_block_group_cache_done(cache) ||
 		   cache->free_space_ctl->free_space >= num_bytes);

 	btrfs_put_caching_control(ctl);
 }

 /*
  * Wait until caching of @cache has finished.
  *
  * Returns 0 on success or -EIO if the block group ended up in the
  * BTRFS_CACHE_ERROR state.
  */
 int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_caching_control *ctl = btrfs_get_caching_control(cache);
 	int ret;

 	/* Without a caching control only the current state is reported. */
 	if (ctl)
 		wait_event(ctl->wait, btrfs_block_group_cache_done(cache));

 	ret = (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;

 	if (ctl)
 		btrfs_put_caching_control(ctl);

 	return ret;
 }

 #ifdef CONFIG_BTRFS_DEBUG
 /*
  * Debug helper that fragments the free space of @block_group by removing
  * one chunk out of every two from the free space cache.  A chunk is one
  * node for metadata block groups and one sector otherwise.
  */
 void btrfs_fragment_free_space(struct btrfs_block_group_cache *block_group)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	u64 offset = block_group->key.objectid;
 	u64 remaining = block_group->key.offset;
 	u64 chunk;
 	u64 stride;

 	if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA)
 		chunk = fs_info->nodesize;
 	else
 		chunk = fs_info->sectorsize;
 	stride = chunk * 2;

 	while (remaining > chunk) {
 		btrfs_remove_free_space(block_group, offset, chunk);
 		offset += stride;
 		remaining = (remaining < stride) ? 0 : remaining - stride;
 	}
 }
 #endif

 /*
  * Add the range [@start, @end) to the free space of @block_group, skipping
  * over any sub-ranges recorded in the pinned extents tree.
  *
  * Extents freed in the running transaction are pinned and cannot be used
  * yet, since their space is only released when the transaction commits.
  * Those must therefore not be handed out as free space while caching.
  *
  * Returns the number of bytes added as free space.
  */
 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 		       u64 start, u64 end)
 {
 	struct btrfs_fs_info *info = block_group->fs_info;
 	u64 pinned_start, pinned_end;
 	u64 added = 0;
 	u64 len;
 	int ret;

 	while (start < end) {
 		ret = find_first_extent_bit(info->pinned_extents, start,
 					    &pinned_start, &pinned_end,
 					    EXTENT_DIRTY | EXTENT_UPTODATE,
 					    NULL);
 		if (ret)
 			break;

 		/* We are inside a pinned range, just skip past it. */
 		if (pinned_start <= start) {
 			start = pinned_end + 1;
 			continue;
 		}

 		/* The next pinned range is beyond the one we care about. */
 		if (pinned_start >= end)
 			break;

 		/* Free space up to the start of the pinned range. */
 		len = pinned_start - start;
 		added += len;
 		ret = btrfs_add_free_space(block_group, start, len);
 		BUG_ON(ret); /* -ENOMEM or logic error */
 		start = pinned_end + 1;
 	}

 	/* Whatever is left after the last pinned range is free as well. */
 	if (start < end) {
 		len = end - start;
 		added += len;
 		ret = btrfs_add_free_space(block_group, start, len);
 		BUG_ON(ret); /* -ENOMEM or logic error */
 	}

 	return added;
 }

 /*
  * Build the free space information of a block group by walking the commit
  * root of the extent tree.
  *
  * Every gap between two consecutive EXTENT_ITEM / METADATA_ITEM keys that
  * fall inside the block group is passed to add_new_free_space().  Waiters
  * on caching_ctl->wait are woken up each time more than CACHING_CTL_WAKE_UP
  * bytes have been found, unless wakeups are suppressed because the free
  * space is going to be fragmented (CONFIG_BTRFS_DEBUG only).
  *
  * The function temporarily drops and re-takes fs_info->commit_root_sem (read)
  * and caching_ctl->mutex when it needs to reschedule, so the caller must
  * hold both on entry.
  *
  * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the
  * negative error returned by btrfs_search_slot() / btrfs_next_leaf().
  */
 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 {
 	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	u64 total_found = 0;
 	u64 last = 0;
 	u32 nritems;
 	int ret;
 	bool wakeup = true;

 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;

 	/* Never start scanning below BTRFS_SUPER_INFO_OFFSET. */
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

 #ifdef CONFIG_BTRFS_DEBUG
 	/*
 	 * If we're fragmenting we don't want to make anybody think we can
 	 * allocate from this block group until we've had a chance to fragment
 	 * the free space.
 	 */
 	if (btrfs_should_fragment_free_space(block_group))
 		wakeup = false;
 #endif
 	/*
 	 * We don't want to deadlock with somebody trying to allocate a new
 	 * extent for the extent root while also trying to search the extent
 	 * root to add free space.  So we skip locking and search the commit
 	 * root, since its read-only
 	 */
 	path->skip_locking = 1;
 	path->search_commit_root = 1;
 	path->reada = READA_FORWARD;

 	key.objectid = last;
 	key.offset = 0;
 	key.type = BTRFS_EXTENT_ITEM_KEY;

 	/* (Re)start the search at @key; also the target after dropping locks. */
 next:
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;

 	leaf = path->nodes[0];
 	nritems = btrfs_header_nritems(leaf);

 	while (1) {
 		/*
 		 * The filesystem is closing: stop scanning.  Setting last to
 		 * (u64)-1 makes the range passed to the final
 		 * add_new_free_space() call below empty.
 		 */
 		if (btrfs_fs_closing(fs_info) > 1) {
 			last = (u64)-1;
 			break;
 		}

 		if (path->slots[0] < nritems) {
 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 		} else {
 			/* Ran off the end of this leaf, find the next key. */
 			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
 			if (ret)
 				break;

 			/*
 			 * Give up the locks and the CPU if someone is waiting,
 			 * then restart the search from the key found above.
 			 */
 			if (need_resched() ||
 			    rwsem_is_contended(&fs_info->commit_root_sem)) {
 				if (wakeup)
 					caching_ctl->progress = last;
 				btrfs_release_path(path);
 				up_read(&fs_info->commit_root_sem);
 				mutex_unlock(&caching_ctl->mutex);
 				cond_resched();
 				mutex_lock(&caching_ctl->mutex);
 				down_read(&fs_info->commit_root_sem);
 				goto next;
 			}

 			ret = btrfs_next_leaf(extent_root, path);
 			if (ret < 0)
 				goto out;
 			if (ret)
 				break;
 			leaf = path->nodes[0];
 			nritems = btrfs_header_nritems(leaf);
 			continue;
 		}

 		/*
 		 * The current key is behind the position already processed,
 		 * so jump forward by searching again starting at @last.
 		 */
 		if (key.objectid < last) {
 			key.objectid = last;
 			key.offset = 0;
 			key.type = BTRFS_EXTENT_ITEM_KEY;

 			if (wakeup)
 				caching_ctl->progress = last;
 			btrfs_release_path(path);
 			goto next;
 		}

 		/* Item is before the start of this block group, skip it. */
 		if (key.objectid < block_group->key.objectid) {
 			path->slots[0]++;
 			continue;
 		}

 		/* Item is past the end of this block group, we are done. */
 		if (key.objectid >= block_group->key.objectid +
 		    block_group->key.offset)
 			break;

 		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		    key.type == BTRFS_METADATA_ITEM_KEY) {
 			/* The gap between @last and this extent is free. */
 			total_found += add_new_free_space(block_group, last,
 							  key.objectid);
 			/*
 			 * Advance past the extent: by nodesize for metadata
 			 * items, by key.offset for extent items.
 			 */
 			if (key.type == BTRFS_METADATA_ITEM_KEY)
 				last = key.objectid +
 					fs_info->nodesize;
 			else
 				last = key.objectid + key.offset;

 			if (total_found > CACHING_CTL_WAKE_UP) {
 				total_found = 0;
 				if (wakeup)
 					wake_up(&caching_ctl->wait);
 			}
 		}
 		path->slots[0]++;
 	}
 	ret = 0;

 	/* Account the space between the last extent and the group's end. */
 	total_found += add_new_free_space(block_group, last,
 					  block_group->key.objectid +
 					  block_group->key.offset);
 	caching_ctl->progress = (u64)-1;

 out:
 	btrfs_free_path(path);
 	return ret;
 }

 /*
  * Work function that loads the free space information of a block group.
  *
  * @work is embedded in a struct btrfs_caching_control.  The free space is
  * loaded from the free space tree when the FREE_SPACE_TREE compat_ro bit is
  * set, otherwise by scanning the extent tree.  On completion the block group
  * is detached from its caching control, marked BTRFS_CACHE_FINISHED or
  * BTRFS_CACHE_ERROR, and waiters on caching_ctl->wait are woken up.
  */
 static noinline void caching_thread(struct btrfs_work *work)
 {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_caching_control *caching_ctl;
 	int ret;

 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
 	block_group = caching_ctl->block_group;
 	fs_info = block_group->fs_info;

 	/* Both are held while loading; load_extent_tree_free() may cycle them. */
 	mutex_lock(&caching_ctl->mutex);
 	down_read(&fs_info->commit_root_sem);

 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 		ret = load_free_space_tree(caching_ctl);
 	else
 		ret = load_extent_tree_free(caching_ctl);

 	/* Publish the final caching state under the block group lock. */
 	spin_lock(&block_group->lock);
 	block_group->caching_ctl = NULL;
 	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 	spin_unlock(&block_group->lock);

 #ifdef CONFIG_BTRFS_DEBUG
 	if (btrfs_should_fragment_free_space(block_group)) {
 		u64 bytes_used;

 		/*
 		 * Account half of the space not used by the block group item
 		 * as used in the space info, then fragment the free space.
 		 */
 		spin_lock(&block_group->space_info->lock);
 		spin_lock(&block_group->lock);
 		bytes_used = block_group->key.offset -
 			btrfs_block_group_used(&block_group->item);
 		block_group->space_info->bytes_used += bytes_used >> 1;
 		spin_unlock(&block_group->lock);
 		spin_unlock(&block_group->space_info->lock);
 		btrfs_fragment_free_space(block_group);
 	}
 #endif

 	caching_ctl->progress = (u64)-1;

 	up_read(&fs_info->commit_root_sem);
 	btrfs_free_excluded_extents(block_group);
 	mutex_unlock(&caching_ctl->mutex);

 	wake_up(&caching_ctl->wait);

 	btrfs_put_caching_control(caching_ctl);
 	/* Pairs with the btrfs_get_block_group() done before queueing us. */
 	btrfs_put_block_group(block_group);
 }

 /*
  * Start loading the free space information of a block group.
  *
  * @cache:           the block group to cache
  * @load_cache_only: when non-zero, only attempt the synchronous load from the
  *                   free space cache (if the SPACE_CACHE mount option is set)
  *                   and never queue the asynchronous caching work
  *
  * If the block group is in the BTRFS_CACHE_FAST state (another task is doing
  * the fast load), this waits for that to finish first.  If caching has
  * already been started or completed, nothing is done.
  *
  * Returns -ENOMEM if the caching control cannot be allocated.  Returns 0 when
  * there was nothing to do, when the free space cache was loaded, or when
  * @load_cache_only is set.  Otherwise the asynchronous work is queued and the
  * value of the local ret is returned, which is either 0 or the value returned
  * by load_free_space_cache().
  */
 int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
 			    int load_cache_only)
 {
 	DEFINE_WAIT(wait);
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_caching_control *caching_ctl;
 	int ret = 0;

 	/* Allocated up front, before cache->lock (a spinlock) is taken. */
 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 	if (!caching_ctl)
 		return -ENOMEM;

 	INIT_LIST_HEAD(&caching_ctl->list);
 	mutex_init(&caching_ctl->mutex);
 	init_waitqueue_head(&caching_ctl->wait);
 	caching_ctl->block_group = cache;
 	caching_ctl->progress = cache->key.objectid;
 	refcount_set(&caching_ctl->count, 1);
 	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 			caching_thread, NULL, NULL);

 	spin_lock(&cache->lock);
 	/*
 	 * This should be a rare occasion, but this could happen I think in the
 	 * case where one thread starts to load the space cache info, and then
 	 * some other thread starts a transaction commit which tries to do an
 	 * allocation while the other thread is still loading the space cache
 	 * info.  The previous loop should have kept us from choosing this block
 	 * group, but if we've moved to the state where we will wait on caching
 	 * block groups we need to first check if we're doing a fast load here,
 	 * so we can wait for it to finish, otherwise we could end up allocating
 	 * from a block group who's cache gets evicted for one reason or
 	 * another.
 	 */
 	while (cache->cached == BTRFS_CACHE_FAST) {
 		struct btrfs_caching_control *ctl;

 		/* Pin the other task's control so it stays valid while we sleep. */
 		ctl = cache->caching_ctl;
 		refcount_inc(&ctl->count);
 		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 		spin_unlock(&cache->lock);

 		schedule();

 		finish_wait(&ctl->wait, &wait);
 		btrfs_put_caching_control(ctl);
 		spin_lock(&cache->lock);
 	}

 	/* Caching already started or finished: our control is not needed. */
 	if (cache->cached != BTRFS_CACHE_NO) {
 		spin_unlock(&cache->lock);
 		kfree(caching_ctl);
 		return 0;
 	}
 	WARN_ON(cache->caching_ctl);
 	cache->caching_ctl = caching_ctl;
 	cache->cached = BTRFS_CACHE_FAST;
 	spin_unlock(&cache->lock);

 	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 		mutex_lock(&caching_ctl->mutex);
 		ret = load_free_space_cache(cache);

 		spin_lock(&cache->lock);
 		if (ret == 1) {
 			/* The free space cache was loaded, caching is complete. */
 			cache->caching_ctl = NULL;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			cache->last_byte_to_unpin = (u64)-1;
 			caching_ctl->progress = (u64)-1;
 		} else {
 			if (load_cache_only) {
 				cache->caching_ctl = NULL;
 				cache->cached = BTRFS_CACHE_NO;
 			} else {
 				cache->cached = BTRFS_CACHE_STARTED;
 				cache->has_caching_ctl = 1;
 			}
 		}
 		spin_unlock(&cache->lock);
 #ifdef CONFIG_BTRFS_DEBUG
 		if (ret == 1 &&
 		    btrfs_should_fragment_free_space(cache)) {
 			u64 bytes_used;

 			/*
 			 * Account half of the space not used by the block
 			 * group item as used, then fragment the free space.
 			 */
 			spin_lock(&cache->space_info->lock);
 			spin_lock(&cache->lock);
 			bytes_used = cache->key.offset -
 				btrfs_block_group_used(&cache->item);
 			cache->space_info->bytes_used += bytes_used >> 1;
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 			btrfs_fragment_free_space(cache);
 		}
 #endif
 		mutex_unlock(&caching_ctl->mutex);

 		/* Wake up tasks sleeping in the BTRFS_CACHE_FAST loop above. */
 		wake_up(&caching_ctl->wait);
 		if (ret == 1) {
 			btrfs_put_caching_control(caching_ctl);
 			btrfs_free_excluded_extents(cache);
 			return 0;
 		}
 	} else {
 		/*
 		 * We're either using the free space tree or no caching at all.
 		 * Set cached to the appropriate value and wakeup any waiters.
 		 */
 		spin_lock(&cache->lock);
 		if (load_cache_only) {
 			cache->caching_ctl = NULL;
 			cache->cached = BTRFS_CACHE_NO;
 		} else {
 			cache->cached = BTRFS_CACHE_STARTED;
 			cache->has_caching_ctl = 1;
 		}
 		spin_unlock(&cache->lock);
 		wake_up(&caching_ctl->wait);
 	}

 	if (load_cache_only) {
 		btrfs_put_caching_control(caching_ctl);
 		return 0;
 	}

 	/* Extra reference for the fs_info->caching_block_groups list. */
 	down_write(&fs_info->commit_root_sem);
 	refcount_inc(&caching_ctl->count);
 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 	up_write(&fs_info->commit_root_sem);

 	/* Reference dropped by caching_thread() when it completes. */
 	btrfs_get_block_group(cache);

 	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

 	return ret;
 }

 /*
  * Clear the extended profile bits derived from @flags from the available
  * allocation bits of every block group type (data, metadata, system) that
  * is set in @flags.
  */
 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	const u64 keep_mask = ~(chunk_to_extended(flags) &
 				BTRFS_EXTENDED_PROFILE_MASK);

 	write_seqlock(&fs_info->profiles_lock);
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		fs_info->avail_data_alloc_bits &= keep_mask;
 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 		fs_info->avail_metadata_alloc_bits &= keep_mask;
 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 		fs_info->avail_system_alloc_bits &= keep_mask;
 	write_sequnlock(&fs_info->profiles_lock);
 }

 /*
  * Clear incompat feature bits that are no longer needed after removing a
  * block group with the given @flags:
  *
  * - RAID56 - cleared when no space info has any RAID5 or RAID6 block group
  *            left in the whole filesystem
  */
 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	struct btrfs_space_info *sinfo;

 	/* Only the removal of a RAID5/6 block group can change anything. */
 	if (!(flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
 		return;

 	list_for_each_entry_rcu(sinfo, &fs_info->space_info, list) {
 		bool has_raid56;

 		down_read(&sinfo->groups_sem);
 		has_raid56 =
 			!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]) ||
 			!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]);
 		up_read(&sinfo->groups_sem);

 		/* Still in use somewhere, the bit has to stay. */
 		if (has_raid56)
 			return;
 	}

 	btrfs_clear_fs_incompat(fs_info, RAID56);
 }

 /*
  * Remove a read-only block group from the filesystem.
  *
  * @trans:       transaction handle the removal is done in
  * @group_start: logical start address of the block group to remove
  * @em:          extent map of the chunk backing the block group; it is
  *               removed from fs_info->mapping_tree unless a trim operation
  *               is currently running on the block group
  *
  * The block group must exist and must already be read-only (both are
  * enforced with BUG_ON).  The function deletes the free space inode and its
  * item in the tree root, unlinks the block group from the rb tree, the space
  * info lists and the caching lists, adjusts the space info counters, and
  * deletes the free space tree entries and the block group item.
  *
  * Reference handling: the reference held by the rb tree is dropped as soon
  * as the block group is erased from the tree, and the reference taken by the
  * lookup at the top is dropped at the common exit, so no reference is leaked
  * on the error paths.
  *
  * Returns 0 on success or a negative errno on failure.
  */
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     u64 group_start, struct extent_map *em)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_path *path;
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_free_cluster *cluster;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_key key;
 	struct inode *inode;
 	struct kobject *kobj = NULL;
 	int ret;
 	int index;
 	int factor;
 	struct btrfs_caching_control *caching_ctl = NULL;
 	bool remove_em;
 	bool remove_rsv = false;

 	block_group = btrfs_lookup_block_group(fs_info, group_start);
 	BUG_ON(!block_group);
 	BUG_ON(!block_group->ro);

 	trace_btrfs_remove_block_group(block_group);
 	/*
 	 * Free the reserved super bytes from this block group before
 	 * remove it.
 	 */
 	btrfs_free_excluded_extents(block_group);
 	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
 				  block_group->key.offset);

 	memcpy(&key, &block_group->key, sizeof(key));
 	index = btrfs_bg_flags_to_raid_index(block_group->flags);
 	factor = btrfs_bg_type_to_factor(block_group->flags);

 	/* make sure this block group isn't part of an allocation cluster */
 	cluster = &fs_info->data_alloc_cluster;
 	spin_lock(&cluster->refill_lock);
 	btrfs_return_cluster_to_free_space(block_group, cluster);
 	spin_unlock(&cluster->refill_lock);

 	/*
 	 * make sure this block group isn't part of a metadata
 	 * allocation cluster
 	 */
 	cluster = &fs_info->meta_alloc_cluster;
 	spin_lock(&cluster->refill_lock);
 	btrfs_return_cluster_to_free_space(block_group, cluster);
 	spin_unlock(&cluster->refill_lock);

 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
 		goto out;
 	}

 	/*
 	 * get the inode first so any iput calls done for the io_list
 	 * aren't the final iput (no unlinks allowed now)
 	 */
 	inode = lookup_free_space_inode(block_group, path);

 	mutex_lock(&trans->transaction->cache_write_mutex);
 	/*
 	 * Make sure our free space cache IO is done before removing the
 	 * free space inode
 	 */
 	spin_lock(&trans->transaction->dirty_bgs_lock);
 	if (!list_empty(&block_group->io_list)) {
 		list_del_init(&block_group->io_list);

 		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

 		spin_unlock(&trans->transaction->dirty_bgs_lock);
 		btrfs_wait_cache_io(trans, block_group, path);
 		btrfs_put_block_group(block_group);
 		spin_lock(&trans->transaction->dirty_bgs_lock);
 	}

 	if (!list_empty(&block_group->dirty_list)) {
 		list_del_init(&block_group->dirty_list);
 		remove_rsv = true;
 		btrfs_put_block_group(block_group);
 	}
 	spin_unlock(&trans->transaction->dirty_bgs_lock);
 	mutex_unlock(&trans->transaction->cache_write_mutex);

 	if (!IS_ERR(inode)) {
 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 		if (ret) {
 			btrfs_add_delayed_iput(inode);
 			goto out;
 		}
 		clear_nlink(inode);
 		/* One for the block groups ref */
 		spin_lock(&block_group->lock);
 		if (block_group->iref) {
 			block_group->iref = 0;
 			block_group->inode = NULL;
 			spin_unlock(&block_group->lock);
 			iput(inode);
 		} else {
 			spin_unlock(&block_group->lock);
 		}
 		/* One for our lookup ref */
 		btrfs_add_delayed_iput(inode);
 	}

 	/* Delete the free space cache item from the tree of tree roots. */
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 	key.offset = block_group->key.objectid;
 	key.type = 0;

 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
 	if (ret < 0)
 		goto out;
 	if (ret > 0)
 		btrfs_release_path(path);
 	if (ret == 0) {
 		ret = btrfs_del_item(trans, tree_root, path);
 		if (ret)
 			goto out;
 		btrfs_release_path(path);
 	}

 	spin_lock(&fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &fs_info->block_group_cache_tree);
 	RB_CLEAR_NODE(&block_group->cache_node);

 	/*
 	 * Once for the block groups rbtree.  We still hold the reference from
 	 * our lookup, so this can not be the final put.
 	 */
 	btrfs_put_block_group(block_group);

 	if (fs_info->first_logical_byte == block_group->key.objectid)
 		fs_info->first_logical_byte = (u64)-1;
 	spin_unlock(&fs_info->block_group_cache_lock);

 	down_write(&block_group->space_info->groups_sem);
 	/*
 	 * we must use list_del_init so people can check to see if they
 	 * are still on the list after taking the semaphore
 	 */
 	list_del_init(&block_group->list);
 	if (list_empty(&block_group->space_info->block_groups[index])) {
 		kobj = block_group->space_info->block_group_kobjs[index];
 		block_group->space_info->block_group_kobjs[index] = NULL;
 		clear_avail_alloc_bits(fs_info, block_group->flags);
 	}
 	up_write(&block_group->space_info->groups_sem);
 	clear_incompat_bg_bits(fs_info, block_group->flags);
 	if (kobj) {
 		kobject_del(kobj);
 		kobject_put(kobj);
 	}

 	if (block_group->has_caching_ctl)
 		caching_ctl = btrfs_get_caching_control(block_group);
 	if (block_group->cached == BTRFS_CACHE_STARTED)
 		btrfs_wait_block_group_cache_done(block_group);
 	if (block_group->has_caching_ctl) {
 		down_write(&fs_info->commit_root_sem);
 		if (!caching_ctl) {
 			struct btrfs_caching_control *ctl;

 			/*
 			 * The block group no longer points to its caching
 			 * control, so look it up in the global list instead.
 			 */
 			list_for_each_entry(ctl,
 				    &fs_info->caching_block_groups, list)
 				if (ctl->block_group == block_group) {
 					caching_ctl = ctl;
 					refcount_inc(&caching_ctl->count);
 					break;
 				}
 		}
 		if (caching_ctl)
 			list_del_init(&caching_ctl->list);
 		up_write(&fs_info->commit_root_sem);
 		if (caching_ctl) {
 			/* Once for the caching bgs list and once for us. */
 			btrfs_put_caching_control(caching_ctl);
 			btrfs_put_caching_control(caching_ctl);
 		}
 	}

 	spin_lock(&trans->transaction->dirty_bgs_lock);
 	WARN_ON(!list_empty(&block_group->dirty_list));
 	WARN_ON(!list_empty(&block_group->io_list));
 	spin_unlock(&trans->transaction->dirty_bgs_lock);

 	btrfs_remove_free_space_cache(block_group);

 	spin_lock(&block_group->space_info->lock);
 	list_del_init(&block_group->ro_list);

 	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 		WARN_ON(block_group->space_info->total_bytes
 			< block_group->key.offset);
 		WARN_ON(block_group->space_info->bytes_readonly
 			< block_group->key.offset);
 		WARN_ON(block_group->space_info->disk_total
 			< block_group->key.offset * factor);
 	}
 	block_group->space_info->total_bytes -= block_group->key.offset;
 	block_group->space_info->bytes_readonly -= block_group->key.offset;
 	block_group->space_info->disk_total -= block_group->key.offset * factor;

 	spin_unlock(&block_group->space_info->lock);

 	/* Restore the block group item key, it was reused above. */
 	memcpy(&key, &block_group->key, sizeof(key));

 	mutex_lock(&fs_info->chunk_mutex);
 	spin_lock(&block_group->lock);
 	block_group->removed = 1;
 	/*
 	 * At this point trimming can't start on this block group, because we
 	 * removed the block group from the tree fs_info->block_group_cache_tree
 	 * so no one can't find it anymore and even if someone already got this
 	 * block group before we removed it from the rbtree, they have already
 	 * incremented block_group->trimming - if they didn't, they won't find
 	 * any free space entries because we already removed them all when we
 	 * called btrfs_remove_free_space_cache().
 	 *
 	 * And we must not remove the extent map from the fs_info->mapping_tree
 	 * to prevent the same logical address range and physical device space
 	 * ranges from being reused for a new block group. This is because our
 	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
 	 * completely transactionless, so while it is trimming a range the
 	 * currently running transaction might finish and a new one start,
 	 * allowing for new block groups to be created that can reuse the same
 	 * physical device locations unless we take this special care.
 	 *
 	 * There may also be an implicit trim operation if the file system
 	 * is mounted with -odiscard. The same protections must remain
 	 * in place until the extents have been discarded completely when
 	 * the transaction commit has completed.
 	 */
 	remove_em = (atomic_read(&block_group->trimming) == 0);
 	spin_unlock(&block_group->lock);

 	mutex_unlock(&fs_info->chunk_mutex);

 	ret = remove_block_group_free_space(trans, block_group);
 	if (ret)
 		goto out;

 	/* Finally delete the block group item from the extent tree. */
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0)
 		ret = -EIO;
 	if (ret < 0)
 		goto out;

 	ret = btrfs_del_item(trans, root, path);
 	if (ret)
 		goto out;

 	if (remove_em) {
 		struct extent_map_tree *em_tree;

 		em_tree = &fs_info->mapping_tree;
 		write_lock(&em_tree->lock);
 		remove_extent_mapping(em_tree, em);
 		write_unlock(&em_tree->lock);
 		/* once for the tree */
 		free_extent_map(em);
 	}
 out:
 	/* Once for the lookup reference, on success and on every error path. */
 	btrfs_put_block_group(block_group);
 	if (remove_rsv)
 		btrfs_delayed_refs_rsv_release(fs_info, 1);
 	btrfs_free_path(path);
 	return ret;
 }

 /*
  * Start a transaction with enough metadata units reserved to remove the
  * block group whose chunk starts at @chunk_offset.
  *
  * Returns whatever btrfs_start_transaction_fallback_global_rsv() returns.
  */
 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
 {
 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
 	unsigned int nr_stripes;

 	read_lock(&map_tree->lock);
 	em = lookup_extent_mapping(map_tree, chunk_offset, 1);
 	read_unlock(&map_tree->lock);
 	ASSERT(em && em->start == chunk_offset);

 	nr_stripes = em->map_lookup->num_stripes;
 	free_extent_map(em);

 	/*
 	 * Removing a block group (btrfs_remove_chunk() and
 	 * btrfs_remove_block_group()) needs 3 + N units from the metadata
 	 * space info:
 	 *
 	 * 1 unit to add the orphan item of the free space inode (tree of tree
 	 *   roots),
 	 * 1 unit to delete the block group item (extent tree),
 	 * 1 unit to delete the free space item (tree of tree roots),
 	 * N units to delete the N device extent items, one per stripe (device
 	 *   tree).
 	 *
 	 * Units from the system space info are needed as well to update the
 	 * chunk tree (update one or more device items and remove one chunk
 	 * item), but those are reserved at btrfs_remove_chunk() through a
 	 * call to check_system_chunk().
 	 */
 	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
 							   3 + nr_stripes, 1);
 }

 /*
  * Mark block group @cache read-only so that no later write goes to it.
  *
  * If @force is not set and the block group is a metadata or system one,
  * it is only marked read-only when the space info keeps at least 1M of
  * free space as buffer afterwards.
  * If @force is set, no such buffer is required.
  *
  * A block group that is already read-only just gets its ro counter bumped.
  *
  * NOTE: This function does not care whether the other block groups can hold
  * all the data of this block group.  That check is up to the relocation
  * code, not this function.
  *
  * Returns 0 on success or -ENOSPC if there is not enough free space.
  */
 int __btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 {
 	struct btrfs_space_info *sinfo = cache->space_info;
 	const u64 meta_or_sys = BTRFS_BLOCK_GROUP_SYSTEM |
 				BTRFS_BLOCK_GROUP_METADATA;
 	u64 num_bytes;
 	u64 sinfo_used;
 	u64 min_allocable_bytes = 0;
 	int ret = -ENOSPC;

 	/*
 	 * Some metadata and system space is needed for allocating chunks in
 	 * a few corner cases, so keep a buffer unless we are forced.
 	 */
 	if (!force && (sinfo->flags & meta_or_sys))
 		min_allocable_bytes = SZ_1M;

 	spin_lock(&sinfo->lock);
 	spin_lock(&cache->lock);

 	if (cache->ro) {
 		/* Already read-only, only take one more ro count. */
 		cache->ro++;
 		ret = 0;
 	} else {
 		/* Bytes of this block group that are currently unused. */
 		num_bytes = cache->key.offset - cache->reserved -
 			    cache->pinned - cache->bytes_super -
 			    btrfs_block_group_used(&cache->item);
 		sinfo_used = btrfs_space_info_used(sinfo, true);

 		/*
 		 * sinfo_used + num_bytes should always be <=
 		 * sinfo->total_bytes.  Additionally require room for the
 		 * buffer (if min_allocable_bytes is not 0) once this block
 		 * group is read-only.
 		 */
 		if (sinfo_used + num_bytes + min_allocable_bytes <=
 		    sinfo->total_bytes) {
 			sinfo->bytes_readonly += num_bytes;
 			cache->ro++;
 			list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
 			ret = 0;
 		}
 	}

 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);

 	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
 		btrfs_info(cache->fs_info,
 			"unable to make block group %llu ro",
 			cache->key.objectid);
 		btrfs_info(cache->fs_info,
 			"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
 			sinfo_used, num_bytes, min_allocable_bytes);
 		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
 	}
 	return ret;
 }

 /*
  * Process the unused_bgs list and remove any that don't have any allocated
  * space inside of them.
  *
  * Does nothing unless BTRFS_FS_OPEN is set in fs_info->flags. Each entry is
  * taken off fs_info->unused_bgs under unused_bgs_lock; that lock is dropped
  * while a single block group is processed and retaken before the next list
  * check. Every iteration ends by dropping one reference on the block group
  * (the one btrfs_mark_bg_unused() takes when it queues an entry).
  *
  * Once an iteration has left a non-zero error in 'ret', all remaining entries
  * are only unlinked and released, no further removal is attempted.
  */
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_space_info *space_info;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;

 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
 		return;

 	spin_lock(&fs_info->unused_bgs_lock);
 	while (!list_empty(&fs_info->unused_bgs)) {
 		u64 start, end;
 		int trimming;

 		block_group = list_first_entry(&fs_info->unused_bgs,
 					       struct btrfs_block_group_cache,
 					       bg_list);
 		list_del_init(&block_group->bg_list);

 		space_info = block_group->space_info;

 		/*
 		 * After an earlier failure, or for mixed space infos, just
 		 * drop the list's reference and move on.
 		 */
 		if (ret || btrfs_mixed_space_info(space_info)) {
 			btrfs_put_block_group(block_group);
 			continue;
 		}
 		spin_unlock(&fs_info->unused_bgs_lock);

 		mutex_lock(&fs_info->delete_unused_bgs_mutex);

 		/* Don't want to race with allocators so take the groups_sem */
 		down_write(&space_info->groups_sem);
 		spin_lock(&block_group->lock);
 		if (block_group->reserved || block_group->pinned ||
 		    btrfs_block_group_used(&block_group->item) ||
 		    block_group->ro ||
 		    list_is_singular(&block_group->list)) {
 			/*
 			 * We want to bail if we made new allocations or have
 			 * outstanding allocations in this block group.  We do
 			 * the ro check in case balance is currently acting on
 			 * this block group.
 			 */
 			trace_btrfs_skip_unused_block_group(block_group);
 			spin_unlock(&block_group->lock);
 			up_write(&space_info->groups_sem);
 			goto next;
 		}
 		spin_unlock(&block_group->lock);

 		/* We don't want to force the issue, only flip if it's ok. */
 		ret = __btrfs_inc_block_group_ro(block_group, 0);
 		up_write(&space_info->groups_sem);
 		if (ret < 0) {
 			/* Not an error for the loop: skip this block group only. */
 			ret = 0;
 			goto next;
 		}

 		/*
 		 * Want to do this before we do anything else so we can recover
 		 * properly if we fail to join the transaction.
 		 */
 		trans = btrfs_start_trans_remove_block_group(fs_info,
 						     block_group->key.objectid);
 		if (IS_ERR(trans)) {
 			/* Undo the read-only marking taken above. */
 			btrfs_dec_block_group_ro(block_group);
 			ret = PTR_ERR(trans);
 			goto next;
 		}

 		/*
 		 * We could have pending pinned extents for this block group,
 		 * just delete them, we don't care about them anymore.
 		 */
 		start = block_group->key.objectid;
 		end = start + block_group->key.offset - 1;
 		/*
 		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
 		 * btrfs_finish_extent_commit(). If we are at transaction N,
 		 * another task might be running finish_extent_commit() for the
 		 * previous transaction N - 1, and have seen a range belonging
 		 * to the block group in freed_extents[] before we were able to
 		 * clear the whole block group range from freed_extents[]. This
 		 * means that task can lookup for the block group after we
 		 * unpinned it from freed_extents[] and removed it, leading to
 		 * a BUG_ON() at btrfs_unpin_extent_range().
 		 */
 		mutex_lock(&fs_info->unused_bg_unpin_mutex);
 		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
 				  EXTENT_DIRTY);
 		if (ret) {
 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 			btrfs_dec_block_group_ro(block_group);
 			goto end_trans;
 		}
 		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
 				  EXTENT_DIRTY);
 		if (ret) {
 			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 			btrfs_dec_block_group_ro(block_group);
 			goto end_trans;
 		}
 		mutex_unlock(&fs_info->unused_bg_unpin_mutex);

 		/* Reset pinned so btrfs_put_block_group doesn't complain */
 		spin_lock(&space_info->lock);
 		spin_lock(&block_group->lock);

 		/*
 		 * Move the block group's pinned bytes from the space info's
 		 * pinned accounting into bytes_readonly before zeroing them.
 		 */
 		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
 						     -block_group->pinned);
 		space_info->bytes_readonly += block_group->pinned;
 		percpu_counter_add_batch(&space_info->total_bytes_pinned,
 				   -block_group->pinned,
 				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
 		block_group->pinned = 0;

 		spin_unlock(&block_group->lock);
 		spin_unlock(&space_info->lock);

 		/* DISCARD can flip during remount */
 		trimming = btrfs_test_opt(fs_info, DISCARD);

 		/* Implicit trim during transaction commit. */
 		if (trimming)
 			btrfs_get_block_group_trimming(block_group);

 		/*
 		 * Btrfs_remove_chunk will abort the transaction if things go
 		 * horribly wrong.
 		 */
 		ret = btrfs_remove_chunk(trans, block_group->key.objectid);

 		if (ret) {
 			/* Balance the trimming get taken just above. */
 			if (trimming)
 				btrfs_put_block_group_trimming(block_group);
 			goto end_trans;
 		}

 		/*
 		 * If we're not mounted with -odiscard, we can just forget
 		 * about this block group. Otherwise we'll need to wait
 		 * until transaction commit to do the actual discard.
 		 */
 		if (trimming) {
 			spin_lock(&fs_info->unused_bgs_lock);
 			/*
 			 * A concurrent scrub might have added us to the list
 			 * fs_info->unused_bgs, so use a list_move operation
 			 * to add the block group to the deleted_bgs list.
 			 */
 			list_move(&block_group->bg_list,
 				  &trans->transaction->deleted_bgs);
 			spin_unlock(&fs_info->unused_bgs_lock);
 			/*
 			 * Extra reference for the deleted_bgs entry; the put
 			 * at 'next' below drops the one we were handed.
 			 */
 			btrfs_get_block_group(block_group);
 		}
 end_trans:
 		btrfs_end_transaction(trans);
 next:
 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		btrfs_put_block_group(block_group);
 		/* Retake the list lock for the loop condition. */
 		spin_lock(&fs_info->unused_bgs_lock);
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
 }

 void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
 {
 	struct btrfs_fs_info *fs_info = bg->fs_info;

 	spin_lock(&fs_info->unused_bgs_lock);
 	if (list_empty(&bg->bg_list)) {
 		btrfs_get_block_group(bg);
 		trace_btrfs_add_unused_block_group(bg);
 		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
 }

 /*
  * Search the extent tree, starting at @key, for the first block group item
  * whose objectid is >= key->objectid and cross-check it against the chunk
  * mapping tree.
  *
  * On a match @path is left pointing at the item. Returns 0 when the item
  * agrees with its chunk, a positive value when btrfs_next_leaf() reports that
  * there is nothing left to walk, -ENOENT when no chunk covers the item,
  * -EUCLEAN when start, length or type flags disagree with the chunk, or
  * another negative value propagated from the tree search.
  */
 static int find_first_block_group(struct btrfs_fs_info *fs_info,
 				  struct btrfs_path *path,
 				  struct btrfs_key *key)
 {
 	struct btrfs_root *root = fs_info->extent_root;
 	struct extent_map_tree *em_tree;
 	struct btrfs_block_group_item bg;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
 	struct extent_map *em;
 	u64 chunk_type;
 	u64 flags;
 	int slot;
 	int ret;

 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 	if (ret < 0)
 		return ret;

 	/* Step 1: walk forward until a suitable block group item shows up. */
 	for (;;) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret)
 				return ret;
 			continue;
 		}

 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 		if (found_key.objectid >= key->objectid &&
 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
 			break;

 		path->slots[0]++;
 	}

 	/* Step 2: the item must be backed by an identical chunk mapping. */
 	em_tree = &root->fs_info->mapping_tree;
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, found_key.objectid,
 				   found_key.offset);
 	read_unlock(&em_tree->lock);

 	if (!em) {
 		btrfs_err(fs_info,
 			"logical %llu len %llu found bg but no related chunk",
 			  found_key.objectid, found_key.offset);
 		ret = -ENOENT;
 		goto free_em;
 	}

 	if (em->start != found_key.objectid || em->len != found_key.offset) {
 		btrfs_err(fs_info,
 		"block group %llu len %llu mismatch with chunk %llu len %llu",
 			  found_key.objectid, found_key.offset,
 			  em->start, em->len);
 		ret = -EUCLEAN;
 		goto free_em;
 	}

 	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
 			   sizeof(bg));
 	flags = btrfs_block_group_flags(&bg) & BTRFS_BLOCK_GROUP_TYPE_MASK;
 	chunk_type = em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK;

 	if (flags != chunk_type) {
 		btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
 			  found_key.objectid, found_key.offset, flags,
 			  chunk_type);
 		ret = -EUCLEAN;
 	} else {
 		ret = 0;
 	}

 free_em:
 	/* Also reached with em == NULL, exactly as the lookup left it. */
 	free_extent_map(em);
 	return ret;
 }

 /*
  * Record the extended profile bits of @flags as available for every block
  * group type (data, metadata, system) that @flags names. The update is done
  * under the write side of fs_info->profiles_lock.
  */
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	const u64 profile_bits = chunk_to_extended(flags) &
 				 BTRFS_EXTENDED_PROFILE_MASK;

 	write_seqlock(&fs_info->profiles_lock);

 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 		fs_info->avail_system_alloc_bits |= profile_bits;

 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 		fs_info->avail_metadata_alloc_bits |= profile_bits;

 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		fs_info->avail_data_alloc_bits |= profile_bits;

 	write_sequnlock(&fs_info->profiles_lock);
 }

 /*
  * Exclude from @cache every range that overlaps a superblock copy and add the
  * size of each such range to cache->bytes_super.
  *
  * Two sources of ranges are handled:
  *  - the space between the start of the block group and
  *    BTRFS_SUPER_INFO_OFFSET, when the block group starts below that offset;
  *  - for each of the BTRFS_SUPER_MIRROR_MAX superblock offsets, the logical
  *    addresses handed back by btrfs_rmap_block(), clamped to the block group.
  *
  * The block group covers [key.objectid, key.objectid + key.offset), so a
  * stripe that starts exactly at the end offset lies outside of it and must
  * be skipped; otherwise a zero length range would be passed on to
  * btrfs_add_excluded_extent().
  *
  * Returns 0 on success or the error returned by btrfs_rmap_block() or
  * btrfs_add_excluded_extent(). On error some ranges may already have been
  * excluded; cleaning those up is left to the caller.
  */
 static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	const u64 bg_start = cache->key.objectid;
 	const u64 bg_end = bg_start + cache->key.offset;	/* exclusive */
 	u64 bytenr;
 	u64 *logical;
 	int stripe_len;
 	int i, nr, ret;

 	if (bg_start < BTRFS_SUPER_INFO_OFFSET) {
 		stripe_len = BTRFS_SUPER_INFO_OFFSET - bg_start;
 		cache->bytes_super += stripe_len;
 		ret = btrfs_add_excluded_extent(fs_info, bg_start, stripe_len);
 		if (ret)
 			return ret;
 	}

 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 		bytenr = btrfs_sb_offset(i);
 		ret = btrfs_rmap_block(fs_info, bg_start,
 				       bytenr, &logical, &nr, &stripe_len);
 		if (ret)
 			return ret;

 		while (nr--) {
 			u64 start, len;

 			/* Stripe starts at or beyond the end of the bg. */
 			if (logical[nr] >= bg_end)
 				continue;

 			/* Stripe ends at or before the start of the bg. */
 			if (logical[nr] + stripe_len <= bg_start)
 				continue;

 			/* Clamp the stripe to the block group boundaries. */
 			start = logical[nr];
 			if (start < bg_start) {
 				start = bg_start;
 				len = (logical[nr] + stripe_len) - start;
 			} else {
 				len = min_t(u64, stripe_len, bg_end - start);
 			}

 			cache->bytes_super += len;
 			ret = btrfs_add_excluded_extent(fs_info, start, len);
 			if (ret) {
 				kfree(logical);
 				return ret;
 			}
 		}

 		/* The array filled in by btrfs_rmap_block() is ours to free. */
 		kfree(logical);
 	}
 	return 0;
 }

 /*
  * Append @cache to the per RAID index block group list of its space info.
  * When that list was empty beforehand, btrfs_sysfs_add_block_group_type() is
  * called for the block group once groups_sem has been released.
  */
 static void link_block_group(struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_space_info *sinfo = cache->space_info;
 	struct list_head *raid_list;
 	bool was_empty;

 	raid_list = &sinfo->block_groups[
 			btrfs_bg_flags_to_raid_index(cache->flags)];

 	down_write(&sinfo->groups_sem);
 	was_empty = list_empty(raid_list);
 	list_add_tail(&cache->list, raid_list);
 	up_write(&sinfo->groups_sem);

 	if (was_empty)
 		btrfs_sysfs_add_block_group_type(cache);
 }

 /*
  * Allocate and initialize an in-memory block group covering
  * [@start, @start + @size).
  *
  * The new object starts with a reference count of 1, zeroed counters (it is
  * allocated with kzalloc) and empty list heads. Returns NULL if either the
  * block group or its free space control structure cannot be allocated.
  */
 static struct btrfs_block_group_cache *btrfs_create_block_group_cache(
 		struct btrfs_fs_info *fs_info, u64 start, u64 size)
 {
 	struct btrfs_block_group_cache *bg;

 	bg = kzalloc(sizeof(*bg), GFP_NOFS);
 	if (!bg)
 		goto fail;

 	bg->free_space_ctl = kzalloc(sizeof(*bg->free_space_ctl), GFP_NOFS);
 	if (!bg->free_space_ctl)
 		goto free_bg;

 	/* Identity of the block group. */
 	bg->key.objectid = start;
 	bg->key.offset = size;
 	bg->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	bg->fs_info = fs_info;

 	bg->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
 	set_free_space_tree_thresholds(bg);

 	/* Reference count, locks and list linkage. */
 	atomic_set(&bg->count, 1);
 	spin_lock_init(&bg->lock);
 	init_rwsem(&bg->data_rwsem);
 	INIT_LIST_HEAD(&bg->list);
 	INIT_LIST_HEAD(&bg->cluster_list);
 	INIT_LIST_HEAD(&bg->bg_list);
 	INIT_LIST_HEAD(&bg->ro_list);
 	INIT_LIST_HEAD(&bg->dirty_list);
 	INIT_LIST_HEAD(&bg->io_list);

 	/* Free space tracking and remaining bookkeeping. */
 	btrfs_init_free_space_ctl(bg);
 	atomic_set(&bg->trimming, 0);
 	mutex_init(&bg->free_space_lock);
 	btrfs_init_full_stripe_locks_tree(&bg->full_stripe_locks_root);

 	return bg;

 free_bg:
 	kfree(bg);
 fail:
 	return NULL;
 }

 /*
  * Walk all chunks in the mapping tree and verify that each one has a block
  * group with the same start, length and type flags.
  *
  * Returns 0 if every chunk checks out, -EUCLEAN at the first chunk that has
  * no block group or whose block group disagrees with it.
  */
 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 {
 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
 	u64 cursor = 0;
 	int ret = 0;

 	while (!ret) {
 		struct btrfs_block_group_cache *bg;
 		struct extent_map *em;

 		/*
 		 * lookup_extent_mapping will return the first extent map
 		 * intersecting the range, so a length of 1 is enough to get
 		 * the next chunk at or after the cursor.
 		 */
 		read_lock(&map_tree->lock);
 		em = lookup_extent_mapping(map_tree, cursor, 1);
 		read_unlock(&map_tree->lock);
 		if (!em)
 			break;

 		bg = btrfs_lookup_block_group(fs_info, em->start);
 		if (!bg) {
 			btrfs_err(fs_info,
 	"chunk start=%llu len=%llu doesn't have corresponding block group",
 				     em->start, em->len);
 			ret = -EUCLEAN;
 		} else {
 			const u64 chunk_type = em->map_lookup->type &
 					       BTRFS_BLOCK_GROUP_TYPE_MASK;
 			const u64 bg_type = bg->flags &
 					    BTRFS_BLOCK_GROUP_TYPE_MASK;

 			if (bg->key.objectid != em->start ||
 			    bg->key.offset != em->len ||
 			    bg_type != chunk_type) {
 				btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
 					em->start, em->len, chunk_type,
 					bg->key.objectid, bg->key.offset,
 					bg_type);
 				ret = -EUCLEAN;
 			}
 		}

 		/* Continue right behind this chunk on the next round. */
 		cursor = em->start + em->len;
 		free_extent_map(em);
 		if (bg)
 			btrfs_put_block_group(bg);
 	}
 	return ret;
 }

 /*
  * Read every block group item from the extent tree and build the in-memory
  * block group for it: allocate the cache object, exclude the super stripes,
  * insert it into the block group rbtree, account it in its space info and
  * link it into the matching RAID list.
  *
  * Afterwards, for space infos whose allocation profile is RAID10, RAID1*,
  * RAID56 or DUP, all RAID0 and SINGLE block groups are forced read-only, the
  * global block reserve is initialized and the chunk <-> block group mappings
  * are verified.
  *
  * Returns 0 on success or a negative errno. A block group that was still
  * private to this function when the error occurred is released before
  * returning.
  */
 int btrfs_read_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_space_info *space_info;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
 	int need_clear = 0;
 	u64 cache_gen;
 	u64 feature;
 	int mixed;

 	feature = btrfs_super_incompat_flags(info->super_copy);
 	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);

 	key.objectid = 0;
 	key.offset = 0;
 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	path->reada = READA_FORWARD;

 	/*
 	 * The on-disk space cache has to be cleared when its generation does
 	 * not match the super block's, or when clearing was asked for via the
 	 * CLEAR_CACHE mount option.
 	 */
 	cache_gen = btrfs_super_cache_generation(info->super_copy);
 	if (btrfs_test_opt(info, SPACE_CACHE) &&
 	    btrfs_super_generation(info->super_copy) != cache_gen)
 		need_clear = 1;
 	if (btrfs_test_opt(info, CLEAR_CACHE))
 		need_clear = 1;

 	while (1) {
 		/* A positive return value means no block group item is left. */
 		ret = find_first_block_group(info, path, &key);
 		if (ret > 0)
 			break;
 		if (ret != 0)
 			goto error;

 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

 		cache = btrfs_create_block_group_cache(info, found_key.objectid,
 						       found_key.offset);
 		if (!cache) {
 			ret = -ENOMEM;
 			goto error;
 		}

 		if (need_clear) {
 			/*
 			 * When we mount with old space cache, we need to
 			 * set BTRFS_DC_CLEAR and set dirty flag.
 			 *
 			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
 			 *    truncate the old free space cache inode and
 			 *    setup a new one.
 			 * b) Setting 'dirty flag' makes sure that we flush
 			 *    the new space cache info onto disk.
 			 */
 			if (btrfs_test_opt(info, SPACE_CACHE))
 				cache->disk_cache_state = BTRFS_DC_CLEAR;
 		}

 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
 		cache->flags = btrfs_block_group_flags(&cache->item);
 		if (!mixed &&
 		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
 		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
 			btrfs_err(info,
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
 				  cache->key.objectid);
 			ret = -EINVAL;
 			/*
 			 * The block group is not in the rbtree or on any list
 			 * yet, so we hold its only reference. Drop it here,
 			 * otherwise the object and its free space ctl leak.
 			 */
 			btrfs_put_block_group(cache);
 			goto error;
 		}

 		/* Next search starts right behind this block group. */
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(path);

 		/*
 		 * We need to exclude the super stripes now so that the space
 		 * info has super bytes accounted for, otherwise we'll think
 		 * we have more space than we actually do.
 		 */
 		ret = exclude_super_stripes(cache);
 		if (ret) {
 			/*
 			 * We may have excluded something, so call this just in
 			 * case.
 			 */
 			btrfs_free_excluded_extents(cache);
 			btrfs_put_block_group(cache);
 			goto error;
 		}

 		/*
 		 * Check for two cases, either we are full, and therefore
 		 * don't need to bother with the caching work since we won't
 		 * find any space, or we are empty, and we can just add all
 		 * the space in and be done with it.  This saves us _a_lot_ of
 		 * time, particularly in the full case.
 		 */
 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
 			cache->last_byte_to_unpin = (u64)-1;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			btrfs_free_excluded_extents(cache);
 		} else if (btrfs_block_group_used(&cache->item) == 0) {
 			cache->last_byte_to_unpin = (u64)-1;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			add_new_free_space(cache, found_key.objectid,
 					   found_key.objectid +
 					   found_key.offset);
 			btrfs_free_excluded_extents(cache);
 		}

 		ret = btrfs_add_block_group_cache(info, cache);
 		if (ret) {
 			btrfs_remove_free_space_cache(cache);
 			btrfs_put_block_group(cache);
 			goto error;
 		}

 		trace_btrfs_add_block_group(info, cache, 0);
 		btrfs_update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
 					cache->bytes_super, &space_info);

 		cache->space_info = space_info;

 		link_block_group(cache);

 		set_avail_alloc_bits(info, cache->flags);
 		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
 			__btrfs_inc_block_group_ro(cache, 1);
 		} else if (btrfs_block_group_used(&cache->item) == 0) {
 			/* Completely empty: candidate for later removal. */
 			ASSERT(list_empty(&cache->bg_list));
 			btrfs_mark_bg_unused(cache);
 		}
 	}

 	list_for_each_entry_rcu(space_info, &info->space_info, list) {
 		if (!(btrfs_get_alloc_profile(info, space_info->flags) &
 		      (BTRFS_BLOCK_GROUP_RAID10 |
 		       BTRFS_BLOCK_GROUP_RAID1_MASK |
 		       BTRFS_BLOCK_GROUP_RAID56_MASK |
 		       BTRFS_BLOCK_GROUP_DUP)))
 			continue;
 		/*
 		 * Avoid allocating from un-mirrored block group if there are
 		 * mirrored block groups.
 		 */
 		list_for_each_entry(cache,
 				&space_info->block_groups[BTRFS_RAID_RAID0],
 				list)
 			__btrfs_inc_block_group_ro(cache, 1);
 		list_for_each_entry(cache,
 				&space_info->block_groups[BTRFS_RAID_SINGLE],
 				list)
 			__btrfs_inc_block_group_ro(cache, 1);
 	}

 	btrfs_init_global_block_rsv(info);
 	ret = check_chunk_block_group_mappings(info);
 error:
 	btrfs_free_path(path);
 	return ret;
 }

 /*
  * Drain trans->new_bgs: for each pending block group insert its item into
  * the extent tree, finish the chunk allocation and add its free space.
  *
  * Nothing is done when trans->can_flush_pending_bgs is clear. As soon as
  * 'ret' is left non-zero at the end of an iteration, the remaining block
  * groups are only taken off the list and have their delayed refs reservation
  * released.
  */
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	struct btrfs_block_group_cache *bg;
 	struct btrfs_block_group_item item;
 	struct btrfs_key key;
 	int ret = 0;

 	if (!trans->can_flush_pending_bgs)
 		return;

 	while (!list_empty(&trans->new_bgs)) {
 		bg = list_first_entry(&trans->new_bgs,
 				      struct btrfs_block_group_cache,
 				      bg_list);

 		if (!ret) {
 			/* Snapshot item and key under the block group lock. */
 			spin_lock(&bg->lock);
 			memcpy(&item, &bg->item, sizeof(item));
 			memcpy(&key, &bg->key, sizeof(key));
 			spin_unlock(&bg->lock);

 			ret = btrfs_insert_item(trans, extent_root, &key,
 						&item, sizeof(item));
 			if (ret)
 				btrfs_abort_transaction(trans, ret);

 			ret = btrfs_finish_chunk_alloc(trans, key.objectid,
 						       key.offset);
 			if (ret)
 				btrfs_abort_transaction(trans, ret);

 			/* Already aborted the transaction if it failed. */
 			add_block_group_free_space(trans, bg);
 		}

 		btrfs_delayed_refs_rsv_release(fs_info, 1);
 		list_del_init(&bg->bg_list);
 	}
 	btrfs_trans_release_chunk_metadata(trans);
 }

 /*
  * Create the in-memory block group for a new chunk at @chunk_offset of
  * @size bytes with profile/type bits @type and @bytes_used bytes already in
  * use, and queue it on trans->new_bgs.
  *
  * The block group item itself is not inserted here; the block group is only
  * added to trans->new_bgs, the list btrfs_create_pending_block_groups()
  * drains.
  *
  * Returns 0 on success, -ENOMEM if the block group cannot be allocated, or
  * the error from exclude_super_stripes() / btrfs_add_block_group_cache().
  */
 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
 			   u64 type, u64 chunk_offset, u64 size)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_block_group_cache *cache;
 	int ret;

 	btrfs_set_log_full_commit(trans);

 	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
 	if (!cache)
 		return -ENOMEM;

 	/* Fill in the item that will later be written to the extent tree. */
 	btrfs_set_block_group_used(&cache->item, bytes_used);
 	btrfs_set_block_group_chunk_objectid(&cache->item,
 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
 	btrfs_set_block_group_flags(&cache->item, type);

 	/* A brand new block group needs no caching pass. */
 	cache->flags = type;
 	cache->last_byte_to_unpin = (u64)-1;
 	cache->cached = BTRFS_CACHE_FINISHED;
 	cache->needs_free_space = 1;
 	ret = exclude_super_stripes(cache);
 	if (ret) {
 		/* We may have excluded something, so call this just in case */
 		btrfs_free_excluded_extents(cache);
 		btrfs_put_block_group(cache);
 		return ret;
 	}

 	add_new_free_space(cache, chunk_offset, chunk_offset + size);

 	btrfs_free_excluded_extents(cache);

 #ifdef CONFIG_BTRFS_DEBUG
 	/*
 	 * Debug only: when fragmentation is requested, count half of the
 	 * remaining space as used before fragmenting the free space.
 	 */
 	if (btrfs_should_fragment_free_space(cache)) {
 		u64 new_bytes_used = size - bytes_used;

 		bytes_used += new_bytes_used >> 1;
 		btrfs_fragment_free_space(cache);
 	}
 #endif
 	/*
 	 * Ensure the corresponding space_info object is created and
 	 * assigned to our block group. We want our bg to be added to the rbtree
 	 * with its ->space_info set.
 	 */
 	cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
 	ASSERT(cache->space_info);

 	ret = btrfs_add_block_group_cache(fs_info, cache);
 	if (ret) {
 		btrfs_remove_free_space_cache(cache);
 		btrfs_put_block_group(cache);
 		return ret;
 	}

 	/*
 	 * Now that our block group has its ->space_info set and is inserted in
 	 * the rbtree, update the space info's counters.
 	 */
 	trace_btrfs_add_block_group(fs_info, cache, 1);
 	btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
 				cache->bytes_super, &cache->space_info);
 	btrfs_update_global_block_rsv(fs_info);

 	link_block_group(cache);

 	/*
 	 * Queue the block group for item insertion and account one more
 	 * delayed ref update for it.
 	 */
 	list_add_tail(&cache->bg_list, &trans->new_bgs);
 	trans->delayed_ref_updates++;
 	btrfs_update_delayed_refs_rsv(trans);

 	set_avail_alloc_bits(fs_info, type);
 	return 0;
 }

 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	u64 num_devices;
 	u64 stripped;

 	/*
 	 * if restripe for this chunk_type is on pick target profile and
 	 * return, otherwise do the usual balance
 	 */
 	stripped = btrfs_get_restripe_target(fs_info, flags);
 	if (stripped)
 		return extended_to_chunk(stripped);

 	num_devices = fs_info->fs_devices->rw_devices;

 	stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
 		BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;

 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;

 		/* turn raid0 into single device chunks */
 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
 			return stripped;

 		/* turn mirroring into duplication */
 		if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
 			     BTRFS_BLOCK_GROUP_RAID10))
 			return stripped | BTRFS_BLOCK_GROUP_DUP;
 	} else {
 		/* they already had raid on here, just return */
 		if (flags & stripped)
 			return flags;

 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;

 		/* switch duplicated blocks with raid1 */
 		if (flags & BTRFS_BLOCK_GROUP_DUP)
 			return stripped | BTRFS_BLOCK_GROUP_RAID1;

 		/* this is drive concat, leave it alone */
 	}

 	return flags;
 }

 /*
  * Mark block group @cache read-only from within a joined transaction,
  * allocating new chunks along the way when that is needed to make room.
  *
  * Sequence, all under fs_info->ro_block_group_mutex:
  *  1. If the joined transaction already has BTRFS_TRANS_DIRTY_BG_RUN set,
  *     end it, wait for its commit and start over.
  *  2. If update_block_group_flags() yields flags different from the block
  *     group's own, force a chunk allocation with the new flags (-ENOSPC is
  *     tolerated).
  *  3. Try to flip the block group without force; on failure force one chunk
  *     allocation with the space info's allocation profile and try again.
  *
  * Returns 0 on success or a negative errno (from joining or waiting for the
  * transaction, from chunk allocation, or -ENOSPC from the final attempt).
  */
 int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)

 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
 	struct btrfs_trans_handle *trans;
 	u64 alloc_flags;
 	int ret;

 again:
 	trans = btrfs_join_transaction(fs_info->extent_root);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);

 	/*
 	 * we're not allowed to set block groups readonly after the dirty
 	 * block groups cache has started writing.  If it already started,
 	 * back off and let this transaction commit
 	 */
 	mutex_lock(&fs_info->ro_block_group_mutex);
 	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
 		/* Remember the id, trans must not be used once it is ended. */
 		u64 transid = trans->transid;

 		mutex_unlock(&fs_info->ro_block_group_mutex);
 		btrfs_end_transaction(trans);

 		ret = btrfs_wait_for_commit(fs_info, transid);
 		if (ret)
 			return ret;
 		goto again;
 	}

 	/*
 	 * if we are changing raid levels, try to allocate a corresponding
 	 * block group with the new raid level.
 	 */
 	alloc_flags = update_block_group_flags(fs_info, cache->flags);
 	if (alloc_flags != cache->flags) {
 		ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 		/*
 		 * ENOSPC is allowed here, we may have enough space
 		 * already allocated at the new raid level to
 		 * carry on
 		 */
 		if (ret == -ENOSPC)
 			ret = 0;
 		if (ret < 0)
 			goto out;
 	}

 	/* First attempt, without forcing. */
 	ret = __btrfs_inc_block_group_ro(cache, 0);
 	if (!ret)
 		goto out;
 	/* Not enough room: allocate one more chunk and retry once. */
 	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
 	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 	if (ret < 0)
 		goto out;
 	ret = __btrfs_inc_block_group_ro(cache, 0);
 out:
 	/*
 	 * For system block groups, run check_system_chunk() under chunk_mutex
 	 * on success and failure alike.
 	 * NOTE(review): presumably this makes sure enough system chunk space
 	 * stays available - confirm against check_system_chunk().
 	 */
 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
 		alloc_flags = update_block_group_flags(fs_info, cache->flags);
 		mutex_lock(&fs_info->chunk_mutex);
 		check_system_chunk(trans, alloc_flags);
 		mutex_unlock(&fs_info->chunk_mutex);
 	}
 	mutex_unlock(&fs_info->ro_block_group_mutex);

 	btrfs_end_transaction(trans);
 	return ret;
 }

 /*
  * Drop one read-only reference from @cache. It is a bug to call this on a
  * block group that is not read-only.
  *
  * When the last reference goes away, the block group's unused bytes are
  * subtracted from the space info's bytes_readonly counter again and the
  * block group is taken off the ro list.
  */
 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_space_info *sinfo = cache->space_info;

 	BUG_ON(!cache->ro);

 	spin_lock(&sinfo->lock);
 	spin_lock(&cache->lock);

 	cache->ro--;
 	if (cache->ro == 0) {
 		u64 unused_bytes;

 		unused_bytes = cache->key.offset - cache->reserved -
 			       cache->pinned - cache->bytes_super -
 			       btrfs_block_group_used(&cache->item);
 		sinfo->bytes_readonly -= unused_bytes;
 		list_del_init(&cache->ro_list);
 	}

 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
 }