fs/btrfs/block-group.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / btrfs / block-group.c
at master 146 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/sizes.h>
   4#include <linux/list_sort.h>
   5#include "misc.h"
   6#include "ctree.h"
   7#include "block-group.h"
   8#include "space-info.h"
   9#include "disk-io.h"
  10#include "free-space-cache.h"
  11#include "free-space-tree.h"
  12#include "volumes.h"
  13#include "transaction.h"
  14#include "ref-verify.h"
  15#include "sysfs.h"
  16#include "tree-log.h"
  17#include "delalloc-space.h"
  18#include "discard.h"
  19#include "raid56.h"
  20#include "zoned.h"
  21#include "fs.h"
  22#include "accessors.h"
  23#include "extent-tree.h"
  24
  25#ifdef CONFIG_BTRFS_DEBUG
  26int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
  27{
  28	struct btrfs_fs_info *fs_info = block_group->fs_info;
  29
  30	return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
  31		block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
  32	       (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
  33		block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
  34}
  35#endif
  36
  37static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
  38{
  39	/* The meta_write_pointer is available only on the zoned setup. */
  40	if (!btrfs_is_zoned(block_group->fs_info))
  41		return false;
  42
  43	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
  44		return false;
  45
  46	return block_group->start + block_group->alloc_offset >
  47		block_group->meta_write_pointer;
  48}
  49
  50/*
  51 * Return target flags in extended format or 0 if restripe for this chunk_type
  52 * is not in progress
  53 *
  54 * Should be called with balance_lock held
  55 */
  56static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags)
  57{
  58	const struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  59	u64 target = 0;
  60
  61	if (!bctl)
  62		return 0;
  63
  64	if (flags & BTRFS_BLOCK_GROUP_DATA &&
  65	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  66		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
  67	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
  68		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  69		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
  70	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
  71		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  72		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
  73	}
  74
  75	return target;
  76}
  77
  78/*
  79 * @flags: available profiles in extended format (see ctree.h)
  80 *
  81 * Return reduced profile in chunk format.  If profile changing is in progress
  82 * (either running or paused) picks the target profile (if it's already
  83 * available), otherwise falls back to plain reducing.
  84 */
  85static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
  86{
  87	u64 num_devices = fs_info->fs_devices->rw_devices;
  88	u64 target;
  89	u64 raid_type;
  90	u64 allowed = 0;
  91
  92	/*
  93	 * See if restripe for this chunk_type is in progress, if so try to
  94	 * reduce to the target profile
  95	 */
  96	spin_lock(&fs_info->balance_lock);
  97	target = get_restripe_target(fs_info, flags);
  98	if (target) {
  99		spin_unlock(&fs_info->balance_lock);
 100		return extended_to_chunk(target);
 101	}
 102	spin_unlock(&fs_info->balance_lock);
 103
 104	/* First, mask out the RAID levels which aren't possible */
 105	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
 106		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
 107			allowed |= btrfs_raid_array[raid_type].bg_flag;
 108	}
 109	allowed &= flags;
 110
 111	/* Select the highest-redundancy RAID level. */
 112	if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
 113		allowed = BTRFS_BLOCK_GROUP_RAID1C4;
 114	else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
 115		allowed = BTRFS_BLOCK_GROUP_RAID6;
 116	else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
 117		allowed = BTRFS_BLOCK_GROUP_RAID1C3;
 118	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
 119		allowed = BTRFS_BLOCK_GROUP_RAID5;
 120	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
 121		allowed = BTRFS_BLOCK_GROUP_RAID10;
 122	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
 123		allowed = BTRFS_BLOCK_GROUP_RAID1;
 124	else if (allowed & BTRFS_BLOCK_GROUP_DUP)
 125		allowed = BTRFS_BLOCK_GROUP_DUP;
 126	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
 127		allowed = BTRFS_BLOCK_GROUP_RAID0;
 128
 129	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
 130
 131	return extended_to_chunk(flags | allowed);
 132}
 133
 134u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
 135{
 136	unsigned seq;
 137	u64 flags;
 138
 139	do {
 140		flags = orig_flags;
 141		seq = read_seqbegin(&fs_info->profiles_lock);
 142
 143		if (flags & BTRFS_BLOCK_GROUP_DATA)
 144			flags |= fs_info->avail_data_alloc_bits;
 145		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 146			flags |= fs_info->avail_system_alloc_bits;
 147		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
 148			flags |= fs_info->avail_metadata_alloc_bits;
 149	} while (read_seqretry(&fs_info->profiles_lock, seq));
 150
 151	return btrfs_reduce_alloc_profile(fs_info, flags);
 152}
 153
 154void btrfs_get_block_group(struct btrfs_block_group *cache)
 155{
 156	refcount_inc(&cache->refs);
 157}
 158
 159void btrfs_put_block_group(struct btrfs_block_group *cache)
 160{
 161	if (refcount_dec_and_test(&cache->refs)) {
 162		WARN_ON(cache->pinned > 0);
 163		/*
 164		 * If there was a failure to cleanup a log tree, very likely due
 165		 * to an IO failure on a writeback attempt of one or more of its
 166		 * extent buffers, we could not do proper (and cheap) unaccounting
 167		 * of their reserved space, so don't warn on reserved > 0 in that
 168		 * case.
 169		 */
 170		if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
 171		    !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
 172			WARN_ON(cache->reserved > 0);
 173
 174		/*
 175		 * A block_group shouldn't be on the discard_list anymore.
 176		 * Remove the block_group from the discard_list to prevent us
 177		 * from causing a panic due to NULL pointer dereference.
 178		 */
 179		if (WARN_ON(!list_empty(&cache->discard_list)))
 180			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 181						  cache);
 182
 183		kfree(cache->free_space_ctl);
 184		btrfs_free_chunk_map(cache->physical_map);
 185		kfree(cache);
 186	}
 187}
 188
 189static int btrfs_bg_start_cmp(const struct rb_node *new,
 190			      const struct rb_node *exist)
 191{
 192	const struct btrfs_block_group *new_bg =
 193		rb_entry(new, struct btrfs_block_group, cache_node);
 194	const struct btrfs_block_group *exist_bg =
 195		rb_entry(exist, struct btrfs_block_group, cache_node);
 196
 197	if (new_bg->start < exist_bg->start)
 198		return -1;
 199	if (new_bg->start > exist_bg->start)
 200		return 1;
 201	return 0;
 202}
 203
 204/*
 205 * This adds the block group to the fs_info rb tree for the block group cache
 206 */
 207static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group)
 208{
 209	struct btrfs_fs_info *fs_info = block_group->fs_info;
 210	struct rb_node *exist;
 211	int ret = 0;
 212
 213	ASSERT(block_group->length != 0);
 214
 215	write_lock(&fs_info->block_group_cache_lock);
 216
 217	exist = rb_find_add_cached(&block_group->cache_node,
 218			&fs_info->block_group_cache_tree, btrfs_bg_start_cmp);
 219	if (exist)
 220		ret = -EEXIST;
 221	write_unlock(&fs_info->block_group_cache_lock);
 222
 223	return ret;
 224}
 225
 226/*
 227 * This will return the block group at or after bytenr if contains is 0, else
 228 * it will return the block group that contains the bytenr
 229 */
 230static struct btrfs_block_group *block_group_cache_tree_search(
 231		struct btrfs_fs_info *info, u64 bytenr, int contains)
 232{
 233	struct btrfs_block_group *cache, *ret = NULL;
 234	struct rb_node *n;
 235	u64 end, start;
 236
 237	read_lock(&info->block_group_cache_lock);
 238	n = info->block_group_cache_tree.rb_root.rb_node;
 239
 240	while (n) {
 241		cache = rb_entry(n, struct btrfs_block_group, cache_node);
 242		end = cache->start + cache->length - 1;
 243		start = cache->start;
 244
 245		if (bytenr < start) {
 246			if (!contains && (!ret || start < ret->start))
 247				ret = cache;
 248			n = n->rb_left;
 249		} else if (bytenr > start) {
 250			if (contains && bytenr <= end) {
 251				ret = cache;
 252				break;
 253			}
 254			n = n->rb_right;
 255		} else {
 256			ret = cache;
 257			break;
 258		}
 259	}
 260	if (ret)
 261		btrfs_get_block_group(ret);
 262	read_unlock(&info->block_group_cache_lock);
 263
 264	return ret;
 265}
 266
 267/*
 268 * Return the block group that starts at or after bytenr
 269 */
 270struct btrfs_block_group *btrfs_lookup_first_block_group(
 271		struct btrfs_fs_info *info, u64 bytenr)
 272{
 273	return block_group_cache_tree_search(info, bytenr, 0);
 274}
 275
 276/*
 277 * Return the block group that contains the given bytenr
 278 */
 279struct btrfs_block_group *btrfs_lookup_block_group(
 280		struct btrfs_fs_info *info, u64 bytenr)
 281{
 282	return block_group_cache_tree_search(info, bytenr, 1);
 283}
 284
 285struct btrfs_block_group *btrfs_next_block_group(
 286		struct btrfs_block_group *cache)
 287{
 288	struct btrfs_fs_info *fs_info = cache->fs_info;
 289	struct rb_node *node;
 290
 291	read_lock(&fs_info->block_group_cache_lock);
 292
 293	/* If our block group was removed, we need a full search. */
 294	if (RB_EMPTY_NODE(&cache->cache_node)) {
 295		const u64 next_bytenr = cache->start + cache->length;
 296
 297		read_unlock(&fs_info->block_group_cache_lock);
 298		btrfs_put_block_group(cache);
 299		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
 300	}
 301	node = rb_next(&cache->cache_node);
 302	btrfs_put_block_group(cache);
 303	if (node) {
 304		cache = rb_entry(node, struct btrfs_block_group, cache_node);
 305		btrfs_get_block_group(cache);
 306	} else
 307		cache = NULL;
 308	read_unlock(&fs_info->block_group_cache_lock);
 309	return cache;
 310}
 311
 312/*
 313 * Check if we can do a NOCOW write for a given extent.
 314 *
 315 * @fs_info:       The filesystem information object.
 316 * @bytenr:        Logical start address of the extent.
 317 *
 318 * Check if we can do a NOCOW write for the given extent, and increments the
 319 * number of NOCOW writers in the block group that contains the extent, as long
 320 * as the block group exists and it's currently not in read-only mode.
 321 *
 322 * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
 323 *          is responsible for calling btrfs_dec_nocow_writers() later.
 324 *
 325 *          Or NULL if we can not do a NOCOW write
 326 */
 327struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
 328						  u64 bytenr)
 329{
 330	struct btrfs_block_group *bg;
 331	bool can_nocow = true;
 332
 333	bg = btrfs_lookup_block_group(fs_info, bytenr);
 334	if (!bg)
 335		return NULL;
 336
 337	spin_lock(&bg->lock);
 338	if (bg->ro)
 339		can_nocow = false;
 340	else
 341		atomic_inc(&bg->nocow_writers);
 342	spin_unlock(&bg->lock);
 343
 344	if (!can_nocow) {
 345		btrfs_put_block_group(bg);
 346		return NULL;
 347	}
 348
 349	/* No put on block group, done by btrfs_dec_nocow_writers(). */
 350	return bg;
 351}
 352
 353/*
 354 * Decrement the number of NOCOW writers in a block group.
 355 *
 356 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 357 * and on the block group returned by that call. Typically this is called after
 358 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 359 * relocation.
 360 *
 361 * After this call, the caller should not use the block group anymore. It it wants
 362 * to use it, then it should get a reference on it before calling this function.
 363 */
 364void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
 365{
 366	if (atomic_dec_and_test(&bg->nocow_writers))
 367		wake_up_var(&bg->nocow_writers);
 368
 369	/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
 370	btrfs_put_block_group(bg);
 371}
 372
 373void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
 374{
 375	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
 376}
 377
 378void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 379					const u64 start)
 380{
 381	struct btrfs_block_group *bg;
 382
 383	bg = btrfs_lookup_block_group(fs_info, start);
 384	ASSERT(bg);
 385	if (atomic_dec_and_test(&bg->reservations))
 386		wake_up_var(&bg->reservations);
 387	btrfs_put_block_group(bg);
 388}
 389
 390void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
 391{
 392	struct btrfs_space_info *space_info = bg->space_info;
 393
 394	ASSERT(bg->ro);
 395
 396	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
 397		return;
 398
 399	/*
 400	 * Our block group is read only but before we set it to read only,
 401	 * some task might have had allocated an extent from it already, but it
 402	 * has not yet created a respective ordered extent (and added it to a
 403	 * root's list of ordered extents).
 404	 * Therefore wait for any task currently allocating extents, since the
 405	 * block group's reservations counter is incremented while a read lock
 406	 * on the groups' semaphore is held and decremented after releasing
 407	 * the read access on that semaphore and creating the ordered extent.
 408	 */
 409	down_write(&space_info->groups_sem);
 410	up_write(&space_info->groups_sem);
 411
 412	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
 413}
 414
 415struct btrfs_caching_control *btrfs_get_caching_control(
 416		struct btrfs_block_group *cache)
 417{
 418	struct btrfs_caching_control *ctl;
 419
 420	spin_lock(&cache->lock);
 421	if (!cache->caching_ctl) {
 422		spin_unlock(&cache->lock);
 423		return NULL;
 424	}
 425
 426	ctl = cache->caching_ctl;
 427	refcount_inc(&ctl->count);
 428	spin_unlock(&cache->lock);
 429	return ctl;
 430}
 431
 432static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
 433{
 434	if (refcount_dec_and_test(&ctl->count))
 435		kfree(ctl);
 436}
 437
 438/*
 439 * When we wait for progress in the block group caching, its because our
 440 * allocation attempt failed at least once.  So, we must sleep and let some
 441 * progress happen before we try again.
 442 *
 443 * This function will sleep at least once waiting for new free space to show
 444 * up, and then it will check the block group free space numbers for our min
 445 * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 446 * a free extent of a given size, but this is a good start.
 447 *
 448 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 449 * any of the information in this block group.
 450 */
 451void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
 452					   u64 num_bytes)
 453{
 454	struct btrfs_caching_control *caching_ctl;
 455	int progress;
 456
 457	caching_ctl = btrfs_get_caching_control(cache);
 458	if (!caching_ctl)
 459		return;
 460
 461	/*
 462	 * We've already failed to allocate from this block group, so even if
 463	 * there's enough space in the block group it isn't contiguous enough to
 464	 * allow for an allocation, so wait for at least the next wakeup tick,
 465	 * or for the thing to be done.
 466	 */
 467	progress = atomic_read(&caching_ctl->progress);
 468
 469	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
 470		   (progress != atomic_read(&caching_ctl->progress) &&
 471		    (cache->free_space_ctl->free_space >= num_bytes)));
 472
 473	btrfs_put_caching_control(caching_ctl);
 474}
 475
 476static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
 477				       struct btrfs_caching_control *caching_ctl)
 478{
 479	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
 480	return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
 481}
 482
 483static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
 484{
 485	struct btrfs_caching_control *caching_ctl;
 486	int ret;
 487
 488	caching_ctl = btrfs_get_caching_control(cache);
 489	if (!caching_ctl)
 490		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 491	ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
 492	btrfs_put_caching_control(caching_ctl);
 493	return ret;
 494}
 495
 496#ifdef CONFIG_BTRFS_DEBUG
 497static void fragment_free_space(struct btrfs_block_group *block_group)
 498{
 499	struct btrfs_fs_info *fs_info = block_group->fs_info;
 500	u64 start = block_group->start;
 501	u64 len = block_group->length;
 502	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 503		fs_info->nodesize : fs_info->sectorsize;
 504	u64 step = chunk << 1;
 505
 506	while (len > chunk) {
 507		btrfs_remove_free_space(block_group, start, chunk);
 508		start += step;
 509		if (len < step)
 510			len = 0;
 511		else
 512			len -= step;
 513	}
 514}
 515#endif
 516
 517/*
 518 * Add a free space range to the in memory free space cache of a block group.
 519 * This checks if the range contains super block locations and any such
 520 * locations are not added to the free space cache.
 521 *
 522 * @block_group:      The target block group.
 523 * @start:            Start offset of the range.
 524 * @end:              End offset of the range (exclusive).
 525 * @total_added_ret:  Optional pointer to return the total amount of space
 526 *                    added to the block group's free space cache.
 527 *
 528 * Returns 0 on success or < 0 on error.
 529 */
 530int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
 531			     u64 end, u64 *total_added_ret)
 532{
 533	struct btrfs_fs_info *info = block_group->fs_info;
 534	u64 extent_start, extent_end, size;
 535	int ret;
 536
 537	if (total_added_ret)
 538		*total_added_ret = 0;
 539
 540	while (start < end) {
 541		if (!btrfs_find_first_extent_bit(&info->excluded_extents, start,
 542						 &extent_start, &extent_end,
 543						 EXTENT_DIRTY, NULL))
 544			break;
 545
 546		if (extent_start <= start) {
 547			start = extent_end + 1;
 548		} else if (extent_start > start && extent_start < end) {
 549			size = extent_start - start;
 550			ret = btrfs_add_free_space_async_trimmed(block_group,
 551								 start, size);
 552			if (ret)
 553				return ret;
 554			if (total_added_ret)
 555				*total_added_ret += size;
 556			start = extent_end + 1;
 557		} else {
 558			break;
 559		}
 560	}
 561
 562	if (start < end) {
 563		size = end - start;
 564		ret = btrfs_add_free_space_async_trimmed(block_group, start,
 565							 size);
 566		if (ret)
 567			return ret;
 568		if (total_added_ret)
 569			*total_added_ret += size;
 570	}
 571
 572	return 0;
 573}
 574
 575/*
 576 * Get an arbitrary extent item index / max_index through the block group
 577 *
 578 * @block_group   the block group to sample from
 579 * @index:        the integral step through the block group to grab from
 580 * @max_index:    the granularity of the sampling
 581 * @key:          return value parameter for the item we find
 582 *
 583 * Pre-conditions on indices:
 584 * 0 <= index <= max_index
 585 * 0 < max_index
 586 *
 587 * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
 588 * error code on error.
 589 */
 590static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
 591					  struct btrfs_block_group *block_group,
 592					  int index, int max_index,
 593					  struct btrfs_key *found_key)
 594{
 595	struct btrfs_fs_info *fs_info = block_group->fs_info;
 596	struct btrfs_root *extent_root;
 597	u64 search_offset;
 598	u64 search_end = block_group->start + block_group->length;
 599	BTRFS_PATH_AUTO_FREE(path);
 600	struct btrfs_key search_key;
 601	int ret = 0;
 602
 603	ASSERT(index >= 0);
 604	ASSERT(index <= max_index);
 605	ASSERT(max_index > 0);
 606	lockdep_assert_held(&caching_ctl->mutex);
 607	lockdep_assert_held_read(&fs_info->commit_root_sem);
 608
 609	path = btrfs_alloc_path();
 610	if (!path)
 611		return -ENOMEM;
 612
 613	extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
 614						       BTRFS_SUPER_INFO_OFFSET));
 615
 616	path->skip_locking = true;
 617	path->search_commit_root = true;
 618	path->reada = READA_FORWARD;
 619
 620	search_offset = index * div_u64(block_group->length, max_index);
 621	search_key.objectid = block_group->start + search_offset;
 622	search_key.type = BTRFS_EXTENT_ITEM_KEY;
 623	search_key.offset = 0;
 624
 625	btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
 626		/* Success; sampled an extent item in the block group */
 627		if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
 628		    found_key->objectid >= block_group->start &&
 629		    found_key->objectid + found_key->offset <= search_end)
 630			break;
 631
 632		/* We can't possibly find a valid extent item anymore */
 633		if (found_key->objectid >= search_end) {
 634			ret = 1;
 635			break;
 636		}
 637	}
 638
 639	lockdep_assert_held(&caching_ctl->mutex);
 640	lockdep_assert_held_read(&fs_info->commit_root_sem);
 641	return ret;
 642}
 643
 644/*
 645 * Best effort attempt to compute a block group's size class while caching it.
 646 *
 647 * @block_group: the block group we are caching
 648 *
 649 * We cannot infer the size class while adding free space extents, because that
 650 * logic doesn't care about contiguous file extents (it doesn't differentiate
 651 * between a 100M extent and 100 contiguous 1M extents). So we need to read the
 652 * file extent items. Reading all of them is quite wasteful, because usually
 653 * only a handful are enough to give a good answer. Therefore, we just grab 5 of
 654 * them at even steps through the block group and pick the smallest size class
 655 * we see. Since size class is best effort, and not guaranteed in general,
 656 * inaccuracy is acceptable.
 657 *
 658 * To be more explicit about why this algorithm makes sense:
 659 *
 660 * If we are caching in a block group from disk, then there are three major cases
 661 * to consider:
 662 * 1. the block group is well behaved and all extents in it are the same size
 663 *    class.
 664 * 2. the block group is mostly one size class with rare exceptions for last
 665 *    ditch allocations
 666 * 3. the block group was populated before size classes and can have a totally
 667 *    arbitrary mix of size classes.
 668 *
 669 * In case 1, looking at any extent in the block group will yield the correct
 670 * result. For the mixed cases, taking the minimum size class seems like a good
 671 * approximation, since gaps from frees will be usable to the size class. For
 672 * 2., a small handful of file extents is likely to yield the right answer. For
 673 * 3, we can either read every file extent, or admit that this is best effort
 674 * anyway and try to stay fast.
 675 *
 676 * Returns: 0 on success, negative error code on error.
 677 */
 678static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
 679				       struct btrfs_block_group *block_group)
 680{
 681	struct btrfs_fs_info *fs_info = block_group->fs_info;
 682	struct btrfs_key key;
 683	int i;
 684	u64 min_size = block_group->length;
 685	enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
 686	int ret;
 687
 688	if (!btrfs_block_group_should_use_size_class(block_group))
 689		return 0;
 690
 691	lockdep_assert_held(&caching_ctl->mutex);
 692	lockdep_assert_held_read(&fs_info->commit_root_sem);
 693	for (i = 0; i < 5; ++i) {
 694		ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
 695		if (ret < 0)
 696			goto out;
 697		if (ret > 0)
 698			continue;
 699		min_size = min_t(u64, min_size, key.offset);
 700		size_class = btrfs_calc_block_group_size_class(min_size);
 701	}
 702	if (size_class != BTRFS_BG_SZ_NONE) {
 703		spin_lock(&block_group->lock);
 704		block_group->size_class = size_class;
 705		spin_unlock(&block_group->lock);
 706	}
 707out:
 708	return ret;
 709}
 710
 711static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 712{
 713	struct btrfs_block_group *block_group = caching_ctl->block_group;
 714	struct btrfs_fs_info *fs_info = block_group->fs_info;
 715	struct btrfs_root *extent_root;
 716	BTRFS_PATH_AUTO_FREE(path);
 717	struct extent_buffer *leaf;
 718	struct btrfs_key key;
 719	u64 total_found = 0;
 720	u64 last = 0;
 721	u32 nritems;
 722	int ret;
 723	bool wakeup = true;
 724
 725	path = btrfs_alloc_path();
 726	if (!path)
 727		return -ENOMEM;
 728
 729	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
 730	extent_root = btrfs_extent_root(fs_info, last);
 731
 732#ifdef CONFIG_BTRFS_DEBUG
 733	/*
 734	 * If we're fragmenting we don't want to make anybody think we can
 735	 * allocate from this block group until we've had a chance to fragment
 736	 * the free space.
 737	 */
 738	if (btrfs_should_fragment_free_space(block_group))
 739		wakeup = false;
 740#endif
 741	/*
 742	 * We don't want to deadlock with somebody trying to allocate a new
 743	 * extent for the extent root while also trying to search the extent
 744	 * root to add free space.  So we skip locking and search the commit
 745	 * root, since its read-only
 746	 */
 747	path->skip_locking = true;
 748	path->search_commit_root = true;
 749	path->reada = READA_FORWARD;
 750
 751	key.objectid = last;
 752	key.type = BTRFS_EXTENT_ITEM_KEY;
 753	key.offset = 0;
 754
 755next:
 756	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 757	if (ret < 0)
 758		goto out;
 759
 760	leaf = path->nodes[0];
 761	nritems = btrfs_header_nritems(leaf);
 762
 763	while (1) {
 764		if (btrfs_fs_closing(fs_info) > 1) {
 765			last = (u64)-1;
 766			break;
 767		}
 768
 769		if (path->slots[0] < nritems) {
 770			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 771		} else {
 772			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
 773			if (ret)
 774				break;
 775
 776			if (need_resched() ||
 777			    rwsem_is_contended(&fs_info->commit_root_sem)) {
 778				btrfs_release_path(path);
 779				up_read(&fs_info->commit_root_sem);
 780				mutex_unlock(&caching_ctl->mutex);
 781				cond_resched();
 782				mutex_lock(&caching_ctl->mutex);
 783				down_read(&fs_info->commit_root_sem);
 784				goto next;
 785			}
 786
 787			ret = btrfs_next_leaf(extent_root, path);
 788			if (ret < 0)
 789				goto out;
 790			if (ret)
 791				break;
 792			leaf = path->nodes[0];
 793			nritems = btrfs_header_nritems(leaf);
 794			continue;
 795		}
 796
 797		if (key.objectid < last) {
 798			key.objectid = last;
 799			key.type = BTRFS_EXTENT_ITEM_KEY;
 800			key.offset = 0;
 801			btrfs_release_path(path);
 802			goto next;
 803		}
 804
 805		if (key.objectid < block_group->start) {
 806			path->slots[0]++;
 807			continue;
 808		}
 809
 810		if (key.objectid >= block_group->start + block_group->length)
 811			break;
 812
 813		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 814		    key.type == BTRFS_METADATA_ITEM_KEY) {
 815			u64 space_added;
 816
 817			ret = btrfs_add_new_free_space(block_group, last,
 818						       key.objectid, &space_added);
 819			if (ret)
 820				goto out;
 821			total_found += space_added;
 822			if (key.type == BTRFS_METADATA_ITEM_KEY)
 823				last = key.objectid +
 824					fs_info->nodesize;
 825			else
 826				last = key.objectid + key.offset;
 827
 828			if (total_found > CACHING_CTL_WAKE_UP) {
 829				total_found = 0;
 830				if (wakeup) {
 831					atomic_inc(&caching_ctl->progress);
 832					wake_up(&caching_ctl->wait);
 833				}
 834			}
 835		}
 836		path->slots[0]++;
 837	}
 838
 839	ret = btrfs_add_new_free_space(block_group, last,
 840				       block_group->start + block_group->length,
 841				       NULL);
 842out:
 843	return ret;
 844}
 845
 846static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
 847{
 848	btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
 849			       bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
 850}
 851
 852static noinline void caching_thread(struct btrfs_work *work)
 853{
 854	struct btrfs_block_group *block_group;
 855	struct btrfs_fs_info *fs_info;
 856	struct btrfs_caching_control *caching_ctl;
 857	int ret;
 858
 859	caching_ctl = container_of(work, struct btrfs_caching_control, work);
 860	block_group = caching_ctl->block_group;
 861	fs_info = block_group->fs_info;
 862
 863	mutex_lock(&caching_ctl->mutex);
 864	down_read(&fs_info->commit_root_sem);
 865
 866	load_block_group_size_class(caching_ctl, block_group);
 867	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 868		ret = load_free_space_cache(block_group);
 869		if (ret == 1) {
 870			ret = 0;
 871			goto done;
 872		}
 873
 874		/*
 875		 * We failed to load the space cache, set ourselves to
 876		 * CACHE_STARTED and carry on.
 877		 */
 878		spin_lock(&block_group->lock);
 879		block_group->cached = BTRFS_CACHE_STARTED;
 880		spin_unlock(&block_group->lock);
 881		wake_up(&caching_ctl->wait);
 882	}
 883
 884	/*
 885	 * If we are in the transaction that populated the free space tree we
 886	 * can't actually cache from the free space tree as our commit root and
 887	 * real root are the same, so we could change the contents of the blocks
 888	 * while caching.  Instead do the slow caching in this case, and after
 889	 * the transaction has committed we will be safe.
 890	 */
 891	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
 892	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
 893		ret = btrfs_load_free_space_tree(caching_ctl);
 894	else
 895		ret = load_extent_tree_free(caching_ctl);
 896done:
 897	spin_lock(&block_group->lock);
 898	block_group->caching_ctl = NULL;
 899	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 900	spin_unlock(&block_group->lock);
 901
 902#ifdef CONFIG_BTRFS_DEBUG
 903	if (btrfs_should_fragment_free_space(block_group)) {
 904		u64 bytes_used;
 905
 906		spin_lock(&block_group->space_info->lock);
 907		spin_lock(&block_group->lock);
 908		bytes_used = block_group->length - block_group->used;
 909		block_group->space_info->bytes_used += bytes_used >> 1;
 910		spin_unlock(&block_group->lock);
 911		spin_unlock(&block_group->space_info->lock);
 912		fragment_free_space(block_group);
 913	}
 914#endif
 915
 916	up_read(&fs_info->commit_root_sem);
 917	btrfs_free_excluded_extents(block_group);
 918	mutex_unlock(&caching_ctl->mutex);
 919
 920	wake_up(&caching_ctl->wait);
 921
 922	btrfs_put_caching_control(caching_ctl);
 923	btrfs_put_block_group(block_group);
 924}
 925
 926int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
 927{
 928	struct btrfs_fs_info *fs_info = cache->fs_info;
 929	struct btrfs_caching_control *caching_ctl = NULL;
 930	int ret = 0;
 931
 932	/* Allocator for zoned filesystems does not use the cache at all */
 933	if (btrfs_is_zoned(fs_info))
 934		return 0;
 935
 936	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 937	if (!caching_ctl)
 938		return -ENOMEM;
 939
 940	INIT_LIST_HEAD(&caching_ctl->list);
 941	mutex_init(&caching_ctl->mutex);
 942	init_waitqueue_head(&caching_ctl->wait);
 943	caching_ctl->block_group = cache;
 944	refcount_set(&caching_ctl->count, 2);
 945	atomic_set(&caching_ctl->progress, 0);
 946	btrfs_init_work(&caching_ctl->work, caching_thread, NULL);
 947
 948	spin_lock(&cache->lock);
 949	if (cache->cached != BTRFS_CACHE_NO) {
 950		kfree(caching_ctl);
 951
 952		caching_ctl = cache->caching_ctl;
 953		if (caching_ctl)
 954			refcount_inc(&caching_ctl->count);
 955		spin_unlock(&cache->lock);
 956		goto out;
 957	}
 958	WARN_ON(cache->caching_ctl);
 959	cache->caching_ctl = caching_ctl;
 960	cache->cached = BTRFS_CACHE_STARTED;
 961	spin_unlock(&cache->lock);
 962
 963	write_lock(&fs_info->block_group_cache_lock);
 964	refcount_inc(&caching_ctl->count);
 965	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 966	write_unlock(&fs_info->block_group_cache_lock);
 967
 968	btrfs_get_block_group(cache);
 969
 970	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 971out:
 972	if (wait && caching_ctl)
 973		ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
 974	if (caching_ctl)
 975		btrfs_put_caching_control(caching_ctl);
 976
 977	return ret;
 978}
 979
 980static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 981{
 982	u64 extra_flags = chunk_to_extended(flags) &
 983				BTRFS_EXTENDED_PROFILE_MASK;
 984
 985	write_seqlock(&fs_info->profiles_lock);
 986	if (flags & BTRFS_BLOCK_GROUP_DATA)
 987		fs_info->avail_data_alloc_bits &= ~extra_flags;
 988	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 989		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
 990	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 991		fs_info->avail_system_alloc_bits &= ~extra_flags;
 992	write_sequnlock(&fs_info->profiles_lock);
 993}
 994
 995/*
 996 * Clear incompat bits for the following feature(s):
 997 *
 998 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 999 *            in the whole filesystem
1000 *
1001 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
1002 */
1003static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
1004{
1005	bool found_raid56 = false;
1006	bool found_raid1c34 = false;
1007
1008	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
1009	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
1010	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
1011		struct list_head *head = &fs_info->space_info;
1012		struct btrfs_space_info *sinfo;
1013
1014		list_for_each_entry_rcu(sinfo, head, list) {
1015			down_read(&sinfo->groups_sem);
1016			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
1017				found_raid56 = true;
1018			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
1019				found_raid56 = true;
1020			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
1021				found_raid1c34 = true;
1022			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
1023				found_raid1c34 = true;
1024			up_read(&sinfo->groups_sem);
1025		}
1026		if (!found_raid56)
1027			btrfs_clear_fs_incompat(fs_info, RAID56);
1028		if (!found_raid1c34)
1029			btrfs_clear_fs_incompat(fs_info, RAID1C34);
1030	}
1031}
1032
1033static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
1034{
1035	if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
1036		return fs_info->block_group_root;
1037	return btrfs_extent_root(fs_info, 0);
1038}
1039
1040static int remove_block_group_item(struct btrfs_trans_handle *trans,
1041				   struct btrfs_path *path,
1042				   struct btrfs_block_group *block_group)
1043{
1044	struct btrfs_fs_info *fs_info = trans->fs_info;
1045	struct btrfs_root *root;
1046	struct btrfs_key key;
1047	int ret;
1048
1049	root = btrfs_block_group_root(fs_info);
1050	key.objectid = block_group->start;
1051	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1052	key.offset = block_group->length;
1053
1054	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1055	if (ret > 0)
1056		ret = -ENOENT;
1057	if (ret < 0)
1058		return ret;
1059
1060	ret = btrfs_del_item(trans, root, path);
1061	return ret;
1062}
1063
/*
 * Remove the block group that starts at @map->start from the in-memory
 * tracking structures and delete its items from the trees.
 *
 * @trans: transaction handle used for all tree modifications
 * @map:   chunk map of the block group; it is removed from the mapping tree
 *         here only if the block group is not frozen at that point
 *
 * The block group must already be read-only (enforced with BUG_ON()).
 *
 * Returns 0 on success, -ENOENT if no block group is found at @map->start,
 * -ENOMEM if the path cannot be allocated, or the error returned by one of
 * the removal helpers called below.
 */
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_chunk_map *map)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	/* Declared through BTRFS_PATH_AUTO_FREE, no explicit free below. */
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_block_group *block_group;
	struct btrfs_free_cluster *cluster;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_map;
	/* Set when the block group was taken off the dirty list. */
	bool remove_rsv = false;

	/* This reference is dropped under the "out" label. */
	block_group = btrfs_lookup_block_group(fs_info, map->start);
	if (!block_group)
		return -ENOENT;

	BUG_ON(!block_group->ro);

	trace_btrfs_remove_block_group(block_group);
	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	btrfs_free_excluded_extents(block_group);
	btrfs_free_ref_tree_range(fs_info, block_group->start,
				  block_group->length);

	index = btrfs_bg_flags_to_raid_index(block_group->flags);
	factor = btrfs_bg_type_to_factor(block_group->flags);

	/* Make sure this block group isn't part of an allocation cluster. */
	cluster = &fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * Make sure this block group isn't part of a metadata
	 * allocation cluster.
	 */
	cluster = &fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now).
	 */
	inode = lookup_free_space_inode(block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * Make sure our free space cache IO is done before removing the
	 * free space inode.
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		/* Drop the spinlock around the wait for the cache IO. */
		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(trans, block_group, path);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		/*
		 * Remember to call btrfs_dec_delayed_refs_rsv_bg_updates()
		 * on the way out.
		 */
		remove_rsv = true;
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
	if (ret)
		goto out;

	/* Unlink the block group from the block group rbtree. */
	write_lock(&fs_info->block_group_cache_lock);
	rb_erase_cached(&block_group->cache_node,
			&fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	/* Once for the block groups rbtree */
	btrfs_put_block_group(block_group);

	write_unlock(&fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * We must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore.
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		/*
		 * That was the last block group with this raid index in the
		 * space_info: detach its kobject (released below, outside
		 * the semaphore) and clear the matching available bits.
		 */
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	clear_incompat_bg_bits(fs_info, block_group->flags);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	/* Caching was started for this block group, wait until it is done. */
	if (block_group->cached == BTRFS_CACHE_STARTED)
		btrfs_wait_block_group_cache_done(block_group);

	/*
	 * Find the caching control of this block group, either the one still
	 * attached to it or one left on fs_info->caching_block_groups, and
	 * take it off that list.
	 */
	write_lock(&fs_info->block_group_cache_lock);
	caching_ctl = btrfs_get_caching_control(block_group);
	if (!caching_ctl) {
		struct btrfs_caching_control *ctl;

		list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
			if (ctl->block_group == block_group) {
				caching_ctl = ctl;
				refcount_inc(&caching_ctl->count);
				break;
			}
		}
	}
	if (caching_ctl)
		list_del_init(&caching_ctl->list);
	write_unlock(&fs_info->block_group_cache_lock);

	if (caching_ctl) {
		/* Once for the caching bgs list and once for us. */
		btrfs_put_caching_control(caching_ctl);
		btrfs_put_caching_control(caching_ctl);
	}

	/* Both lists were emptied above, they must not have been refilled. */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);

	btrfs_remove_free_space_cache(block_group);

	/* Subtract this block group's contribution from the space_info. */
	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		/* Warn if any of the subtractions below would underflow. */
		WARN_ON(block_group->space_info->total_bytes
			< block_group->length);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->length - block_group->zone_unusable);
		WARN_ON(block_group->space_info->bytes_zone_unusable
			< block_group->zone_unusable);
		WARN_ON(block_group->space_info->disk_total
			< block_group->length * factor);
	}
	block_group->space_info->total_bytes -= block_group->length;
	block_group->space_info->bytes_readonly -=
		(block_group->length - block_group->zone_unusable);
	btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
						    -block_group->zone_unusable);
	block_group->space_info->disk_total -= block_group->length * factor;

	spin_unlock(&block_group->space_info->lock);

	/*
	 * Remove the free space for the block group from the free space tree
	 * and the block group's item from the extent tree before marking the
	 * block group as removed. This is to prevent races with tasks that
	 * freeze and unfreeze a block group, this task and another task
	 * allocating a new block group - the unfreeze task ends up removing
	 * the block group's extent map before the task calling this function
	 * deletes the block group item from the extent tree, allowing for
	 * another task to attempt to create another block group with the same
	 * item key (and failing with -EEXIST and a transaction abort).
	 */
	ret = btrfs_remove_block_group_free_space(trans, block_group);
	if (ret)
		goto out;

	ret = remove_block_group_item(trans, path, block_group);
	if (ret < 0)
		goto out;

	spin_lock(&block_group->lock);
	/*
	 * Hitting this WARN means we removed a block group with an unwritten
	 * region. It will cause "unable to find chunk map for logical" errors.
	 */
	if (WARN_ON(has_unwritten_metadata(block_group)))
		btrfs_warn(fs_info,
			   "block group %llu is removed before metadata write out",
			   block_group->start);

	set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);

	/*
	 * At this point trimming or scrub can't start on this block group,
	 * because we removed the block group from the rbtree
	 * fs_info->block_group_cache_tree so no one can find it anymore and
	 * even if someone already got this block group before we removed it
	 * from the rbtree, they have already incremented block_group->frozen -
	 * if they didn't, for the trimming case they won't find any free space
	 * entries because we already removed them all when we called
	 * btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the chunk map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group. This is needed to
	 * avoid races with trimming and scrub.
	 *
	 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard. The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_map = (atomic_read(&block_group->frozen) == 0);
	spin_unlock(&block_group->lock);

	/* Not frozen: the chunk map can be removed right away. */
	if (remove_map)
		btrfs_remove_chunk_map(fs_info, map);

out:
	/* Once for the lookup reference */
	btrfs_put_block_group(block_group);
	if (remove_rsv)
		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
	return ret;
}
1310
1311struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1312		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1313{
1314	struct btrfs_root *root = btrfs_block_group_root(fs_info);
1315	struct btrfs_chunk_map *map;
1316	unsigned int num_items;
1317
1318	map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
1319	ASSERT(map != NULL);
1320	ASSERT(map->start == chunk_offset);
1321
1322	/*
1323	 * We need to reserve 3 + N units from the metadata space info in order
1324	 * to remove a block group (done at btrfs_remove_chunk() and at
1325	 * btrfs_remove_block_group()), which are used for:
1326	 *
1327	 * 1 unit for adding the free space inode's orphan (located in the tree
1328	 * of tree roots).
1329	 * 1 unit for deleting the block group item (located in the extent
1330	 * tree).
1331	 * 1 unit for deleting the free space item (located in tree of tree
1332	 * roots).
1333	 * N units for deleting N device extent items corresponding to each
1334	 * stripe (located in the device tree).
1335	 *
1336	 * In order to remove a block group we also need to reserve units in the
1337	 * system space info in order to update the chunk tree (update one or
1338	 * more device items and remove one chunk item), but this is done at
1339	 * btrfs_remove_chunk() through a call to check_system_chunk().
1340	 */
1341	num_items = 3 + map->num_stripes;
1342	btrfs_free_chunk_map(map);
1343
1344	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
1345}
1346
1347/*
1348 * Mark block group @cache read-only, so later write won't happen to block
1349 * group @cache.
1350 *
1351 * If @force is not set, this function will only mark the block group readonly
1352 * if we have enough free space (1M) in other metadata/system block groups.
1353 * If @force is not set, this function will mark the block group readonly
1354 * without checking free space.
1355 *
1356 * NOTE: This function doesn't care if other block groups can contain all the
1357 * data in this block group. That check should be done by relocation routine,
1358 * not this function.
1359 */
1360static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
1361{
1362	struct btrfs_space_info *sinfo = cache->space_info;
1363	u64 num_bytes;
1364	int ret = -ENOSPC;
1365
1366	spin_lock(&sinfo->lock);
1367	spin_lock(&cache->lock);
1368
1369	if (cache->swap_extents) {
1370		ret = -ETXTBSY;
1371		goto out;
1372	}
1373
1374	if (cache->ro) {
1375		cache->ro++;
1376		ret = 0;
1377		goto out;
1378	}
1379
1380	num_bytes = cache->length - cache->reserved - cache->pinned -
1381		    cache->bytes_super - cache->zone_unusable - cache->used;
1382
1383	/*
1384	 * Data never overcommits, even in mixed mode, so do just the straight
1385	 * check of left over space in how much we have allocated.
1386	 */
1387	if (force) {
1388		ret = 0;
1389	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1390		u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1391
1392		/*
1393		 * Here we make sure if we mark this bg RO, we still have enough
1394		 * free space as buffer.
1395		 */
1396		if (sinfo_used + num_bytes <= sinfo->total_bytes)
1397			ret = 0;
1398	} else {
1399		/*
1400		 * We overcommit metadata, so we need to do the
1401		 * btrfs_can_overcommit check here, and we need to pass in
1402		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1403		 * leeway to allow us to mark this block group as read only.
1404		 */
1405		if (btrfs_can_overcommit(sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH))
1406			ret = 0;
1407	}
1408
1409	if (!ret) {
1410		sinfo->bytes_readonly += num_bytes;
1411		if (btrfs_is_zoned(cache->fs_info)) {
1412			/* Migrate zone_unusable bytes to readonly */
1413			sinfo->bytes_readonly += cache->zone_unusable;
1414			btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable);
1415			cache->zone_unusable = 0;
1416		}
1417		cache->ro++;
1418		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1419	}
1420out:
1421	spin_unlock(&cache->lock);
1422	spin_unlock(&sinfo->lock);
1423	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1424		btrfs_info(cache->fs_info,
1425			"unable to make block group %llu ro", cache->start);
1426		btrfs_dump_space_info(cache->space_info, 0, false);
1427	}
1428	return ret;
1429}
1430
1431static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1432				 const struct btrfs_block_group *bg)
1433{
1434	struct btrfs_fs_info *fs_info = trans->fs_info;
1435	struct btrfs_transaction *prev_trans = NULL;
1436	const u64 start = bg->start;
1437	const u64 end = start + bg->length - 1;
1438	int ret;
1439
1440	spin_lock(&fs_info->trans_lock);
1441	if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) {
1442		prev_trans = list_prev_entry(trans->transaction, list);
1443		refcount_inc(&prev_trans->use_count);
1444	}
1445	spin_unlock(&fs_info->trans_lock);
1446
1447	/*
1448	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
1449	 * btrfs_finish_extent_commit(). If we are at transaction N, another
1450	 * task might be running finish_extent_commit() for the previous
1451	 * transaction N - 1, and have seen a range belonging to the block
1452	 * group in pinned_extents before we were able to clear the whole block
1453	 * group range from pinned_extents. This means that task can lookup for
1454	 * the block group after we unpinned it from pinned_extents and removed
1455	 * it, leading to an error at unpin_extent_range().
1456	 */
1457	mutex_lock(&fs_info->unused_bg_unpin_mutex);
1458	if (prev_trans) {
1459		ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
1460					     EXTENT_DIRTY, NULL);
1461		if (ret)
1462			goto out;
1463	}
1464
1465	ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
1466				     EXTENT_DIRTY, NULL);
1467out:
1468	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1469	if (prev_trans)
1470		btrfs_put_transaction(prev_trans);
1471
1472	return ret == 0;
1473}
1474
1475/*
1476 * Link the block_group to a list via bg_list.
1477 *
1478 * @bg:       The block_group to link to the list.
1479 * @list:     The list to link it to.
1480 *
1481 * Use this rather than list_add_tail() directly to ensure proper respect
1482 * to locking and refcounting.
1483 *
1484 * Returns: true if the bg was linked with a refcount bump and false otherwise.
1485 */
1486static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
1487{
1488	struct btrfs_fs_info *fs_info = bg->fs_info;
1489	bool added = false;
1490
1491	spin_lock(&fs_info->unused_bgs_lock);
1492	if (list_empty(&bg->bg_list)) {
1493		btrfs_get_block_group(bg);
1494		list_add_tail(&bg->bg_list, list);
1495		added = true;
1496	}
1497	spin_unlock(&fs_info->unused_bgs_lock);
1498	return added;
1499}
1500
/*
 * Process the unused_bgs list and remove any block groups that don't have
 * any allocated space inside of them.
 *
 * Nothing is done if the filesystem is not open, is closing, or if
 * fs_info->reclaim_bgs_lock cannot be taken without blocking. Block groups
 * that can't be removed right now but should be looked at again are collected
 * on a local retry list, which is spliced back onto fs_info->unused_bgs
 * before returning.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(retry_list);
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	/* Snapshot of the option, compared against its live value below. */
	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	if (btrfs_fs_closing(fs_info))
		return;

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip deletion if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
		return;

	/*
	 * unused_bgs_lock is held at the top of every iteration and dropped
	 * while a block group is being processed.
	 */
	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		u64 used;
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		/*
		 * After an earlier error, or for a mixed space_info, only
		 * drop the list's reference and move on to the next entry.
		 */
		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		spin_unlock(&fs_info->unused_bgs_lock);

		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		/*
		 * Async discard moves the final block group discard to be prior
		 * to the unused_bgs code path.  Therefore, if it's not fully
		 * trimmed, punt it back to the async discard lists.
		 */
		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
		    !btrfs_is_free_space_trimmed(block_group)) {
			trace_btrfs_skip_unused_block_group(block_group);
			up_write(&space_info->groups_sem);
			/* Requeue if we failed because of async discard */
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto next;
		}

		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);
		if (btrfs_is_block_group_used(block_group) || block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group.  We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 *
			 * Also bail out if this is the only block group for its
			 * type, because otherwise we would lose profile
			 * information from fs_info->avail_*_alloc_bits and the
			 * next block group of this type would be created with a
			 * "single" profile (even if we're in a raid fs) because
			 * fs_info->avail_*_alloc_bits would be 0.
			 */
			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			spin_unlock(&space_info->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}

		/*
		 * The block group may be unused but there may be space reserved
		 * accounting with the existence of that block group, that is,
		 * space_info->bytes_may_use was incremented by a task but no
		 * space was yet allocated from the block group by the task.
		 * That space may or may not be allocated, as we are generally
		 * pessimistic about space reservation for metadata as well as
		 * for data when using compression (as we reserve space based on
		 * the worst case, when data can't be compressed, and before
		 * actually attempting compression, before starting writeback).
		 *
		 * So check if the total space of the space_info minus the size
		 * of this block group is less than the used space of the
		 * space_info - if that's the case, then it means we have tasks
		 * that might be relying on the block group in order to allocate
		 * extents, and add back the block group to the unused list when
		 * we finish, so that we retry later in case no tasks ended up
		 * needing to allocate extents from the block group.
		 *
		 * A block group with unwritten metadata (see
		 * has_unwritten_metadata()) is retried later as well.
		 */
		used = btrfs_space_info_used(space_info, true);
		if ((space_info->total_bytes - block_group->length < used &&
		     block_group->zone_unusable < block_group->length) ||
		    has_unwritten_metadata(block_group)) {
			/*
			 * Add a reference for the list, compensate for the ref
			 * drop under the "next" label for the
			 * fs_info->unused_bgs list.
			 */
			btrfs_link_bg_list(block_group, &retry_list);

			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			spin_unlock(&space_info->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			/* Not an error for the loop, just skip this one. */
			ret = 0;
			goto next;
		}

		ret = btrfs_zone_finish(block_group);
		if (ret < 0) {
			btrfs_dec_block_group_ro(block_group);
			/* -EAGAIN is retried later, other errors stop us. */
			if (ret == -EAGAIN) {
				btrfs_link_bg_list(block_group, &retry_list);
				ret = 0;
			}
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
						     block_group->start);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		if (!clean_pinned_extents(trans, block_group)) {
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}

		/*
		 * At this point, the block_group is read only and should fail
		 * new allocations.  However, btrfs_finish_extent_commit() can
		 * cause this block_group to be placed back on the discard
		 * lists because now the block_group isn't fully discarded.
		 * Bail here and try again later after discarding everything.
		 */
		spin_lock(&fs_info->discard_ctl.lock);
		if (!list_empty(&block_group->discard_list)) {
			spin_unlock(&fs_info->discard_ctl.lock);
			btrfs_dec_block_group_ro(block_group);
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto end_trans;
		}
		spin_unlock(&fs_info->discard_ctl.lock);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		/* The pinned bytes are accounted as read-only from now on. */
		btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned);
		space_info->bytes_readonly += block_group->pinned;
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/*
		 * The normal path here is an unused block group is passed here,
		 * then trimming is handled in the transaction commit path.
		 * Async discard interposes before this to do the trimming
		 * before coming down the unused block group path as trimming
		 * will no longer be done later in the transaction commit path.
		 *
		 * If async discard was off when we started but is on now, stop
		 * processing and take the flip_async exit.
		 */
		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
			goto flip_async;

		/*
		 * DISCARD can flip during remount. On zoned filesystems, we
		 * need to reset sequential-required zones.
		 */
		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
				btrfs_is_zoned(fs_info);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_freeze_block_group(block_group);

		/*
		 * Btrfs_remove_chunk will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, block_group->start);

		if (ret) {
			if (trimming)
				btrfs_unfreeze_block_group(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group. Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			/* Reference for the deleted_bgs list. */
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans);
next:
		/* Drop the reference that the unused_bgs list was holding. */
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	/* Give the block groups to retry back to the unused list. */
	list_splice_tail(&retry_list, &fs_info->unused_bgs);
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	return;

flip_async:
	/*
	 * Reached from inside the loop with unused_bgs_lock not held and a
	 * transaction open: finish the transaction, requeue the retry list,
	 * drop the current block group's reference and hand the unused list
	 * over to btrfs_discard_punt_unused_bgs_list().
	 */
	btrfs_end_transaction(trans);
	spin_lock(&fs_info->unused_bgs_lock);
	list_splice_tail(&retry_list, &fs_info->unused_bgs);
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_put_block_group(block_group);
	btrfs_discard_punt_unused_bgs_list(fs_info);
}
1766
1767void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1768{
1769	struct btrfs_fs_info *fs_info = bg->fs_info;
1770
1771	spin_lock(&fs_info->unused_bgs_lock);
1772	if (list_empty(&bg->bg_list)) {
1773		btrfs_get_block_group(bg);
1774		trace_btrfs_add_unused_block_group(bg);
1775		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1776	} else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
1777		/* Pull out the block group from the reclaim_bgs list. */
1778		trace_btrfs_add_unused_block_group(bg);
1779		list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
1780	}
1781	spin_unlock(&fs_info->unused_bgs_lock);
1782}
1783
1784/*
1785 * We want block groups with a low number of used bytes to be in the beginning
1786 * of the list, so they will get reclaimed first.
1787 */
1788static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
1789			   const struct list_head *b)
1790{
1791	const struct btrfs_block_group *bg1, *bg2;
1792
1793	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1794	bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1795
1796	/*
1797	 * Some other task may be updating the ->used field concurrently, but it
1798	 * is not serious if we get a stale value or load/store tearing issues,
1799	 * as sorting the list of block groups to reclaim is not critical and an
1800	 * occasional imperfect order is ok. So silence KCSAN and avoid the
1801	 * overhead of locking or any other synchronization.
1802	 */
1803	return data_race(bg1->used > bg2->used);
1804}
1805
1806static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
1807{
1808	if (btrfs_is_zoned(fs_info))
1809		return btrfs_zoned_should_reclaim(fs_info);
1810	return true;
1811}
1812
1813static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed)
1814{
1815	const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
1816	u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
1817	const u64 new_val = bg->used;
1818	const u64 old_val = new_val + bytes_freed;
1819
1820	if (thresh_bytes == 0)
1821		return false;
1822
1823	/*
1824	 * If we were below the threshold before don't reclaim, we are likely a
1825	 * brand new block group and we don't want to relocate new block groups.
1826	 */
1827	if (old_val < thresh_bytes)
1828		return false;
1829	if (new_val >= thresh_bytes)
1830		return false;
1831	return true;
1832}
1833
1834void btrfs_reclaim_bgs_work(struct work_struct *work)
1835{
1836	struct btrfs_fs_info *fs_info =
1837		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1838	struct btrfs_block_group *bg;
1839	struct btrfs_space_info *space_info;
1840	LIST_HEAD(retry_list);
1841
1842	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1843		return;
1844
1845	if (btrfs_fs_closing(fs_info))
1846		return;
1847
1848	if (!btrfs_should_reclaim(fs_info))
1849		return;
1850
1851	guard(super_write)(fs_info->sb);
1852
1853	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
1854		return;
1855
1856	/*
1857	 * Long running balances can keep us blocked here for eternity, so
1858	 * simply skip reclaim if we're unable to get the mutex.
1859	 */
1860	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1861		btrfs_exclop_finish(fs_info);
1862		return;
1863	}
1864
1865	spin_lock(&fs_info->unused_bgs_lock);
1866	/*
1867	 * Sort happens under lock because we can't simply splice it and sort.
1868	 * The block groups might still be in use and reachable via bg_list,
1869	 * and their presence in the reclaim_bgs list must be preserved.
1870	 */
1871	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
1872	while (!list_empty(&fs_info->reclaim_bgs)) {
1873		u64 used;
1874		u64 reserved;
1875		int ret = 0;
1876
1877		bg = list_first_entry(&fs_info->reclaim_bgs,
1878				      struct btrfs_block_group,
1879				      bg_list);
1880		list_del_init(&bg->bg_list);
1881
1882		space_info = bg->space_info;
1883		spin_unlock(&fs_info->unused_bgs_lock);
1884
1885		/* Don't race with allocators so take the groups_sem */
1886		down_write(&space_info->groups_sem);
1887
1888		spin_lock(&space_info->lock);
1889		spin_lock(&bg->lock);
1890		if (bg->reserved || bg->pinned || bg->ro) {
1891			/*
1892			 * We want to bail if we made new allocations or have
1893			 * outstanding allocations in this block group.  We do
1894			 * the ro check in case balance is currently acting on
1895			 * this block group.
1896			 */
1897			spin_unlock(&bg->lock);
1898			spin_unlock(&space_info->lock);
1899			up_write(&space_info->groups_sem);
1900			goto next;
1901		}
1902		if (bg->used == 0) {
1903			/*
1904			 * It is possible that we trigger relocation on a block
1905			 * group as its extents are deleted and it first goes
1906			 * below the threshold, then shortly after goes empty.
1907			 *
1908			 * In this case, relocating it does delete it, but has
1909			 * some overhead in relocation specific metadata, looking
1910			 * for the non-existent extents and running some extra
1911			 * transactions, which we can avoid by using one of the
1912			 * other mechanisms for dealing with empty block groups.
1913			 */
1914			if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
1915				btrfs_mark_bg_unused(bg);
1916			spin_unlock(&bg->lock);
1917			spin_unlock(&space_info->lock);
1918			up_write(&space_info->groups_sem);
1919			goto next;
1920
1921		}
1922		/*
1923		 * The block group might no longer meet the reclaim condition by
1924		 * the time we get around to reclaiming it, so to avoid
1925		 * reclaiming overly full block_groups, skip reclaiming them.
1926		 *
1927		 * Since the decision making process also depends on the amount
1928		 * being freed, pass in a fake giant value to skip that extra
1929		 * check, which is more meaningful when adding to the list in
1930		 * the first place.
1931		 */
1932		if (!should_reclaim_block_group(bg, bg->length)) {
1933			spin_unlock(&bg->lock);
1934			spin_unlock(&space_info->lock);
1935			up_write(&space_info->groups_sem);
1936			goto next;
1937		}
1938
1939		spin_unlock(&bg->lock);
1940		spin_unlock(&space_info->lock);
1941
1942		/*
1943		 * Get out fast, in case we're read-only or unmounting the
1944		 * filesystem. It is OK to drop block groups from the list even
1945		 * for the read-only case. As we did take the super write lock,
1946		 * "mount -o remount,ro" won't happen and read-only filesystem
1947		 * means it is forced read-only due to a fatal error. So, it
1948		 * never gets back to read-write to let us reclaim again.
1949		 */
1950		if (btrfs_need_cleaner_sleep(fs_info)) {
1951			up_write(&space_info->groups_sem);
1952			goto next;
1953		}
1954
1955		ret = inc_block_group_ro(bg, 0);
1956		up_write(&space_info->groups_sem);
1957		if (ret < 0)
1958			goto next;
1959
1960		/*
1961		 * The amount of bytes reclaimed corresponds to the sum of the
1962		 * "used" and "reserved" counters. We have set the block group
1963		 * to RO above, which prevents reservations from happening but
1964		 * we may have existing reservations for which allocation has
1965		 * not yet been done - btrfs_update_block_group() was not yet
1966		 * called, which is where we will transfer a reserved extent's
1967		 * size from the "reserved" counter to the "used" counter - this
1968		 * happens when running delayed references. When we relocate the
1969		 * chunk below, relocation first flushes delalloc, waits for
1970		 * ordered extent completion (which is where we create delayed
1971		 * references for data extents) and commits the current
1972		 * transaction (which runs delayed references), and only after
1973		 * it does the actual work to move extents out of the block
1974		 * group. So the reported amount of reclaimed bytes is
1975		 * effectively the sum of the 'used' and 'reserved' counters.
1976		 */
1977		spin_lock(&bg->lock);
1978		used = bg->used;
1979		reserved = bg->reserved;
1980		spin_unlock(&bg->lock);
1981
1982		trace_btrfs_reclaim_block_group(bg);
1983		ret = btrfs_relocate_chunk(fs_info, bg->start, false);
1984		if (ret) {
1985			btrfs_dec_block_group_ro(bg);
1986			btrfs_err(fs_info, "error relocating chunk %llu",
1987				  bg->start);
1988			used = 0;
1989			reserved = 0;
1990			spin_lock(&space_info->lock);
1991			space_info->reclaim_errors++;
1992			if (READ_ONCE(space_info->periodic_reclaim))
1993				space_info->periodic_reclaim_ready = false;
1994			spin_unlock(&space_info->lock);
1995		}
1996		spin_lock(&space_info->lock);
1997		space_info->reclaim_count++;
1998		space_info->reclaim_bytes += used;
1999		space_info->reclaim_bytes += reserved;
2000		spin_unlock(&space_info->lock);
2001
2002next:
2003		if (ret && !READ_ONCE(space_info->periodic_reclaim))
2004			btrfs_link_bg_list(bg, &retry_list);
2005		btrfs_put_block_group(bg);
2006
2007		mutex_unlock(&fs_info->reclaim_bgs_lock);
2008		/*
2009		 * Reclaiming all the block groups in the list can take really
2010		 * long.  Prioritize cleaning up unused block groups.
2011		 */
2012		btrfs_delete_unused_bgs(fs_info);
2013		/*
2014		 * If we are interrupted by a balance, we can just bail out. The
2015		 * cleaner thread restart again if necessary.
2016		 */
2017		if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
2018			goto end;
2019		spin_lock(&fs_info->unused_bgs_lock);
2020	}
2021	spin_unlock(&fs_info->unused_bgs_lock);
2022	mutex_unlock(&fs_info->reclaim_bgs_lock);
2023end:
2024	spin_lock(&fs_info->unused_bgs_lock);
2025	list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
2026	spin_unlock(&fs_info->unused_bgs_lock);
2027	btrfs_exclop_finish(fs_info);
2028}
2029
2030void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
2031{
2032	btrfs_reclaim_sweep(fs_info);
2033	spin_lock(&fs_info->unused_bgs_lock);
2034	if (!list_empty(&fs_info->reclaim_bgs))
2035		queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
2036	spin_unlock(&fs_info->unused_bgs_lock);
2037}
2038
2039void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
2040{
2041	struct btrfs_fs_info *fs_info = bg->fs_info;
2042
2043	if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
2044		trace_btrfs_add_reclaim_block_group(bg);
2045}
2046
2047static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
2048			   const struct btrfs_path *path)
2049{
2050	struct btrfs_chunk_map *map;
2051	struct btrfs_block_group_item bg;
2052	struct extent_buffer *leaf;
2053	int slot;
2054	u64 flags;
2055	int ret = 0;
2056
2057	slot = path->slots[0];
2058	leaf = path->nodes[0];
2059
2060	map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
2061	if (!map) {
2062		btrfs_err(fs_info,
2063			  "logical %llu len %llu found bg but no related chunk",
2064			  key->objectid, key->offset);
2065		return -ENOENT;
2066	}
2067
2068	if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
2069		btrfs_err(fs_info,
2070			"block group %llu len %llu mismatch with chunk %llu len %llu",
2071			  key->objectid, key->offset, map->start, map->chunk_len);
2072		ret = -EUCLEAN;
2073		goto out_free_map;
2074	}
2075
2076	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
2077			   sizeof(bg));
2078	flags = btrfs_stack_block_group_flags(&bg) &
2079		BTRFS_BLOCK_GROUP_TYPE_MASK;
2080
2081	if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
2082		btrfs_err(fs_info,
2083"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
2084			  key->objectid, key->offset, flags,
2085			  (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
2086		ret = -EUCLEAN;
2087	}
2088
2089out_free_map:
2090	btrfs_free_chunk_map(map);
2091	return ret;
2092}
2093
2094static int find_first_block_group(struct btrfs_fs_info *fs_info,
2095				  struct btrfs_path *path,
2096				  const struct btrfs_key *key)
2097{
2098	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2099	int ret;
2100	struct btrfs_key found_key;
2101
2102	btrfs_for_each_slot(root, key, &found_key, path, ret) {
2103		if (found_key.objectid >= key->objectid &&
2104		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
2105			return read_bg_from_eb(fs_info, &found_key, path);
2106		}
2107	}
2108	return ret;
2109}
2110
2111static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2112{
2113	u64 extra_flags = chunk_to_extended(flags) &
2114				BTRFS_EXTENDED_PROFILE_MASK;
2115
2116	write_seqlock(&fs_info->profiles_lock);
2117	if (flags & BTRFS_BLOCK_GROUP_DATA)
2118		fs_info->avail_data_alloc_bits |= extra_flags;
2119	if (flags & BTRFS_BLOCK_GROUP_METADATA)
2120		fs_info->avail_metadata_alloc_bits |= extra_flags;
2121	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2122		fs_info->avail_system_alloc_bits |= extra_flags;
2123	write_sequnlock(&fs_info->profiles_lock);
2124}
2125
2126/*
2127 * Map a physical disk address to a list of logical addresses.
2128 *
2129 * @fs_info:       the filesystem
2130 * @chunk_start:   logical address of block group
2131 * @physical:	   physical address to map to logical addresses
2132 * @logical:	   return array of logical addresses which map to @physical
2133 * @naddrs:	   length of @logical
2134 * @stripe_len:    size of IO stripe for the given block group
2135 *
2136 * Maps a particular @physical disk address to a list of @logical addresses.
2137 * Used primarily to exclude those portions of a block group that contain super
2138 * block copies.
2139 */
2140int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
2141		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
2142{
2143	struct btrfs_chunk_map *map;
2144	u64 *buf;
2145	u64 bytenr;
2146	u64 data_stripe_length;
2147	u64 io_stripe_size;
2148	int i, nr = 0;
2149	int ret = 0;
2150
2151	map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
2152	if (IS_ERR(map))
2153		return -EIO;
2154
2155	data_stripe_length = map->stripe_size;
2156	io_stripe_size = BTRFS_STRIPE_LEN;
2157	chunk_start = map->start;
2158
2159	/* For RAID5/6 adjust to a full IO stripe length */
2160	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2161		io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
2162
2163	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
2164	if (!buf) {
2165		ret = -ENOMEM;
2166		goto out;
2167	}
2168
2169	for (i = 0; i < map->num_stripes; i++) {
2170		bool already_inserted = false;
2171		u32 stripe_nr;
2172		u32 offset;
2173		int j;
2174
2175		if (!in_range(physical, map->stripes[i].physical,
2176			      data_stripe_length))
2177			continue;
2178
2179		stripe_nr = (physical - map->stripes[i].physical) >>
2180			    BTRFS_STRIPE_LEN_SHIFT;
2181		offset = (physical - map->stripes[i].physical) &
2182			 BTRFS_STRIPE_LEN_MASK;
2183
2184		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2185				 BTRFS_BLOCK_GROUP_RAID10))
2186			stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
2187					    map->sub_stripes);
2188		/*
2189		 * The remaining case would be for RAID56, multiply by
2190		 * nr_data_stripes().  Alternatively, just use rmap_len below
2191		 * instead of map->stripe_len
2192		 */
2193		bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
2194
2195		/* Ensure we don't add duplicate addresses */
2196		for (j = 0; j < nr; j++) {
2197			if (buf[j] == bytenr) {
2198				already_inserted = true;
2199				break;
2200			}
2201		}
2202
2203		if (!already_inserted)
2204			buf[nr++] = bytenr;
2205	}
2206
2207	*logical = buf;
2208	*naddrs = nr;
2209	*stripe_len = io_stripe_size;
2210out:
2211	btrfs_free_chunk_map(map);
2212	return ret;
2213}
2214
2215static int exclude_super_stripes(struct btrfs_block_group *cache)
2216{
2217	struct btrfs_fs_info *fs_info = cache->fs_info;
2218	const bool zoned = btrfs_is_zoned(fs_info);
2219	u64 bytenr;
2220	u64 *logical;
2221	int stripe_len;
2222	int i, nr, ret;
2223
2224	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
2225		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
2226		cache->bytes_super += stripe_len;
2227		ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start,
2228					   cache->start + stripe_len - 1,
2229					   EXTENT_DIRTY, NULL);
2230		if (ret)
2231			return ret;
2232	}
2233
2234	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2235		bytenr = btrfs_sb_offset(i);
2236		ret = btrfs_rmap_block(fs_info, cache->start,
2237				       bytenr, &logical, &nr, &stripe_len);
2238		if (ret)
2239			return ret;
2240
2241		/* Shouldn't have super stripes in sequential zones */
2242		if (unlikely(zoned && nr)) {
2243			kfree(logical);
2244			btrfs_err(fs_info,
2245			"zoned: block group %llu must not contain super block",
2246				  cache->start);
2247			return -EUCLEAN;
2248		}
2249
2250		while (nr--) {
2251			u64 len = min_t(u64, stripe_len,
2252				cache->start + cache->length - logical[nr]);
2253
2254			cache->bytes_super += len;
2255			ret = btrfs_set_extent_bit(&fs_info->excluded_extents,
2256						   logical[nr], logical[nr] + len - 1,
2257						   EXTENT_DIRTY, NULL);
2258			if (ret) {
2259				kfree(logical);
2260				return ret;
2261			}
2262		}
2263
2264		kfree(logical);
2265	}
2266	return 0;
2267}
2268
2269static struct btrfs_block_group *btrfs_create_block_group_cache(
2270		struct btrfs_fs_info *fs_info, u64 start)
2271{
2272	struct btrfs_block_group *cache;
2273
2274	cache = kzalloc(sizeof(*cache), GFP_NOFS);
2275	if (!cache)
2276		return NULL;
2277
2278	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
2279					GFP_NOFS);
2280	if (!cache->free_space_ctl) {
2281		kfree(cache);
2282		return NULL;
2283	}
2284
2285	cache->start = start;
2286
2287	cache->fs_info = fs_info;
2288	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
2289
2290	cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
2291
2292	refcount_set(&cache->refs, 1);
2293	spin_lock_init(&cache->lock);
2294	init_rwsem(&cache->data_rwsem);
2295	INIT_LIST_HEAD(&cache->list);
2296	INIT_LIST_HEAD(&cache->cluster_list);
2297	INIT_LIST_HEAD(&cache->bg_list);
2298	INIT_LIST_HEAD(&cache->ro_list);
2299	INIT_LIST_HEAD(&cache->discard_list);
2300	INIT_LIST_HEAD(&cache->dirty_list);
2301	INIT_LIST_HEAD(&cache->io_list);
2302	INIT_LIST_HEAD(&cache->active_bg_list);
2303	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
2304	atomic_set(&cache->frozen, 0);
2305	mutex_init(&cache->free_space_lock);
2306
2307	return cache;
2308}
2309
2310/*
2311 * Iterate all chunks and verify that each of them has the corresponding block
2312 * group
2313 */
2314static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
2315{
2316	u64 start = 0;
2317	int ret = 0;
2318
2319	while (1) {
2320		struct btrfs_chunk_map *map;
2321		struct btrfs_block_group *bg;
2322
2323		/*
2324		 * btrfs_find_chunk_map() will return the first chunk map
2325		 * intersecting the range, so setting @length to 1 is enough to
2326		 * get the first chunk.
2327		 */
2328		map = btrfs_find_chunk_map(fs_info, start, 1);
2329		if (!map)
2330			break;
2331
2332		bg = btrfs_lookup_block_group(fs_info, map->start);
2333		if (unlikely(!bg)) {
2334			btrfs_err(fs_info,
2335	"chunk start=%llu len=%llu doesn't have corresponding block group",
2336				     map->start, map->chunk_len);
2337			ret = -EUCLEAN;
2338			btrfs_free_chunk_map(map);
2339			break;
2340		}
2341		if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
2342			     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
2343			     (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
2344			btrfs_err(fs_info,
2345"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
2346				map->start, map->chunk_len,
2347				map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
2348				bg->start, bg->length,
2349				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
2350			ret = -EUCLEAN;
2351			btrfs_free_chunk_map(map);
2352			btrfs_put_block_group(bg);
2353			break;
2354		}
2355		start = map->start + map->chunk_len;
2356		btrfs_free_chunk_map(map);
2357		btrfs_put_block_group(bg);
2358	}
2359	return ret;
2360}
2361
2362static int read_one_block_group(struct btrfs_fs_info *info,
2363				struct btrfs_block_group_item *bgi,
2364				const struct btrfs_key *key,
2365				int need_clear)
2366{
2367	struct btrfs_block_group *cache;
2368	const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
2369	int ret;
2370
2371	ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
2372
2373	cache = btrfs_create_block_group_cache(info, key->objectid);
2374	if (!cache)
2375		return -ENOMEM;
2376
2377	cache->length = key->offset;
2378	cache->used = btrfs_stack_block_group_used(bgi);
2379	cache->commit_used = cache->used;
2380	cache->flags = btrfs_stack_block_group_flags(bgi);
2381	cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
2382	cache->space_info = btrfs_find_space_info(info, cache->flags);
2383
2384	btrfs_set_free_space_tree_thresholds(cache);
2385
2386	if (need_clear) {
2387		/*
2388		 * When we mount with old space cache, we need to
2389		 * set BTRFS_DC_CLEAR and set dirty flag.
2390		 *
2391		 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2392		 *    truncate the old free space cache inode and
2393		 *    setup a new one.
2394		 * b) Setting 'dirty flag' makes sure that we flush
2395		 *    the new space cache info onto disk.
2396		 */
2397		if (btrfs_test_opt(info, SPACE_CACHE))
2398			cache->disk_cache_state = BTRFS_DC_CLEAR;
2399	}
2400	if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2401	    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2402			btrfs_err(info,
2403"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2404				  cache->start);
2405			ret = -EINVAL;
2406			goto error;
2407	}
2408
2409	ret = btrfs_load_block_group_zone_info(cache, false);
2410	if (ret) {
2411		btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2412			  cache->start);
2413		goto error;
2414	}
2415
2416	/*
2417	 * We need to exclude the super stripes now so that the space info has
2418	 * super bytes accounted for, otherwise we'll think we have more space
2419	 * than we actually do.
2420	 */
2421	ret = exclude_super_stripes(cache);
2422	if (ret) {
2423		/* We may have excluded something, so call this just in case. */
2424		btrfs_free_excluded_extents(cache);
2425		goto error;
2426	}
2427
2428	/*
2429	 * For zoned filesystem, space after the allocation offset is the only
2430	 * free space for a block group. So, we don't need any caching work.
2431	 * btrfs_calc_zone_unusable() will set the amount of free space and
2432	 * zone_unusable space.
2433	 *
2434	 * For regular filesystem, check for two cases, either we are full, and
2435	 * therefore don't need to bother with the caching work since we won't
2436	 * find any space, or we are empty, and we can just add all the space
2437	 * in and be done with it.  This saves us _a_lot_ of time, particularly
2438	 * in the full case.
2439	 */
2440	if (btrfs_is_zoned(info)) {
2441		btrfs_calc_zone_unusable(cache);
2442		/* Should not have any excluded extents. Just in case, though. */
2443		btrfs_free_excluded_extents(cache);
2444	} else if (cache->length == cache->used) {
2445		cache->cached = BTRFS_CACHE_FINISHED;
2446		btrfs_free_excluded_extents(cache);
2447	} else if (cache->used == 0) {
2448		cache->cached = BTRFS_CACHE_FINISHED;
2449		ret = btrfs_add_new_free_space(cache, cache->start,
2450					       cache->start + cache->length, NULL);
2451		btrfs_free_excluded_extents(cache);
2452		if (ret)
2453			goto error;
2454	}
2455
2456	ret = btrfs_add_block_group_cache(cache);
2457	if (ret) {
2458		btrfs_remove_free_space_cache(cache);
2459		goto error;
2460	}
2461
2462	trace_btrfs_add_block_group(info, cache, 0);
2463	btrfs_add_bg_to_space_info(info, cache);
2464
2465	set_avail_alloc_bits(info, cache->flags);
2466	if (btrfs_chunk_writeable(info, cache->start)) {
2467		if (cache->used == 0) {
2468			ASSERT(list_empty(&cache->bg_list));
2469			if (btrfs_test_opt(info, DISCARD_ASYNC))
2470				btrfs_discard_queue_work(&info->discard_ctl, cache);
2471			else
2472				btrfs_mark_bg_unused(cache);
2473		}
2474	} else {
2475		inc_block_group_ro(cache, 1);
2476	}
2477
2478	return 0;
2479error:
2480	btrfs_put_block_group(cache);
2481	return ret;
2482}
2483
2484static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2485{
2486	struct rb_node *node;
2487	int ret = 0;
2488
2489	for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
2490		struct btrfs_chunk_map *map;
2491		struct btrfs_block_group *bg;
2492
2493		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
2494		bg = btrfs_create_block_group_cache(fs_info, map->start);
2495		if (!bg) {
2496			ret = -ENOMEM;
2497			break;
2498		}
2499
2500		/* Fill dummy cache as FULL */
2501		bg->length = map->chunk_len;
2502		bg->flags = map->type;
2503		bg->cached = BTRFS_CACHE_FINISHED;
2504		bg->used = map->chunk_len;
2505		bg->flags = map->type;
2506		bg->space_info = btrfs_find_space_info(fs_info, bg->flags);
2507		ret = btrfs_add_block_group_cache(bg);
2508		/*
2509		 * We may have some valid block group cache added already, in
2510		 * that case we skip to the next one.
2511		 */
2512		if (ret == -EEXIST) {
2513			ret = 0;
2514			btrfs_put_block_group(bg);
2515			continue;
2516		}
2517
2518		if (ret) {
2519			btrfs_remove_free_space_cache(bg);
2520			btrfs_put_block_group(bg);
2521			break;
2522		}
2523
2524		btrfs_add_bg_to_space_info(fs_info, bg);
2525
2526		set_avail_alloc_bits(fs_info, bg->flags);
2527	}
2528	if (!ret)
2529		btrfs_init_global_block_rsv(fs_info);
2530	return ret;
2531}
2532
2533int btrfs_read_block_groups(struct btrfs_fs_info *info)
2534{
2535	struct btrfs_root *root = btrfs_block_group_root(info);
2536	struct btrfs_path *path;
2537	int ret;
2538	struct btrfs_block_group *cache;
2539	struct btrfs_space_info *space_info;
2540	struct btrfs_key key;
2541	int need_clear = 0;
2542	u64 cache_gen;
2543
2544	/*
2545	 * Either no extent root (with ibadroots rescue option) or we have
2546	 * unsupported RO options. The fs can never be mounted read-write, so no
2547	 * need to waste time searching block group items.
2548	 *
2549	 * This also allows new extent tree related changes to be RO compat,
2550	 * no need for a full incompat flag.
2551	 */
2552	if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
2553		      ~BTRFS_FEATURE_COMPAT_RO_SUPP))
2554		return fill_dummy_bgs(info);
2555
2556	key.objectid = 0;
2557	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2558	key.offset = 0;
2559	path = btrfs_alloc_path();
2560	if (!path)
2561		return -ENOMEM;
2562
2563	cache_gen = btrfs_super_cache_generation(info->super_copy);
2564	if (btrfs_test_opt(info, SPACE_CACHE) &&
2565	    btrfs_super_generation(info->super_copy) != cache_gen)
2566		need_clear = 1;
2567	if (btrfs_test_opt(info, CLEAR_CACHE))
2568		need_clear = 1;
2569
2570	while (1) {
2571		struct btrfs_block_group_item bgi;
2572		struct extent_buffer *leaf;
2573		int slot;
2574
2575		ret = find_first_block_group(info, path, &key);
2576		if (ret > 0)
2577			break;
2578		if (ret != 0)
2579			goto error;
2580
2581		leaf = path->nodes[0];
2582		slot = path->slots[0];
2583
2584		read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2585				   sizeof(bgi));
2586
2587		btrfs_item_key_to_cpu(leaf, &key, slot);
2588		btrfs_release_path(path);
2589		ret = read_one_block_group(info, &bgi, &key, need_clear);
2590		if (ret < 0)
2591			goto error;
2592		key.objectid += key.offset;
2593		key.offset = 0;
2594	}
2595	btrfs_release_path(path);
2596
2597	list_for_each_entry(space_info, &info->space_info, list) {
2598		int i;
2599
2600		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2601			if (list_empty(&space_info->block_groups[i]))
2602				continue;
2603			cache = list_first_entry(&space_info->block_groups[i],
2604						 struct btrfs_block_group,
2605						 list);
2606			btrfs_sysfs_add_block_group_type(cache);
2607		}
2608
2609		if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2610		      (BTRFS_BLOCK_GROUP_RAID10 |
2611		       BTRFS_BLOCK_GROUP_RAID1_MASK |
2612		       BTRFS_BLOCK_GROUP_RAID56_MASK |
2613		       BTRFS_BLOCK_GROUP_DUP)))
2614			continue;
2615		/*
2616		 * Avoid allocating from un-mirrored block group if there are
2617		 * mirrored block groups.
2618		 */
2619		list_for_each_entry(cache,
2620				&space_info->block_groups[BTRFS_RAID_RAID0],
2621				list)
2622			inc_block_group_ro(cache, 1);
2623		list_for_each_entry(cache,
2624				&space_info->block_groups[BTRFS_RAID_SINGLE],
2625				list)
2626			inc_block_group_ro(cache, 1);
2627	}
2628
2629	btrfs_init_global_block_rsv(info);
2630	ret = check_chunk_block_group_mappings(info);
2631error:
2632	btrfs_free_path(path);
2633	/*
2634	 * We've hit some error while reading the extent tree, and have
2635	 * rescue=ibadroots mount option.
2636	 * Try to fill the tree using dummy block groups so that the user can
2637	 * continue to mount and grab their data.
2638	 */
2639	if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2640		ret = fill_dummy_bgs(info);
2641	return ret;
2642}
2643
2644/*
2645 * This function, insert_block_group_item(), belongs to the phase 2 of chunk
2646 * allocation.
2647 *
2648 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2649 * phases.
2650 */
2651static int insert_block_group_item(struct btrfs_trans_handle *trans,
2652				   struct btrfs_block_group *block_group)
2653{
2654	struct btrfs_fs_info *fs_info = trans->fs_info;
2655	struct btrfs_block_group_item bgi;
2656	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2657	struct btrfs_key key;
2658	u64 old_commit_used;
2659	int ret;
2660
2661	spin_lock(&block_group->lock);
2662	btrfs_set_stack_block_group_used(&bgi, block_group->used);
2663	btrfs_set_stack_block_group_chunk_objectid(&bgi,
2664						   block_group->global_root_id);
2665	btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2666	old_commit_used = block_group->commit_used;
2667	block_group->commit_used = block_group->used;
2668	key.objectid = block_group->start;
2669	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2670	key.offset = block_group->length;
2671	spin_unlock(&block_group->lock);
2672
2673	ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2674	if (ret < 0) {
2675		spin_lock(&block_group->lock);
2676		block_group->commit_used = old_commit_used;
2677		spin_unlock(&block_group->lock);
2678	}
2679
2680	return ret;
2681}
2682
2683static int insert_dev_extent(struct btrfs_trans_handle *trans,
2684			     const struct btrfs_device *device, u64 chunk_offset,
2685			     u64 start, u64 num_bytes)
2686{
2687	struct btrfs_fs_info *fs_info = device->fs_info;
2688	struct btrfs_root *root = fs_info->dev_root;
2689	BTRFS_PATH_AUTO_FREE(path);
2690	struct btrfs_dev_extent *extent;
2691	struct extent_buffer *leaf;
2692	struct btrfs_key key;
2693	int ret;
2694
2695	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2696	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2697	path = btrfs_alloc_path();
2698	if (!path)
2699		return -ENOMEM;
2700
2701	key.objectid = device->devid;
2702	key.type = BTRFS_DEV_EXTENT_KEY;
2703	key.offset = start;
2704	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2705	if (ret)
2706		return ret;
2707
2708	leaf = path->nodes[0];
2709	extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2710	btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2711	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2712					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2713	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2714	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
2715
2716	return ret;
2717}
2718
2719/*
2720 * This function belongs to phase 2.
2721 *
2722 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2723 * phases.
2724 */
2725static int insert_dev_extents(struct btrfs_trans_handle *trans,
2726				   u64 chunk_offset, u64 chunk_size)
2727{
2728	struct btrfs_fs_info *fs_info = trans->fs_info;
2729	struct btrfs_device *device;
2730	struct btrfs_chunk_map *map;
2731	u64 dev_offset;
2732	int i;
2733	int ret = 0;
2734
2735	map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2736	if (IS_ERR(map))
2737		return PTR_ERR(map);
2738
2739	/*
2740	 * Take the device list mutex to prevent races with the final phase of
2741	 * a device replace operation that replaces the device object associated
2742	 * with the map's stripes, because the device object's id can change
2743	 * at any time during that final phase of the device replace operation
2744	 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2745	 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2746	 * resulting in persisting a device extent item with such ID.
2747	 */
2748	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2749	for (i = 0; i < map->num_stripes; i++) {
2750		device = map->stripes[i].dev;
2751		dev_offset = map->stripes[i].physical;
2752
2753		ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
2754					map->stripe_size);
2755		if (ret)
2756			break;
2757	}
2758	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2759
2760	btrfs_free_chunk_map(map);
2761	return ret;
2762}
2763
2764/*
2765 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2766 * chunk allocation.
2767 *
2768 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2769 * phases.
2770 */
2771void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2772{
2773	struct btrfs_fs_info *fs_info = trans->fs_info;
2774	struct btrfs_block_group *block_group;
2775	int ret = 0;
2776
2777	while (!list_empty(&trans->new_bgs)) {
2778		int index;
2779
2780		block_group = list_first_entry(&trans->new_bgs,
2781					       struct btrfs_block_group,
2782					       bg_list);
2783		if (ret)
2784			goto next;
2785
2786		index = btrfs_bg_flags_to_raid_index(block_group->flags);
2787
2788		ret = insert_block_group_item(trans, block_group);
2789		if (ret)
2790			btrfs_abort_transaction(trans, ret);
2791		if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
2792			      &block_group->runtime_flags)) {
2793			mutex_lock(&fs_info->chunk_mutex);
2794			ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2795			mutex_unlock(&fs_info->chunk_mutex);
2796			if (ret)
2797				btrfs_abort_transaction(trans, ret);
2798		}
2799		ret = insert_dev_extents(trans, block_group->start,
2800					 block_group->length);
2801		if (ret)
2802			btrfs_abort_transaction(trans, ret);
2803		btrfs_add_block_group_free_space(trans, block_group);
2804
2805		/*
2806		 * If we restriped during balance, we may have added a new raid
2807		 * type, so now add the sysfs entries when it is safe to do so.
2808		 * We don't have to worry about locking here as it's handled in
2809		 * btrfs_sysfs_add_block_group_type.
2810		 */
2811		if (block_group->space_info->block_group_kobjs[index] == NULL)
2812			btrfs_sysfs_add_block_group_type(block_group);
2813
2814		/* Already aborted the transaction if it failed. */
2815next:
2816		btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
2817
2818		spin_lock(&fs_info->unused_bgs_lock);
2819		list_del_init(&block_group->bg_list);
2820		clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
2821		btrfs_put_block_group(block_group);
2822		spin_unlock(&fs_info->unused_bgs_lock);
2823
2824		/*
2825		 * If the block group is still unused, add it to the list of
2826		 * unused block groups. The block group may have been created in
2827		 * order to satisfy a space reservation, in which case the
2828		 * extent allocation only happens later. But often we don't
2829		 * actually need to allocate space that we previously reserved,
2830		 * so the block group may become unused for a long time. For
2831		 * example for metadata we generally reserve space for a worst
2832		 * possible scenario, but then don't end up allocating all that
2833		 * space or none at all (due to no need to COW, extent buffers
2834		 * were already COWed in the current transaction and still
2835		 * unwritten, tree heights lower than the maximum possible
2836		 * height, etc). For data we generally reserve the exact amount
2837		 * of space we are going to allocate later, the exception is
2838		 * when using compression, as we must reserve space based on the
2839		 * uncompressed data size, because the compression is only done
2840		 * when writeback triggered and we don't know how much space we
2841		 * are actually going to need, so we reserve the uncompressed
2842		 * size because the data may be incompressible in the worst case.
2843		 */
2844		if (ret == 0) {
2845			bool used;
2846
2847			spin_lock(&block_group->lock);
2848			used = btrfs_is_block_group_used(block_group);
2849			spin_unlock(&block_group->lock);
2850
2851			if (!used)
2852				btrfs_mark_bg_unused(block_group);
2853		}
2854	}
2855	btrfs_trans_release_chunk_metadata(trans);
2856}
2857
2858/*
2859 * For extent tree v2 we use the block_group_item->chunk_offset to point at our
2860 * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2861 */
2862static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset)
2863{
2864	u64 div = SZ_1G;
2865	u64 index;
2866
2867	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2868		return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2869
2870	/* If we have a smaller fs index based on 128MiB. */
2871	if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
2872		div = SZ_128M;
2873
2874	offset = div64_u64(offset, div);
2875	div64_u64_rem(offset, fs_info->nr_global_roots, &index);
2876	return index;
2877}
2878
2879struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
2880						 struct btrfs_space_info *space_info,
2881						 u64 type, u64 chunk_offset, u64 size)
2882{
2883	struct btrfs_fs_info *fs_info = trans->fs_info;
2884	struct btrfs_block_group *cache;
2885	int ret;
2886
2887	btrfs_set_log_full_commit(trans);
2888
2889	cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2890	if (!cache)
2891		return ERR_PTR(-ENOMEM);
2892
2893	/*
2894	 * Mark it as new before adding it to the rbtree of block groups or any
2895	 * list, so that no other task finds it and calls btrfs_mark_bg_unused()
2896	 * before the new flag is set.
2897	 */
2898	set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
2899
2900	cache->length = size;
2901	btrfs_set_free_space_tree_thresholds(cache);
2902	cache->flags = type;
2903	cache->cached = BTRFS_CACHE_FINISHED;
2904	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
2905
2906	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2907		set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
2908
2909	ret = btrfs_load_block_group_zone_info(cache, true);
2910	if (ret) {
2911		btrfs_put_block_group(cache);
2912		return ERR_PTR(ret);
2913	}
2914
2915	ret = exclude_super_stripes(cache);
2916	if (ret) {
2917		/* We may have excluded something, so call this just in case */
2918		btrfs_free_excluded_extents(cache);
2919		btrfs_put_block_group(cache);
2920		return ERR_PTR(ret);
2921	}
2922
2923	ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
2924	btrfs_free_excluded_extents(cache);
2925	if (ret) {
2926		btrfs_put_block_group(cache);
2927		return ERR_PTR(ret);
2928	}
2929
2930	/*
2931	 * Ensure the corresponding space_info object is created and
2932	 * assigned to our block group. We want our bg to be added to the rbtree
2933	 * with its ->space_info set.
2934	 */
2935	cache->space_info = space_info;
2936	ASSERT(cache->space_info);
2937
2938	ret = btrfs_add_block_group_cache(cache);
2939	if (ret) {
2940		btrfs_remove_free_space_cache(cache);
2941		btrfs_put_block_group(cache);
2942		return ERR_PTR(ret);
2943	}
2944
2945	/*
2946	 * Now that our block group has its ->space_info set and is inserted in
2947	 * the rbtree, update the space info's counters.
2948	 */
2949	trace_btrfs_add_block_group(fs_info, cache, 1);
2950	btrfs_add_bg_to_space_info(fs_info, cache);
2951	btrfs_update_global_block_rsv(fs_info);
2952
2953#ifdef CONFIG_BTRFS_DEBUG
2954	if (btrfs_should_fragment_free_space(cache)) {
2955		cache->space_info->bytes_used += size >> 1;
2956		fragment_free_space(cache);
2957	}
2958#endif
2959
2960	btrfs_link_bg_list(cache, &trans->new_bgs);
2961	btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
2962
2963	set_avail_alloc_bits(fs_info, type);
2964	return cache;
2965}
2966
2967/*
2968 * Mark one block group RO, can be called several times for the same block
2969 * group.
2970 *
2971 * @cache:		the destination block group
2972 * @do_chunk_alloc:	whether need to do chunk pre-allocation, this is to
2973 * 			ensure we still have some free space after marking this
2974 * 			block group RO.
2975 */
2976int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2977			     bool do_chunk_alloc)
2978{
2979	struct btrfs_fs_info *fs_info = cache->fs_info;
2980	struct btrfs_space_info *space_info = cache->space_info;
2981	struct btrfs_trans_handle *trans;
2982	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2983	u64 alloc_flags;
2984	int ret;
2985	bool dirty_bg_running;
2986
2987	/*
2988	 * This can only happen when we are doing read-only scrub on read-only
2989	 * mount.
2990	 * In that case we should not start a new transaction on read-only fs.
2991	 * Thus here we skip all chunk allocations.
2992	 */
2993	if (sb_rdonly(fs_info->sb)) {
2994		mutex_lock(&fs_info->ro_block_group_mutex);
2995		ret = inc_block_group_ro(cache, 0);
2996		mutex_unlock(&fs_info->ro_block_group_mutex);
2997		return ret;
2998	}
2999
3000	do {
3001		trans = btrfs_join_transaction(root);
3002		if (IS_ERR(trans))
3003			return PTR_ERR(trans);
3004
3005		dirty_bg_running = false;
3006
3007		/*
3008		 * We're not allowed to set block groups readonly after the dirty
3009		 * block group cache has started writing.  If it already started,
3010		 * back off and let this transaction commit.
3011		 */
3012		mutex_lock(&fs_info->ro_block_group_mutex);
3013		if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
3014			u64 transid = trans->transid;
3015
3016			mutex_unlock(&fs_info->ro_block_group_mutex);
3017			btrfs_end_transaction(trans);
3018
3019			ret = btrfs_wait_for_commit(fs_info, transid);
3020			if (ret)
3021				return ret;
3022			dirty_bg_running = true;
3023		}
3024	} while (dirty_bg_running);
3025
3026	if (do_chunk_alloc) {
3027		/*
3028		 * If we are changing raid levels, try to allocate a
3029		 * corresponding block group with the new raid level.
3030		 */
3031		alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
3032		if (alloc_flags != cache->flags) {
3033			ret = btrfs_chunk_alloc(trans, space_info, alloc_flags,
3034						CHUNK_ALLOC_FORCE);
3035			/*
3036			 * ENOSPC is allowed here, we may have enough space
3037			 * already allocated at the new raid level to carry on
3038			 */
3039			if (ret == -ENOSPC)
3040				ret = 0;
3041			if (ret < 0)
3042				goto out;
3043		}
3044	}
3045
3046	ret = inc_block_group_ro(cache, 0);
3047	if (!ret)
3048		goto out;
3049	if (ret == -ETXTBSY)
3050		goto unlock_out;
3051
3052	/*
3053	 * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
3054	 * chunk allocation storm to exhaust the system chunk array.  Otherwise
3055	 * we still want to try our best to mark the block group read-only.
3056	 */
3057	if (!do_chunk_alloc && ret == -ENOSPC &&
3058	    (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
3059		goto unlock_out;
3060
3061	alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
3062	ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
3063	if (ret < 0)
3064		goto out;
3065	/*
3066	 * We have allocated a new chunk. We also need to activate that chunk to
3067	 * grant metadata tickets for zoned filesystem.
3068	 */
3069	ret = btrfs_zoned_activate_one_bg(space_info, true);
3070	if (ret < 0)
3071		goto out;
3072
3073	ret = inc_block_group_ro(cache, 0);
3074	if (ret == -ETXTBSY)
3075		goto unlock_out;
3076out:
3077	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
3078		alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
3079		mutex_lock(&fs_info->chunk_mutex);
3080		check_system_chunk(trans, alloc_flags);
3081		mutex_unlock(&fs_info->chunk_mutex);
3082	}
3083unlock_out:
3084	mutex_unlock(&fs_info->ro_block_group_mutex);
3085
3086	btrfs_end_transaction(trans);
3087	return ret;
3088}
3089
3090void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
3091{
3092	struct btrfs_space_info *sinfo = cache->space_info;
3093	u64 num_bytes;
3094
3095	BUG_ON(!cache->ro);
3096
3097	spin_lock(&sinfo->lock);
3098	spin_lock(&cache->lock);
3099	if (!--cache->ro) {
3100		if (btrfs_is_zoned(cache->fs_info)) {
3101			/* Migrate zone_unusable bytes back */
3102			cache->zone_unusable =
3103				(cache->alloc_offset - cache->used - cache->pinned -
3104				 cache->reserved) +
3105				(cache->length - cache->zone_capacity);
3106			btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);
3107			sinfo->bytes_readonly -= cache->zone_unusable;
3108		}
3109		num_bytes = cache->length - cache->reserved -
3110			    cache->pinned - cache->bytes_super -
3111			    cache->zone_unusable - cache->used;
3112		sinfo->bytes_readonly -= num_bytes;
3113		list_del_init(&cache->ro_list);
3114	}
3115	spin_unlock(&cache->lock);
3116	spin_unlock(&sinfo->lock);
3117}
3118
3119static int update_block_group_item(struct btrfs_trans_handle *trans,
3120				   struct btrfs_path *path,
3121				   struct btrfs_block_group *cache)
3122{
3123	struct btrfs_fs_info *fs_info = trans->fs_info;
3124	int ret;
3125	struct btrfs_root *root = btrfs_block_group_root(fs_info);
3126	unsigned long bi;
3127	struct extent_buffer *leaf;
3128	struct btrfs_block_group_item bgi;
3129	struct btrfs_key key;
3130	u64 old_commit_used;
3131	u64 used;
3132
3133	/*
3134	 * Block group items update can be triggered out of commit transaction
3135	 * critical section, thus we need a consistent view of used bytes.
3136	 * We cannot use cache->used directly outside of the spin lock, as it
3137	 * may be changed.
3138	 */
3139	spin_lock(&cache->lock);
3140	old_commit_used = cache->commit_used;
3141	used = cache->used;
3142	/* No change in used bytes, can safely skip it. */
3143	if (cache->commit_used == used) {
3144		spin_unlock(&cache->lock);
3145		return 0;
3146	}
3147	cache->commit_used = used;
3148	spin_unlock(&cache->lock);
3149
3150	key.objectid = cache->start;
3151	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
3152	key.offset = cache->length;
3153
3154	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3155	if (ret) {
3156		if (ret > 0)
3157			ret = -ENOENT;
3158		goto fail;
3159	}
3160
3161	leaf = path->nodes[0];
3162	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3163	btrfs_set_stack_block_group_used(&bgi, used);
3164	btrfs_set_stack_block_group_chunk_objectid(&bgi,
3165						   cache->global_root_id);
3166	btrfs_set_stack_block_group_flags(&bgi, cache->flags);
3167	write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
3168fail:
3169	btrfs_release_path(path);
3170	/*
3171	 * We didn't update the block group item, need to revert commit_used
3172	 * unless the block group item didn't exist yet - this is to prevent a
3173	 * race with a concurrent insertion of the block group item, with
3174	 * insert_block_group_item(), that happened just after we attempted to
3175	 * update. In that case we would reset commit_used to 0 just after the
3176	 * insertion set it to a value greater than 0 - if the block group later
3177	 * becomes with 0 used bytes, we would incorrectly skip its update.
3178	 */
3179	if (ret < 0 && ret != -ENOENT) {
3180		spin_lock(&cache->lock);
3181		cache->commit_used = old_commit_used;
3182		spin_unlock(&cache->lock);
3183	}
3184	return ret;
3185
3186}
3187
3188static int cache_save_setup(struct btrfs_block_group *block_group,
3189			    struct btrfs_trans_handle *trans,
3190			    struct btrfs_path *path)
3191{
3192	struct btrfs_fs_info *fs_info = block_group->fs_info;
3193	struct inode *inode = NULL;
3194	struct extent_changeset *data_reserved = NULL;
3195	u64 alloc_hint = 0;
3196	int dcs = BTRFS_DC_ERROR;
3197	u64 cache_size = 0;
3198	int retries = 0;
3199	int ret = 0;
3200
3201	if (!btrfs_test_opt(fs_info, SPACE_CACHE))
3202		return 0;
3203
3204	/*
3205	 * If this block group is smaller than 100 megs don't bother caching the
3206	 * block group.
3207	 */
3208	if (block_group->length < (100 * SZ_1M)) {
3209		spin_lock(&block_group->lock);
3210		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3211		spin_unlock(&block_group->lock);
3212		return 0;
3213	}
3214
3215	if (TRANS_ABORTED(trans))
3216		return 0;
3217again:
3218	inode = lookup_free_space_inode(block_group, path);
3219	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3220		ret = PTR_ERR(inode);
3221		btrfs_release_path(path);
3222		goto out;
3223	}
3224
3225	if (IS_ERR(inode)) {
3226		BUG_ON(retries);
3227		retries++;
3228
3229		if (block_group->ro)
3230			goto out_free;
3231
3232		ret = create_free_space_inode(trans, block_group, path);
3233		if (ret)
3234			goto out_free;
3235		goto again;
3236	}
3237
3238	/*
3239	 * We want to set the generation to 0, that way if anything goes wrong
3240	 * from here on out we know not to trust this cache when we load up next
3241	 * time.
3242	 */
3243	BTRFS_I(inode)->generation = 0;
3244	ret = btrfs_update_inode(trans, BTRFS_I(inode));
3245	if (unlikely(ret)) {
3246		/*
3247		 * So theoretically we could recover from this, simply set the
3248		 * super cache generation to 0 so we know to invalidate the
3249		 * cache, but then we'd have to keep track of the block groups
3250		 * that fail this way so we know we _have_ to reset this cache
3251		 * before the next commit or risk reading stale cache.  So to
3252		 * limit our exposure to horrible edge cases lets just abort the
3253		 * transaction, this only happens in really bad situations
3254		 * anyway.
3255		 */
3256		btrfs_abort_transaction(trans, ret);
3257		goto out_put;
3258	}
3259	WARN_ON(ret);
3260
3261	/* We've already setup this transaction, go ahead and exit */
3262	if (block_group->cache_generation == trans->transid &&
3263	    i_size_read(inode)) {
3264		dcs = BTRFS_DC_SETUP;
3265		goto out_put;
3266	}
3267
3268	if (i_size_read(inode) > 0) {
3269		ret = btrfs_check_trunc_cache_free_space(fs_info,
3270					&fs_info->global_block_rsv);
3271		if (ret)
3272			goto out_put;
3273
3274		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3275		if (ret)
3276			goto out_put;
3277	}
3278
3279	spin_lock(&block_group->lock);
3280	if (block_group->cached != BTRFS_CACHE_FINISHED ||
3281	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3282		/*
3283		 * don't bother trying to write stuff out _if_
3284		 * a) we're not cached,
3285		 * b) we're with nospace_cache mount option,
3286		 * c) we're with v2 space_cache (FREE_SPACE_TREE).
3287		 */
3288		dcs = BTRFS_DC_WRITTEN;
3289		spin_unlock(&block_group->lock);
3290		goto out_put;
3291	}
3292	spin_unlock(&block_group->lock);
3293
3294	/*
3295	 * We hit an ENOSPC when setting up the cache in this transaction, just
3296	 * skip doing the setup, we've already cleared the cache so we're safe.
3297	 */
3298	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3299		ret = -ENOSPC;
3300		goto out_put;
3301	}
3302
3303	/*
3304	 * Try to preallocate enough space based on how big the block group is.
3305	 * Keep in mind this has to include any pinned space which could end up
3306	 * taking up quite a bit since it's not folded into the other space
3307	 * cache.
3308	 */
3309	cache_size = div_u64(block_group->length, SZ_256M);
3310	if (!cache_size)
3311		cache_size = 1;
3312
3313	cache_size *= 16;
3314	cache_size *= fs_info->sectorsize;
3315
3316	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
3317					  cache_size, false);
3318	if (ret)
3319		goto out_put;
3320
3321	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
3322					      cache_size, cache_size,
3323					      &alloc_hint);
3324	/*
3325	 * Our cache requires contiguous chunks so that we don't modify a bunch
3326	 * of metadata or split extents when writing the cache out, which means
3327	 * we can enospc if we are heavily fragmented in addition to just normal
3328	 * out of space conditions.  So if we hit this just skip setting up any
3329	 * other block groups for this transaction, maybe we'll unpin enough
3330	 * space the next time around.
3331	 */
3332	if (!ret)
3333		dcs = BTRFS_DC_SETUP;
3334	else if (ret == -ENOSPC)
3335		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3336
3337out_put:
3338	iput(inode);
3339out_free:
3340	btrfs_release_path(path);
3341out:
3342	spin_lock(&block_group->lock);
3343	if (!ret && dcs == BTRFS_DC_SETUP)
3344		block_group->cache_generation = trans->transid;
3345	block_group->disk_cache_state = dcs;
3346	spin_unlock(&block_group->lock);
3347
3348	extent_changeset_free(data_reserved);
3349	return ret;
3350}
3351
3352int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3353{
3354	struct btrfs_fs_info *fs_info = trans->fs_info;
3355	struct btrfs_block_group *cache, *tmp;
3356	struct btrfs_transaction *cur_trans = trans->transaction;
3357	BTRFS_PATH_AUTO_FREE(path);
3358
3359	if (list_empty(&cur_trans->dirty_bgs) ||
3360	    !btrfs_test_opt(fs_info, SPACE_CACHE))
3361		return 0;
3362
3363	path = btrfs_alloc_path();
3364	if (!path)
3365		return -ENOMEM;
3366
3367	/* Could add new block groups, use _safe just in case */
3368	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3369				 dirty_list) {
3370		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3371			cache_save_setup(cache, trans, path);
3372	}
3373
3374	return 0;
3375}
3376
3377/*
3378 * Transaction commit does final block group cache writeback during a critical
3379 * section where nothing is allowed to change the FS.  This is required in
3380 * order for the cache to actually match the block group, but can introduce a
3381 * lot of latency into the commit.
3382 *
3383 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
3384 * There's a chance we'll have to redo some of it if the block group changes
3385 * again during the commit, but it greatly reduces the commit latency by
3386 * getting rid of the easy block groups while we're still allowing others to
3387 * join the commit.
3388 */
3389int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3390{
3391	struct btrfs_fs_info *fs_info = trans->fs_info;
3392	struct btrfs_block_group *cache;
3393	struct btrfs_transaction *cur_trans = trans->transaction;
3394	int ret = 0;
3395	int should_put;
3396	BTRFS_PATH_AUTO_FREE(path);
3397	LIST_HEAD(dirty);
3398	struct list_head *io = &cur_trans->io_bgs;
3399	int loops = 0;
3400
3401	spin_lock(&cur_trans->dirty_bgs_lock);
3402	if (list_empty(&cur_trans->dirty_bgs)) {
3403		spin_unlock(&cur_trans->dirty_bgs_lock);
3404		return 0;
3405	}
3406	list_splice_init(&cur_trans->dirty_bgs, &dirty);
3407	spin_unlock(&cur_trans->dirty_bgs_lock);
3408
3409again:
3410	/* Make sure all the block groups on our dirty list actually exist */
3411	btrfs_create_pending_block_groups(trans);
3412
3413	if (!path) {
3414		path = btrfs_alloc_path();
3415		if (!path) {
3416			ret = -ENOMEM;
3417			goto out;
3418		}
3419	}
3420
3421	/*
3422	 * cache_write_mutex is here only to save us from balance or automatic
3423	 * removal of empty block groups deleting this block group while we are
3424	 * writing out the cache
3425	 */
3426	mutex_lock(&trans->transaction->cache_write_mutex);
3427	while (!list_empty(&dirty)) {
3428		bool drop_reserve = true;
3429
3430		cache = list_first_entry(&dirty, struct btrfs_block_group,
3431					 dirty_list);
3432		/*
3433		 * This can happen if something re-dirties a block group that
3434		 * is already under IO.  Just wait for it to finish and then do
3435		 * it all again
3436		 */
3437		if (!list_empty(&cache->io_list)) {
3438			list_del_init(&cache->io_list);
3439			btrfs_wait_cache_io(trans, cache, path);
3440			btrfs_put_block_group(cache);
3441		}
3442
3443
3444		/*
3445		 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
3446		 * it should update the cache_state.  Don't delete until after
3447		 * we wait.
3448		 *
3449		 * Since we're not running in the commit critical section
3450		 * we need the dirty_bgs_lock to protect from update_block_group
3451		 */
3452		spin_lock(&cur_trans->dirty_bgs_lock);
3453		list_del_init(&cache->dirty_list);
3454		spin_unlock(&cur_trans->dirty_bgs_lock);
3455
3456		should_put = 1;
3457
3458		cache_save_setup(cache, trans, path);
3459
3460		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3461			cache->io_ctl.inode = NULL;
3462			ret = btrfs_write_out_cache(trans, cache, path);
3463			if (ret == 0 && cache->io_ctl.inode) {
3464				should_put = 0;
3465
3466				/*
3467				 * The cache_write_mutex is protecting the
3468				 * io_list, also refer to the definition of
3469				 * btrfs_transaction::io_bgs for more details
3470				 */
3471				list_add_tail(&cache->io_list, io);
3472			} else {
3473				/*
3474				 * If we failed to write the cache, the
3475				 * generation will be bad and life goes on
3476				 */
3477				ret = 0;
3478			}
3479		}
3480		if (!ret) {
3481			ret = update_block_group_item(trans, path, cache);
3482			/*
3483			 * Our block group might still be attached to the list
3484			 * of new block groups in the transaction handle of some
3485			 * other task (struct btrfs_trans_handle->new_bgs). This
3486			 * means its block group item isn't yet in the extent
3487			 * tree. If this happens ignore the error, as we will
3488			 * try again later in the critical section of the
3489			 * transaction commit.
3490			 */
3491			if (ret == -ENOENT) {
3492				ret = 0;
3493				spin_lock(&cur_trans->dirty_bgs_lock);
3494				if (list_empty(&cache->dirty_list)) {
3495					list_add_tail(&cache->dirty_list,
3496						      &cur_trans->dirty_bgs);
3497					btrfs_get_block_group(cache);
3498					drop_reserve = false;
3499				}
3500				spin_unlock(&cur_trans->dirty_bgs_lock);
3501			} else if (ret) {
3502				btrfs_abort_transaction(trans, ret);
3503			}
3504		}
3505
3506		/* If it's not on the io list, we need to put the block group */
3507		if (should_put)
3508			btrfs_put_block_group(cache);
3509		if (drop_reserve)
3510			btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3511		/*
3512		 * Avoid blocking other tasks for too long. It might even save
3513		 * us from writing caches for block groups that are going to be
3514		 * removed.
3515		 */
3516		mutex_unlock(&trans->transaction->cache_write_mutex);
3517		if (ret)
3518			goto out;
3519		mutex_lock(&trans->transaction->cache_write_mutex);
3520	}
3521	mutex_unlock(&trans->transaction->cache_write_mutex);
3522
3523	/*
3524	 * Go through delayed refs for all the stuff we've just kicked off
3525	 * and then loop back (just once)
3526	 */
3527	if (!ret)
3528		ret = btrfs_run_delayed_refs(trans, 0);
3529	if (!ret && loops == 0) {
3530		loops++;
3531		spin_lock(&cur_trans->dirty_bgs_lock);
3532		list_splice_init(&cur_trans->dirty_bgs, &dirty);
3533		/*
3534		 * dirty_bgs_lock protects us from concurrent block group
3535		 * deletes too (not just cache_write_mutex).
3536		 */
3537		if (!list_empty(&dirty)) {
3538			spin_unlock(&cur_trans->dirty_bgs_lock);
3539			goto again;
3540		}
3541		spin_unlock(&cur_trans->dirty_bgs_lock);
3542	}
3543out:
3544	if (ret < 0) {
3545		spin_lock(&cur_trans->dirty_bgs_lock);
3546		list_splice_init(&dirty, &cur_trans->dirty_bgs);
3547		spin_unlock(&cur_trans->dirty_bgs_lock);
3548		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3549	}
3550
3551	return ret;
3552}
3553
3554int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3555{
3556	struct btrfs_fs_info *fs_info = trans->fs_info;
3557	struct btrfs_block_group *cache;
3558	struct btrfs_transaction *cur_trans = trans->transaction;
3559	int ret = 0;
3560	int should_put;
3561	BTRFS_PATH_AUTO_FREE(path);
3562	struct list_head *io = &cur_trans->io_bgs;
3563
3564	path = btrfs_alloc_path();
3565	if (!path)
3566		return -ENOMEM;
3567
3568	/*
3569	 * Even though we are in the critical section of the transaction commit,
3570	 * we can still have concurrent tasks adding elements to this
3571	 * transaction's list of dirty block groups. These tasks correspond to
3572	 * endio free space workers started when writeback finishes for a
3573	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3574	 * allocate new block groups as a result of COWing nodes of the root
3575	 * tree when updating the free space inode. The writeback for the space
3576	 * caches is triggered by an earlier call to
3577	 * btrfs_start_dirty_block_groups() and iterations of the following
3578	 * loop.
3579	 * Also we want to do the cache_save_setup first and then run the
3580	 * delayed refs to make sure we have the best chance at doing this all
3581	 * in one shot.
3582	 */
3583	spin_lock(&cur_trans->dirty_bgs_lock);
3584	while (!list_empty(&cur_trans->dirty_bgs)) {
3585		cache = list_first_entry(&cur_trans->dirty_bgs,
3586					 struct btrfs_block_group,
3587					 dirty_list);
3588
3589		/*
3590		 * This can happen if cache_save_setup re-dirties a block group
3591		 * that is already under IO.  Just wait for it to finish and
3592		 * then do it all again
3593		 */
3594		if (!list_empty(&cache->io_list)) {
3595			spin_unlock(&cur_trans->dirty_bgs_lock);
3596			list_del_init(&cache->io_list);
3597			btrfs_wait_cache_io(trans, cache, path);
3598			btrfs_put_block_group(cache);
3599			spin_lock(&cur_trans->dirty_bgs_lock);
3600		}
3601
3602		/*
3603		 * Don't remove from the dirty list until after we've waited on
3604		 * any pending IO
3605		 */
3606		list_del_init(&cache->dirty_list);
3607		spin_unlock(&cur_trans->dirty_bgs_lock);
3608		should_put = 1;
3609
3610		cache_save_setup(cache, trans, path);
3611
3612		if (!ret)
3613			ret = btrfs_run_delayed_refs(trans, U64_MAX);
3614
3615		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3616			cache->io_ctl.inode = NULL;
3617			ret = btrfs_write_out_cache(trans, cache, path);
3618			if (ret == 0 && cache->io_ctl.inode) {
3619				should_put = 0;
3620				list_add_tail(&cache->io_list, io);
3621			} else {
3622				/*
3623				 * If we failed to write the cache, the
3624				 * generation will be bad and life goes on
3625				 */
3626				ret = 0;
3627			}
3628		}
3629		if (!ret) {
3630			ret = update_block_group_item(trans, path, cache);
3631			/*
3632			 * One of the free space endio workers might have
3633			 * created a new block group while updating a free space
3634			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3635			 * and hasn't released its transaction handle yet, in
3636			 * which case the new block group is still attached to
3637			 * its transaction handle and its creation has not
3638			 * finished yet (no block group item in the extent tree
3639			 * yet, etc). If this is the case, wait for all free
3640			 * space endio workers to finish and retry. This is a
3641			 * very rare case so no need for a more efficient and
3642			 * complex approach.
3643			 */
3644			if (ret == -ENOENT) {
3645				wait_event(cur_trans->writer_wait,
3646				   atomic_read(&cur_trans->num_writers) == 1);
3647				ret = update_block_group_item(trans, path, cache);
3648				if (ret)
3649					btrfs_abort_transaction(trans, ret);
3650			} else if (ret) {
3651				btrfs_abort_transaction(trans, ret);
3652			}
3653		}
3654
3655		/* If its not on the io list, we need to put the block group */
3656		if (should_put)
3657			btrfs_put_block_group(cache);
3658		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3659		spin_lock(&cur_trans->dirty_bgs_lock);
3660	}
3661	spin_unlock(&cur_trans->dirty_bgs_lock);
3662
3663	/*
3664	 * Refer to the definition of io_bgs member for details why it's safe
3665	 * to use it without any locking
3666	 */
3667	while (!list_empty(io)) {
3668		cache = list_first_entry(io, struct btrfs_block_group,
3669					 io_list);
3670		list_del_init(&cache->io_list);
3671		btrfs_wait_cache_io(trans, cache, path);
3672		btrfs_put_block_group(cache);
3673	}
3674
3675	return ret;
3676}
3677
3678int btrfs_update_block_group(struct btrfs_trans_handle *trans,
3679			     u64 bytenr, u64 num_bytes, bool alloc)
3680{
3681	struct btrfs_fs_info *info = trans->fs_info;
3682	struct btrfs_space_info *space_info;
3683	struct btrfs_block_group *cache;
3684	u64 old_val;
3685	bool reclaim = false;
3686	bool bg_already_dirty = true;
3687	int factor;
3688
3689	/* Block accounting for super block */
3690	spin_lock(&info->delalloc_root_lock);
3691	old_val = btrfs_super_bytes_used(info->super_copy);
3692	if (alloc)
3693		old_val += num_bytes;
3694	else
3695		old_val -= num_bytes;
3696	btrfs_set_super_bytes_used(info->super_copy, old_val);
3697	spin_unlock(&info->delalloc_root_lock);
3698
3699	cache = btrfs_lookup_block_group(info, bytenr);
3700	if (!cache)
3701		return -ENOENT;
3702
3703	/* An extent can not span multiple block groups. */
3704	ASSERT(bytenr + num_bytes <= cache->start + cache->length);
3705
3706	space_info = cache->space_info;
3707	factor = btrfs_bg_type_to_factor(cache->flags);
3708
3709	/*
3710	 * If this block group has free space cache written out, we need to make
3711	 * sure to load it if we are removing space.  This is because we need
3712	 * the unpinning stage to actually add the space back to the block group,
3713	 * otherwise we will leak space.
3714	 */
3715	if (!alloc && !btrfs_block_group_done(cache))
3716		btrfs_cache_block_group(cache, true);
3717
3718	spin_lock(&space_info->lock);
3719	spin_lock(&cache->lock);
3720
3721	if (btrfs_test_opt(info, SPACE_CACHE) &&
3722	    cache->disk_cache_state < BTRFS_DC_CLEAR)
3723		cache->disk_cache_state = BTRFS_DC_CLEAR;
3724
3725	old_val = cache->used;
3726	if (alloc) {
3727		old_val += num_bytes;
3728		cache->used = old_val;
3729		cache->reserved -= num_bytes;
3730		cache->reclaim_mark = 0;
3731		space_info->bytes_reserved -= num_bytes;
3732		space_info->bytes_used += num_bytes;
3733		space_info->disk_used += num_bytes * factor;
3734		if (READ_ONCE(space_info->periodic_reclaim))
3735			btrfs_space_info_update_reclaimable(space_info, -num_bytes);
3736		spin_unlock(&cache->lock);
3737		spin_unlock(&space_info->lock);
3738	} else {
3739		old_val -= num_bytes;
3740		cache->used = old_val;
3741		cache->pinned += num_bytes;
3742		btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
3743		space_info->bytes_used -= num_bytes;
3744		space_info->disk_used -= num_bytes * factor;
3745		if (READ_ONCE(space_info->periodic_reclaim))
3746			btrfs_space_info_update_reclaimable(space_info, num_bytes);
3747		else
3748			reclaim = should_reclaim_block_group(cache, num_bytes);
3749
3750		spin_unlock(&cache->lock);
3751		spin_unlock(&space_info->lock);
3752
3753		btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
3754				     bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
3755	}
3756
3757	spin_lock(&trans->transaction->dirty_bgs_lock);
3758	if (list_empty(&cache->dirty_list)) {
3759		list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
3760		bg_already_dirty = false;
3761		btrfs_get_block_group(cache);
3762	}
3763	spin_unlock(&trans->transaction->dirty_bgs_lock);
3764
3765	/*
3766	 * No longer have used bytes in this block group, queue it for deletion.
3767	 * We do this after adding the block group to the dirty list to avoid
3768	 * races between cleaner kthread and space cache writeout.
3769	 */
3770	if (!alloc && old_val == 0) {
3771		if (!btrfs_test_opt(info, DISCARD_ASYNC))
3772			btrfs_mark_bg_unused(cache);
3773	} else if (!alloc && reclaim) {
3774		btrfs_mark_bg_to_reclaim(cache);
3775	}
3776
3777	btrfs_put_block_group(cache);
3778
3779	/* Modified block groups are accounted for in the delayed_refs_rsv. */
3780	if (!bg_already_dirty)
3781		btrfs_inc_delayed_refs_rsv_bg_updates(info);
3782
3783	return 0;
3784}
3785
3786/*
3787 * Update the block_group and space info counters.
3788 *
3789 * @cache:	The cache we are manipulating
3790 * @ram_bytes:  The number of bytes of file content, and will be same to
3791 *              @num_bytes except for the compress path.
3792 * @num_bytes:	The number of bytes in question
3793 * @delalloc:   The blocks are allocated for the delalloc write
3794 *
3795 * This is called by the allocator when it reserves space. If this is a
3796 * reservation and the block group has become read only we cannot make the
3797 * reservation and return -EAGAIN, otherwise this function always succeeds.
3798 */
3799int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
3800			     u64 ram_bytes, u64 num_bytes, bool delalloc,
3801			     bool force_wrong_size_class)
3802{
3803	struct btrfs_space_info *space_info = cache->space_info;
3804	enum btrfs_block_group_size_class size_class;
3805	int ret = 0;
3806
3807	spin_lock(&space_info->lock);
3808	spin_lock(&cache->lock);
3809	if (cache->ro) {
3810		ret = -EAGAIN;
3811		goto out_error;
3812	}
3813
3814	if (btrfs_block_group_should_use_size_class(cache)) {
3815		size_class = btrfs_calc_block_group_size_class(num_bytes);
3816		ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
3817		if (ret)
3818			goto out_error;
3819	}
3820
3821	cache->reserved += num_bytes;
3822	if (delalloc)
3823		cache->delalloc_bytes += num_bytes;
3824
3825	trace_btrfs_space_reservation(cache->fs_info, "space_info",
3826				      space_info->flags, num_bytes, 1);
3827	spin_unlock(&cache->lock);
3828
3829	space_info->bytes_reserved += num_bytes;
3830	btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
3831
3832	/*
3833	 * Compression can use less space than we reserved, so wake tickets if
3834	 * that happens.
3835	 */
3836	if (num_bytes < ram_bytes)
3837		btrfs_try_granting_tickets(space_info);
3838	spin_unlock(&space_info->lock);
3839
3840	return 0;
3841
3842out_error:
3843	spin_unlock(&cache->lock);
3844	spin_unlock(&space_info->lock);
3845	return ret;
3846}
3847
3848/*
3849 * Update the block_group and space info counters.
3850 *
3851 * @cache:       The cache we are manipulating.
3852 * @num_bytes:   The number of bytes in question.
3853 * @is_delalloc: Whether the blocks are allocated for a delalloc write.
3854 *
3855 * This is called by somebody who is freeing space that was never actually used
3856 * on disk.  For example if you reserve some space for a new leaf in transaction
3857 * A and before transaction A commits you free that leaf, you call this with
3858 * reserve set to 0 in order to clear the reservation.
3859 */
3860void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
3861			       bool is_delalloc)
3862{
3863	struct btrfs_space_info *space_info = cache->space_info;
3864	bool bg_ro;
3865
3866	spin_lock(&space_info->lock);
3867	spin_lock(&cache->lock);
3868	bg_ro = cache->ro;
3869	cache->reserved -= num_bytes;
3870	if (is_delalloc)
3871		cache->delalloc_bytes -= num_bytes;
3872	spin_unlock(&cache->lock);
3873
3874	if (bg_ro)
3875		space_info->bytes_readonly += num_bytes;
3876	else if (btrfs_is_zoned(cache->fs_info))
3877		space_info->bytes_zone_unusable += num_bytes;
3878
3879	space_info->bytes_reserved -= num_bytes;
3880	space_info->max_extent_size = 0;
3881
3882	btrfs_try_granting_tickets(space_info);
3883	spin_unlock(&space_info->lock);
3884}
3885
3886static void force_metadata_allocation(struct btrfs_fs_info *info)
3887{
3888	struct list_head *head = &info->space_info;
3889	struct btrfs_space_info *found;
3890
3891	list_for_each_entry(found, head, list) {
3892		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3893			found->force_alloc = CHUNK_ALLOC_FORCE;
3894	}
3895}
3896
3897static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info,
3898			       const struct btrfs_space_info *sinfo, int force)
3899{
3900	u64 bytes_used = btrfs_space_info_used(sinfo, false);
3901	u64 thresh;
3902
3903	if (force == CHUNK_ALLOC_FORCE)
3904		return true;
3905
3906	/*
3907	 * in limited mode, we want to have some free space up to
3908	 * about 1% of the FS size.
3909	 */
3910	if (force == CHUNK_ALLOC_LIMITED) {
3911		thresh = btrfs_super_total_bytes(fs_info->super_copy);
3912		thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
3913
3914		if (sinfo->total_bytes - bytes_used < thresh)
3915			return true;
3916	}
3917
3918	if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
3919		return false;
3920	return true;
3921}
3922
3923int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3924{
3925	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3926	struct btrfs_space_info *space_info;
3927
3928	space_info = btrfs_find_space_info(trans->fs_info, type);
3929	if (!space_info) {
3930		DEBUG_WARN();
3931		return -EINVAL;
3932	}
3933
3934	return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
3935}
3936
3937static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
3938						struct btrfs_space_info *space_info,
3939						u64 flags)
3940{
3941	struct btrfs_block_group *bg;
3942	int ret;
3943
3944	/*
3945	 * Check if we have enough space in the system space info because we
3946	 * will need to update device items in the chunk btree and insert a new
3947	 * chunk item in the chunk btree as well. This will allocate a new
3948	 * system block group if needed.
3949	 */
3950	check_system_chunk(trans, flags);
3951
3952	bg = btrfs_create_chunk(trans, space_info, flags);
3953	if (IS_ERR(bg)) {
3954		ret = PTR_ERR(bg);
3955		goto out;
3956	}
3957
3958	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3959	/*
3960	 * Normally we are not expected to fail with -ENOSPC here, since we have
3961	 * previously reserved space in the system space_info and allocated one
3962	 * new system chunk if necessary. However there are three exceptions:
3963	 *
3964	 * 1) We may have enough free space in the system space_info but all the
3965	 *    existing system block groups have a profile which can not be used
3966	 *    for extent allocation.
3967	 *
3968	 *    This happens when mounting in degraded mode. For example we have a
3969	 *    RAID1 filesystem with 2 devices, lose one device and mount the fs
3970	 *    using the other device in degraded mode. If we then allocate a chunk,
3971	 *    we may have enough free space in the existing system space_info, but
3972	 *    none of the block groups can be used for extent allocation since they
3973	 *    have a RAID1 profile, and because we are in degraded mode with a
3974	 *    single device, we are forced to allocate a new system chunk with a
3975	 *    SINGLE profile. Making check_system_chunk() iterate over all system
3976	 *    block groups and check if they have a usable profile and enough space
3977	 *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
3978	 *    try again after forcing allocation of a new system chunk. Like this
3979	 *    we avoid paying the cost of that search in normal circumstances, when
3980	 *    we were not mounted in degraded mode;
3981	 *
3982	 * 2) We had enough free space info the system space_info, and one suitable
3983	 *    block group to allocate from when we called check_system_chunk()
3984	 *    above. However right after we called it, the only system block group
3985	 *    with enough free space got turned into RO mode by a running scrub,
3986	 *    and in this case we have to allocate a new one and retry. We only
3987	 *    need do this allocate and retry once, since we have a transaction
3988	 *    handle and scrub uses the commit root to search for block groups;
3989	 *
3990	 * 3) We had one system block group with enough free space when we called
3991	 *    check_system_chunk(), but after that, right before we tried to
3992	 *    allocate the last extent buffer we needed, a discard operation came
3993	 *    in and it temporarily removed the last free space entry from the
3994	 *    block group (discard removes a free space entry, discards it, and
3995	 *    then adds back the entry to the block group cache).
3996	 */
3997	if (ret == -ENOSPC) {
3998		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3999		struct btrfs_block_group *sys_bg;
4000		struct btrfs_space_info *sys_space_info;
4001
4002		sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
4003		if (unlikely(!sys_space_info)) {
4004			ret = -EINVAL;
4005			btrfs_abort_transaction(trans, ret);
4006			goto out;
4007		}
4008
4009		sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags);
4010		if (IS_ERR(sys_bg)) {
4011			ret = PTR_ERR(sys_bg);
4012			btrfs_abort_transaction(trans, ret);
4013			goto out;
4014		}
4015
4016		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
4017		if (unlikely(ret)) {
4018			btrfs_abort_transaction(trans, ret);
4019			goto out;
4020		}
4021
4022		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
4023		if (unlikely(ret)) {
4024			btrfs_abort_transaction(trans, ret);
4025			goto out;
4026		}
4027	} else if (unlikely(ret)) {
4028		btrfs_abort_transaction(trans, ret);
4029		goto out;
4030	}
4031out:
4032	btrfs_trans_release_chunk_metadata(trans);
4033
4034	if (ret)
4035		return ERR_PTR(ret);
4036
4037	btrfs_get_block_group(bg);
4038	return bg;
4039}
4040
4041/*
4042 * Chunk allocation is done in 2 phases:
4043 *
4044 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
4045 *    the chunk, the chunk mapping, create its block group and add the items
4046 *    that belong in the chunk btree to it - more specifically, we need to
4047 *    update device items in the chunk btree and add a new chunk item to it.
4048 *
4049 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
4050 *    group item to the extent btree and the device extent items to the devices
4051 *    btree.
4052 *
4053 * This is done to prevent deadlocks. For example when COWing a node from the
4054 * extent btree we are holding a write lock on the node's parent and if we
4055 * trigger chunk allocation and attempted to insert the new block group item
4056 * in the extent btree right away, we could deadlock because the path for the
4057 * insertion can include that parent node. At first glance it seems impossible
4058 * to trigger chunk allocation after starting a transaction since tasks should
4059 * reserve enough transaction units (metadata space), however while that is true
4060 * most of the time, chunk allocation may still be triggered for several reasons:
4061 *
4062 * 1) When reserving metadata, we check if there is enough free space in the
4063 *    metadata space_info and therefore don't trigger allocation of a new chunk.
4064 *    However later when the task actually tries to COW an extent buffer from
4065 *    the extent btree or from the device btree for example, it is forced to
4066 *    allocate a new block group (chunk) because the only one that had enough
4067 *    free space was just turned to RO mode by a running scrub for example (or
4068 *    device replace, block group reclaim thread, etc), so we can not use it
4069 *    for allocating an extent and end up being forced to allocate a new one;
4070 *
4071 * 2) Because we only check that the metadata space_info has enough free bytes,
4072 *    we end up not allocating a new metadata chunk in that case. However if
4073 *    the filesystem was mounted in degraded mode, none of the existing block
4074 *    groups might be suitable for extent allocation due to their incompatible
4075 *    profile (for e.g. mounting a 2 devices filesystem, where all block groups
4076 *    use a RAID1 profile, in degraded mode using a single device). In this case
4077 *    when the task attempts to COW some extent buffer of the extent btree for
4078 *    example, it will trigger allocation of a new metadata block group with a
4079 *    suitable profile (SINGLE profile in the example of the degraded mount of
4080 *    the RAID1 filesystem);
4081 *
4082 * 3) The task has reserved enough transaction units / metadata space, but when
4083 *    it attempts to COW an extent buffer from the extent or device btree for
4084 *    example, it does not find any free extent in any metadata block group,
4085 *    therefore forced to try to allocate a new metadata block group.
4086 *    This is because some other task allocated all available extents in the
4087 *    meanwhile - this typically happens with tasks that don't reserve space
4088 *    properly, either intentionally or as a bug. One example where this is
4089 *    done intentionally is fsync, as it does not reserve any transaction units
4090 *    and ends up allocating a variable number of metadata extents for log
4091 *    tree extent buffers;
4092 *
4093 * 4) The task has reserved enough transaction units / metadata space, but right
4094 *    before it tries to allocate the last extent buffer it needs, a discard
4095 *    operation comes in and, temporarily, removes the last free space entry from
4096 *    the only metadata block group that had free space (discard starts by
4097 *    removing a free space entry from a block group, then does the discard
4098 *    operation and, once it's done, it adds back the free space entry to the
4099 *    block group).
4100 *
4101 * We also need this 2-phase setup when adding a device to a filesystem with
4102 * a seed device - we must create new metadata and system chunks without adding
4103 * any of the block group items to the chunk, extent and device btrees. If we
4104 * did not do it this way, we would get ENOSPC when attempting to update those
4105 * btrees, since all the chunks from the seed device are read-only.
4106 *
4107 * Phase 1 does the updates and insertions to the chunk btree because if we had
4108 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
4109 * parallel, we risk having too many system chunks allocated by many tasks if
4110 * many tasks reach phase 1 without the previous ones completing phase 2. In the
4111 * extreme case this leads to exhaustion of the system chunk array in the
4112 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
4113 * and with RAID filesystems (so we have more device items in the chunk btree).
4114 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
4115 * the system chunk array due to concurrent allocations") provides more details.
4116 *
4117 * Allocation of system chunks does not happen through this function. A task that
4118 * needs to update the chunk btree (the only btree that uses system chunks), must
4119 * preallocate chunk space by calling either check_system_chunk() or
4120 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
4121 * metadata chunk or when removing a chunk, while the latter is used before doing
4122 * a modification to the chunk btree - use cases for the latter are adding,
4123 * removing and resizing a device as well as relocation of a system chunk.
4124 * See the comment below for more details.
4125 *
4126 * The reservation of system space, done through check_system_chunk(), as well
4127 * as all the updates and insertions into the chunk btree must be done while
4128 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
4129 * an extent buffer from the chunks btree we never trigger allocation of a new
4130 * system chunk, which would result in a deadlock (trying to lock twice an
4131 * extent buffer of the chunk btree, first time before triggering the chunk
4132 * allocation and the second time during chunk allocation while attempting to
4133 * update the chunks btree). The system chunk array is also updated while holding
4134 * that mutex. The same logic applies to removing chunks - we must reserve system
4135 * space, update the chunk btree and the system chunk array in the superblock
4136 * while holding fs_info->chunk_mutex.
4137 *
4138 * This function, btrfs_chunk_alloc(), belongs to phase 1.
4139 *
4140 * @space_info: specify which space_info the new chunk should belong to.
4141 *
4142 * If @force is CHUNK_ALLOC_FORCE:
4143 *    - return 1 if it successfully allocates a chunk,
4144 *    - return errors including -ENOSPC otherwise.
4145 * If @force is NOT CHUNK_ALLOC_FORCE:
4146 *    - return 0 if it doesn't need to allocate a new chunk,
4147 *    - return 1 if it successfully allocates a chunk,
4148 *    - return errors including -ENOSPC otherwise.
4149 */
4150int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
4151		      struct btrfs_space_info *space_info, u64 flags,
4152		      enum btrfs_chunk_alloc_enum force)
4153{
4154	struct btrfs_fs_info *fs_info = trans->fs_info;
4155	struct btrfs_block_group *ret_bg;
4156	bool wait_for_alloc = false;
4157	bool should_alloc = false;
4158	bool from_extent_allocation = false;
4159	int ret = 0;
4160
4161	if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
4162		from_extent_allocation = true;
4163		force = CHUNK_ALLOC_FORCE;
4164	}
4165
4166	/* Don't re-enter if we're already allocating a chunk */
4167	if (trans->allocating_chunk)
4168		return -ENOSPC;
4169	/*
4170	 * Allocation of system chunks can not happen through this path, as we
4171	 * could end up in a deadlock if we are allocating a data or metadata
4172	 * chunk and there is another task modifying the chunk btree.
4173	 *
4174	 * This is because while we are holding the chunk mutex, we will attempt
4175	 * to add the new chunk item to the chunk btree or update an existing
4176	 * device item in the chunk btree, while the other task that is modifying
4177	 * the chunk btree is attempting to COW an extent buffer while holding a
4178	 * lock on it and on its parent - if the COW operation triggers a system
4179	 * chunk allocation, then we can deadlock because we are holding the
4180	 * chunk mutex and we may need to access that extent buffer or its parent
4181	 * in order to add the chunk item or update a device item.
4182	 *
4183	 * Tasks that want to modify the chunk tree should reserve system space
4184	 * before updating the chunk btree, by calling either
4185	 * btrfs_reserve_chunk_metadata() or check_system_chunk().
4186	 * It's possible that after a task reserves the space, it still ends up
4187	 * here - this happens in the cases described above at do_chunk_alloc().
4188	 * The task will have to either retry or fail.
4189	 */
4190	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4191		return -ENOSPC;
4192
4193	do {
4194		spin_lock(&space_info->lock);
4195		if (force < space_info->force_alloc)
4196			force = space_info->force_alloc;
4197		should_alloc = should_alloc_chunk(fs_info, space_info, force);
4198		if (space_info->full) {
4199			/* No more free physical space */
4200			spin_unlock(&space_info->lock);
4201			if (should_alloc)
4202				ret = -ENOSPC;
4203			else
4204				ret = 0;
4205			return ret;
4206		} else if (!should_alloc) {
4207			spin_unlock(&space_info->lock);
4208			return 0;
4209		} else if (space_info->chunk_alloc) {
4210			/*
4211			 * Someone is already allocating, so we need to block
4212			 * until this someone is finished and then loop to
4213			 * recheck if we should continue with our allocation
4214			 * attempt.
4215			 */
4216			spin_unlock(&space_info->lock);
4217			wait_for_alloc = true;
4218			force = CHUNK_ALLOC_NO_FORCE;
4219			mutex_lock(&fs_info->chunk_mutex);
4220			mutex_unlock(&fs_info->chunk_mutex);
4221		} else {
4222			/* Proceed with allocation */
4223			space_info->chunk_alloc = true;
4224			spin_unlock(&space_info->lock);
4225			wait_for_alloc = false;
4226		}
4227
4228		cond_resched();
4229	} while (wait_for_alloc);
4230
4231	mutex_lock(&fs_info->chunk_mutex);
4232	trans->allocating_chunk = true;
4233
4234	/*
4235	 * If we have mixed data/metadata chunks we want to make sure we keep
4236	 * allocating mixed chunks instead of individual chunks.
4237	 */
4238	if (btrfs_mixed_space_info(space_info))
4239		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4240
4241	/*
4242	 * if we're doing a data chunk, go ahead and make sure that
4243	 * we keep a reasonable number of metadata chunks allocated in the
4244	 * FS as well.
4245	 */
4246	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4247		fs_info->data_chunk_allocations++;
4248		if (!(fs_info->data_chunk_allocations %
4249		      fs_info->metadata_ratio))
4250			force_metadata_allocation(fs_info);
4251	}
4252
4253	ret_bg = do_chunk_alloc(trans, space_info, flags);
4254	trans->allocating_chunk = false;
4255
4256	if (IS_ERR(ret_bg)) {
4257		ret = PTR_ERR(ret_bg);
4258	} else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) {
4259		/*
4260		 * New block group is likely to be used soon. Try to activate
4261		 * it now. Failure is OK for now.
4262		 */
4263		btrfs_zone_activate(ret_bg);
4264	}
4265
4266	if (!ret)
4267		btrfs_put_block_group(ret_bg);
4268
4269	spin_lock(&space_info->lock);
4270	if (ret < 0) {
4271		if (ret == -ENOSPC)
4272			space_info->full = true;
4273		else
4274			goto out;
4275	} else {
4276		ret = 1;
4277		space_info->max_extent_size = 0;
4278	}
4279
4280	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4281out:
4282	space_info->chunk_alloc = false;
4283	spin_unlock(&space_info->lock);
4284	mutex_unlock(&fs_info->chunk_mutex);
4285
4286	return ret;
4287}
4288
4289static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type)
4290{
4291	u64 num_dev;
4292
4293	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4294	if (!num_dev)
4295		num_dev = fs_info->fs_devices->rw_devices;
4296
4297	return num_dev;
4298}
4299
4300static void reserve_chunk_space(struct btrfs_trans_handle *trans,
4301				u64 bytes,
4302				u64 type)
4303{
4304	struct btrfs_fs_info *fs_info = trans->fs_info;
4305	struct btrfs_space_info *info;
4306	u64 left;
4307	int ret = 0;
4308
4309	/*
4310	 * Needed because we can end up allocating a system chunk and for an
4311	 * atomic and race free space reservation in the chunk block reserve.
4312	 */
4313	lockdep_assert_held(&fs_info->chunk_mutex);
4314
4315	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4316	spin_lock(&info->lock);
4317	left = info->total_bytes - btrfs_space_info_used(info, true);
4318	spin_unlock(&info->lock);
4319
4320	if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4321		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4322			   left, bytes, type);
4323		btrfs_dump_space_info(info, 0, false);
4324	}
4325
4326	if (left < bytes) {
4327		u64 flags = btrfs_system_alloc_profile(fs_info);
4328		struct btrfs_block_group *bg;
4329		struct btrfs_space_info *space_info;
4330
4331		space_info = btrfs_find_space_info(fs_info, flags);
4332		ASSERT(space_info);
4333
4334		/*
4335		 * Ignore failure to create system chunk. We might end up not
4336		 * needing it, as we might not need to COW all nodes/leafs from
4337		 * the paths we visit in the chunk tree (they were already COWed
4338		 * or created in the current transaction for example).
4339		 */
4340		bg = btrfs_create_chunk(trans, space_info, flags);
4341		if (IS_ERR(bg)) {
4342			ret = PTR_ERR(bg);
4343		} else {
4344			/*
4345			 * We have a new chunk. We also need to activate it for
4346			 * zoned filesystem.
4347			 */
4348			ret = btrfs_zoned_activate_one_bg(info, true);
4349			if (ret < 0)
4350				return;
4351
4352			/*
4353			 * If we fail to add the chunk item here, we end up
4354			 * trying again at phase 2 of chunk allocation, at
4355			 * btrfs_create_pending_block_groups(). So ignore
4356			 * any error here. An ENOSPC here could happen, due to
4357			 * the cases described at do_chunk_alloc() - the system
4358			 * block group we just created was just turned into RO
4359			 * mode by a scrub for example, or a running discard
4360			 * temporarily removed its free space entries, etc.
4361			 */
4362			btrfs_chunk_alloc_add_chunk_item(trans, bg);
4363		}
4364	}
4365
4366	if (!ret) {
4367		ret = btrfs_block_rsv_add(fs_info,
4368					  &fs_info->chunk_block_rsv,
4369					  bytes, BTRFS_RESERVE_NO_FLUSH);
4370		if (!ret)
4371			trans->chunk_bytes_reserved += bytes;
4372	}
4373}
4374
4375/*
4376 * Reserve space in the system space for allocating or removing a chunk.
4377 * The caller must be holding fs_info->chunk_mutex.
4378 */
4379void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4380{
4381	struct btrfs_fs_info *fs_info = trans->fs_info;
4382	const u64 num_devs = get_profile_num_devs(fs_info, type);
4383	u64 bytes;
4384
4385	/* num_devs device items to update and 1 chunk item to add or remove. */
4386	bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
4387		btrfs_calc_insert_metadata_size(fs_info, 1);
4388
4389	reserve_chunk_space(trans, bytes, type);
4390}
4391
4392/*
4393 * Reserve space in the system space, if needed, for doing a modification to the
4394 * chunk btree.
4395 *
4396 * @trans:		A transaction handle.
4397 * @is_item_insertion:	Indicate if the modification is for inserting a new item
4398 *			in the chunk btree or if it's for the deletion or update
4399 *			of an existing item.
4400 *
4401 * This is used in a context where we need to update the chunk btree outside
4402 * block group allocation and removal, to avoid a deadlock with a concurrent
4403 * task that is allocating a metadata or data block group and therefore needs to
4404 * update the chunk btree while holding the chunk mutex. After the update to the
4405 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
4406 *
4407 */
4408void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
4409				  bool is_item_insertion)
4410{
4411	struct btrfs_fs_info *fs_info = trans->fs_info;
4412	u64 bytes;
4413
4414	if (is_item_insertion)
4415		bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
4416	else
4417		bytes = btrfs_calc_metadata_size(fs_info, 1);
4418
4419	mutex_lock(&fs_info->chunk_mutex);
4420	reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
4421	mutex_unlock(&fs_info->chunk_mutex);
4422}
4423
4424void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
4425{
4426	struct btrfs_block_group *block_group;
4427
4428	block_group = btrfs_lookup_first_block_group(info, 0);
4429	while (block_group) {
4430		btrfs_wait_block_group_cache_done(block_group);
4431		spin_lock(&block_group->lock);
4432		if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
4433				       &block_group->runtime_flags)) {
4434			struct btrfs_inode *inode = block_group->inode;
4435
4436			block_group->inode = NULL;
4437			spin_unlock(&block_group->lock);
4438
4439			ASSERT(block_group->io_ctl.inode == NULL);
4440			iput(&inode->vfs_inode);
4441		} else {
4442			spin_unlock(&block_group->lock);
4443		}
4444		block_group = btrfs_next_block_group(block_group);
4445	}
4446}
4447
4448static void check_removing_space_info(struct btrfs_space_info *space_info)
4449{
4450	struct btrfs_fs_info *info = space_info->fs_info;
4451
4452	if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) {
4453		/* This is a top space_info, proceed with its children first. */
4454		for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
4455			if (space_info->sub_group[i]) {
4456				check_removing_space_info(space_info->sub_group[i]);
4457				kfree(space_info->sub_group[i]);
4458				space_info->sub_group[i] = NULL;
4459			}
4460		}
4461	}
4462
4463	/*
4464	 * Do not hide this behind enospc_debug, this is actually important and
4465	 * indicates a real bug if this happens.
4466	 */
4467	if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0))
4468		btrfs_dump_space_info(space_info, 0, false);
4469
4470	/*
4471	 * If there was a failure to cleanup a log tree, very likely due to an
4472	 * IO failure on a writeback attempt of one or more of its extent
4473	 * buffers, we could not do proper (and cheap) unaccounting of their
4474	 * reserved space, so don't warn on bytes_reserved > 0 in that case.
4475	 */
4476	if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
4477	    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
4478		if (WARN_ON(space_info->bytes_reserved > 0))
4479			btrfs_dump_space_info(space_info, 0, false);
4480	}
4481
4482	WARN_ON(space_info->reclaim_size > 0);
4483}
4484
4485/*
4486 * Must be called only after stopping all workers, since we could have block
4487 * group caching kthreads running, and therefore they could race with us if we
4488 * freed the block groups before stopping them.
4489 */
4490int btrfs_free_block_groups(struct btrfs_fs_info *info)
4491{
4492	struct btrfs_block_group *block_group;
4493	struct btrfs_space_info *space_info;
4494	struct btrfs_caching_control *caching_ctl;
4495	struct rb_node *n;
4496
4497	if (btrfs_is_zoned(info)) {
4498		if (info->active_meta_bg) {
4499			btrfs_put_block_group(info->active_meta_bg);
4500			info->active_meta_bg = NULL;
4501		}
4502		if (info->active_system_bg) {
4503			btrfs_put_block_group(info->active_system_bg);
4504			info->active_system_bg = NULL;
4505		}
4506	}
4507
4508	write_lock(&info->block_group_cache_lock);
4509	while (!list_empty(&info->caching_block_groups)) {
4510		caching_ctl = list_first_entry(&info->caching_block_groups,
4511					       struct btrfs_caching_control, list);
4512		list_del(&caching_ctl->list);
4513		btrfs_put_caching_control(caching_ctl);
4514	}
4515	write_unlock(&info->block_group_cache_lock);
4516
4517	spin_lock(&info->unused_bgs_lock);
4518	while (!list_empty(&info->unused_bgs)) {
4519		block_group = list_first_entry(&info->unused_bgs,
4520					       struct btrfs_block_group,
4521					       bg_list);
4522		list_del_init(&block_group->bg_list);
4523		btrfs_put_block_group(block_group);
4524	}
4525
4526	while (!list_empty(&info->reclaim_bgs)) {
4527		block_group = list_first_entry(&info->reclaim_bgs,
4528					       struct btrfs_block_group,
4529					       bg_list);
4530		list_del_init(&block_group->bg_list);
4531		btrfs_put_block_group(block_group);
4532	}
4533	spin_unlock(&info->unused_bgs_lock);
4534
4535	spin_lock(&info->zone_active_bgs_lock);
4536	while (!list_empty(&info->zone_active_bgs)) {
4537		block_group = list_first_entry(&info->zone_active_bgs,
4538					       struct btrfs_block_group,
4539					       active_bg_list);
4540		list_del_init(&block_group->active_bg_list);
4541		btrfs_put_block_group(block_group);
4542	}
4543	spin_unlock(&info->zone_active_bgs_lock);
4544
4545	write_lock(&info->block_group_cache_lock);
4546	while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
4547		block_group = rb_entry(n, struct btrfs_block_group,
4548				       cache_node);
4549		rb_erase_cached(&block_group->cache_node,
4550				&info->block_group_cache_tree);
4551		RB_CLEAR_NODE(&block_group->cache_node);
4552		write_unlock(&info->block_group_cache_lock);
4553
4554		down_write(&block_group->space_info->groups_sem);
4555		list_del(&block_group->list);
4556		up_write(&block_group->space_info->groups_sem);
4557
4558		/*
4559		 * We haven't cached this block group, which means we could
4560		 * possibly have excluded extents on this block group.
4561		 */
4562		if (block_group->cached == BTRFS_CACHE_NO ||
4563		    block_group->cached == BTRFS_CACHE_ERROR)
4564			btrfs_free_excluded_extents(block_group);
4565
4566		btrfs_remove_free_space_cache(block_group);
4567		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
4568		ASSERT(list_empty(&block_group->dirty_list));
4569		ASSERT(list_empty(&block_group->io_list));
4570		ASSERT(list_empty(&block_group->bg_list));
4571		ASSERT(refcount_read(&block_group->refs) == 1);
4572		ASSERT(block_group->swap_extents == 0);
4573		btrfs_put_block_group(block_group);
4574
4575		write_lock(&info->block_group_cache_lock);
4576	}
4577	write_unlock(&info->block_group_cache_lock);
4578
4579	btrfs_release_global_block_rsv(info);
4580
4581	while (!list_empty(&info->space_info)) {
4582		space_info = list_first_entry(&info->space_info,
4583					      struct btrfs_space_info, list);
4584
4585		check_removing_space_info(space_info);
4586		list_del(&space_info->list);
4587		btrfs_sysfs_remove_space_info(space_info);
4588	}
4589	return 0;
4590}
4591
4592void btrfs_freeze_block_group(struct btrfs_block_group *cache)
4593{
4594	atomic_inc(&cache->frozen);
4595}
4596
4597void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
4598{
4599	struct btrfs_fs_info *fs_info = block_group->fs_info;
4600	bool cleanup;
4601
4602	spin_lock(&block_group->lock);
4603	cleanup = (atomic_dec_and_test(&block_group->frozen) &&
4604		   test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
4605	spin_unlock(&block_group->lock);
4606
4607	if (cleanup) {
4608		struct btrfs_chunk_map *map;
4609
4610		map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
4611		/* Logic error, can't happen. */
4612		ASSERT(map);
4613
4614		btrfs_remove_chunk_map(fs_info, map);
4615
4616		/* Once for our lookup reference. */
4617		btrfs_free_chunk_map(map);
4618
4619		/*
4620		 * We may have left one free space entry and other possible
4621		 * tasks trimming this block group have left 1 entry each one.
4622		 * Free them if any.
4623		 */
4624		btrfs_remove_free_space_cache(block_group);
4625	}
4626}
4627
4628bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4629{
4630	bool ret = true;
4631
4632	spin_lock(&bg->lock);
4633	if (bg->ro)
4634		ret = false;
4635	else
4636		bg->swap_extents++;
4637	spin_unlock(&bg->lock);
4638
4639	return ret;
4640}
4641
4642void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4643{
4644	spin_lock(&bg->lock);
4645	ASSERT(!bg->ro);
4646	ASSERT(bg->swap_extents >= amount);
4647	bg->swap_extents -= amount;
4648	spin_unlock(&bg->lock);
4649}
4650
4651enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
4652{
4653	if (size <= SZ_128K)
4654		return BTRFS_BG_SZ_SMALL;
4655	if (size <= SZ_8M)
4656		return BTRFS_BG_SZ_MEDIUM;
4657	return BTRFS_BG_SZ_LARGE;
4658}
4659
4660/*
4661 * Handle a block group allocating an extent in a size class
4662 *
4663 * @bg:				The block group we allocated in.
4664 * @size_class:			The size class of the allocation.
4665 * @force_wrong_size_class:	Whether we are desperate enough to allow
4666 *				mismatched size classes.
4667 *
4668 * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
4669 * case of a race that leads to the wrong size class without
4670 * force_wrong_size_class set.
4671 *
4672 * find_free_extent will skip block groups with a mismatched size class until
4673 * it really needs to avoid ENOSPC. In that case it will set
4674 * force_wrong_size_class. However, if a block group is newly allocated and
4675 * doesn't yet have a size class, then it is possible for two allocations of
4676 * different sizes to race and both try to use it. The loser is caught here and
4677 * has to retry.
4678 */
4679int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
4680				     enum btrfs_block_group_size_class size_class,
4681				     bool force_wrong_size_class)
4682{
4683	ASSERT(size_class != BTRFS_BG_SZ_NONE);
4684
4685	/* The new allocation is in the right size class, do nothing */
4686	if (bg->size_class == size_class)
4687		return 0;
4688	/*
4689	 * The new allocation is in a mismatched size class.
4690	 * This means one of two things:
4691	 *
4692	 * 1. Two tasks in find_free_extent for different size_classes raced
4693	 *    and hit the same empty block_group. Make the loser try again.
4694	 * 2. A call to find_free_extent got desperate enough to set
4695	 *    'force_wrong_slab'. Don't change the size_class, but allow the
4696	 *    allocation.
4697	 */
4698	if (bg->size_class != BTRFS_BG_SZ_NONE) {
4699		if (force_wrong_size_class)
4700			return 0;
4701		return -EAGAIN;
4702	}
4703	/*
4704	 * The happy new block group case: the new allocation is the first
4705	 * one in the block_group so we set size_class.
4706	 */
4707	bg->size_class = size_class;
4708
4709	return 0;
4710}
4711
4712bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
4713{
4714	if (btrfs_is_zoned(bg->fs_info))
4715		return false;
4716	if (!btrfs_is_block_group_data_only(bg))
4717		return false;
4718	return true;
4719}