fs/btrfs/block-group.c at v5.19-rc6

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / btrfs / block-group.c
at v5.19-rc6 4171 lines 126 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/list_sort.h>
   4#include "misc.h"
   5#include "ctree.h"
   6#include "block-group.h"
   7#include "space-info.h"
   8#include "disk-io.h"
   9#include "free-space-cache.h"
  10#include "free-space-tree.h"
  11#include "volumes.h"
  12#include "transaction.h"
  13#include "ref-verify.h"
  14#include "sysfs.h"
  15#include "tree-log.h"
  16#include "delalloc-space.h"
  17#include "discard.h"
  18#include "raid56.h"
  19#include "zoned.h"
  20
  21/*
  22 * Return target flags in extended format or 0 if restripe for this chunk_type
  23 * is not in progress
  24 *
  25 * Should be called with balance_lock held
  26 */
  27static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  28{
  29	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  30	u64 target = 0;
  31
  32	if (!bctl)
  33		return 0;
  34
  35	if (flags & BTRFS_BLOCK_GROUP_DATA &&
  36	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  37		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
  38	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
  39		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  40		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
  41	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
  42		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  43		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
  44	}
  45
  46	return target;
  47}
  48
  49/*
  50 * @flags: available profiles in extended format (see ctree.h)
  51 *
  52 * Return reduced profile in chunk format.  If profile changing is in progress
  53 * (either running or paused) picks the target profile (if it's already
  54 * available), otherwise falls back to plain reducing.
  55 */
  56static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
  57{
  58	u64 num_devices = fs_info->fs_devices->rw_devices;
  59	u64 target;
  60	u64 raid_type;
  61	u64 allowed = 0;
  62
  63	/*
  64	 * See if restripe for this chunk_type is in progress, if so try to
  65	 * reduce to the target profile
  66	 */
  67	spin_lock(&fs_info->balance_lock);
  68	target = get_restripe_target(fs_info, flags);
  69	if (target) {
  70		spin_unlock(&fs_info->balance_lock);
  71		return extended_to_chunk(target);
  72	}
  73	spin_unlock(&fs_info->balance_lock);
  74
  75	/* First, mask out the RAID levels which aren't possible */
  76	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
  77		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
  78			allowed |= btrfs_raid_array[raid_type].bg_flag;
  79	}
  80	allowed &= flags;
  81
  82	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
  83		allowed = BTRFS_BLOCK_GROUP_RAID6;
  84	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
  85		allowed = BTRFS_BLOCK_GROUP_RAID5;
  86	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
  87		allowed = BTRFS_BLOCK_GROUP_RAID10;
  88	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
  89		allowed = BTRFS_BLOCK_GROUP_RAID1;
  90	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
  91		allowed = BTRFS_BLOCK_GROUP_RAID0;
  92
  93	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
  94
  95	return extended_to_chunk(flags | allowed);
  96}
  97
  98u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
  99{
 100	unsigned seq;
 101	u64 flags;
 102
 103	do {
 104		flags = orig_flags;
 105		seq = read_seqbegin(&fs_info->profiles_lock);
 106
 107		if (flags & BTRFS_BLOCK_GROUP_DATA)
 108			flags |= fs_info->avail_data_alloc_bits;
 109		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 110			flags |= fs_info->avail_system_alloc_bits;
 111		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
 112			flags |= fs_info->avail_metadata_alloc_bits;
 113	} while (read_seqretry(&fs_info->profiles_lock, seq));
 114
 115	return btrfs_reduce_alloc_profile(fs_info, flags);
 116}
 117
 118void btrfs_get_block_group(struct btrfs_block_group *cache)
 119{
 120	refcount_inc(&cache->refs);
 121}
 122
 123void btrfs_put_block_group(struct btrfs_block_group *cache)
 124{
 125	if (refcount_dec_and_test(&cache->refs)) {
 126		WARN_ON(cache->pinned > 0);
 127		/*
 128		 * If there was a failure to cleanup a log tree, very likely due
 129		 * to an IO failure on a writeback attempt of one or more of its
 130		 * extent buffers, we could not do proper (and cheap) unaccounting
 131		 * of their reserved space, so don't warn on reserved > 0 in that
 132		 * case.
 133		 */
 134		if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
 135		    !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
 136			WARN_ON(cache->reserved > 0);
 137
 138		/*
 139		 * A block_group shouldn't be on the discard_list anymore.
 140		 * Remove the block_group from the discard_list to prevent us
 141		 * from causing a panic due to NULL pointer dereference.
 142		 */
 143		if (WARN_ON(!list_empty(&cache->discard_list)))
 144			btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 145						  cache);
 146
 147		/*
 148		 * If not empty, someone is still holding mutex of
 149		 * full_stripe_lock, which can only be released by caller.
 150		 * And it will definitely cause use-after-free when caller
 151		 * tries to release full stripe lock.
 152		 *
 153		 * No better way to resolve, but only to warn.
 154		 */
 155		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 156		kfree(cache->free_space_ctl);
 157		kfree(cache->physical_map);
 158		kfree(cache);
 159	}
 160}
 161
 162/*
 163 * This adds the block group to the fs_info rb tree for the block group cache
 164 */
 165static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 166				       struct btrfs_block_group *block_group)
 167{
 168	struct rb_node **p;
 169	struct rb_node *parent = NULL;
 170	struct btrfs_block_group *cache;
 171	bool leftmost = true;
 172
 173	ASSERT(block_group->length != 0);
 174
 175	write_lock(&info->block_group_cache_lock);
 176	p = &info->block_group_cache_tree.rb_root.rb_node;
 177
 178	while (*p) {
 179		parent = *p;
 180		cache = rb_entry(parent, struct btrfs_block_group, cache_node);
 181		if (block_group->start < cache->start) {
 182			p = &(*p)->rb_left;
 183		} else if (block_group->start > cache->start) {
 184			p = &(*p)->rb_right;
 185			leftmost = false;
 186		} else {
 187			write_unlock(&info->block_group_cache_lock);
 188			return -EEXIST;
 189		}
 190	}
 191
 192	rb_link_node(&block_group->cache_node, parent, p);
 193	rb_insert_color_cached(&block_group->cache_node,
 194			       &info->block_group_cache_tree, leftmost);
 195
 196	write_unlock(&info->block_group_cache_lock);
 197
 198	return 0;
 199}
 200
 201/*
 202 * This will return the block group at or after bytenr if contains is 0, else
 203 * it will return the block group that contains the bytenr
 204 */
 205static struct btrfs_block_group *block_group_cache_tree_search(
 206		struct btrfs_fs_info *info, u64 bytenr, int contains)
 207{
 208	struct btrfs_block_group *cache, *ret = NULL;
 209	struct rb_node *n;
 210	u64 end, start;
 211
 212	read_lock(&info->block_group_cache_lock);
 213	n = info->block_group_cache_tree.rb_root.rb_node;
 214
 215	while (n) {
 216		cache = rb_entry(n, struct btrfs_block_group, cache_node);
 217		end = cache->start + cache->length - 1;
 218		start = cache->start;
 219
 220		if (bytenr < start) {
 221			if (!contains && (!ret || start < ret->start))
 222				ret = cache;
 223			n = n->rb_left;
 224		} else if (bytenr > start) {
 225			if (contains && bytenr <= end) {
 226				ret = cache;
 227				break;
 228			}
 229			n = n->rb_right;
 230		} else {
 231			ret = cache;
 232			break;
 233		}
 234	}
 235	if (ret)
 236		btrfs_get_block_group(ret);
 237	read_unlock(&info->block_group_cache_lock);
 238
 239	return ret;
 240}
 241
 242/*
 243 * Return the block group that starts at or after bytenr
 244 */
 245struct btrfs_block_group *btrfs_lookup_first_block_group(
 246		struct btrfs_fs_info *info, u64 bytenr)
 247{
 248	return block_group_cache_tree_search(info, bytenr, 0);
 249}
 250
 251/*
 252 * Return the block group that contains the given bytenr
 253 */
 254struct btrfs_block_group *btrfs_lookup_block_group(
 255		struct btrfs_fs_info *info, u64 bytenr)
 256{
 257	return block_group_cache_tree_search(info, bytenr, 1);
 258}
 259
 260struct btrfs_block_group *btrfs_next_block_group(
 261		struct btrfs_block_group *cache)
 262{
 263	struct btrfs_fs_info *fs_info = cache->fs_info;
 264	struct rb_node *node;
 265
 266	read_lock(&fs_info->block_group_cache_lock);
 267
 268	/* If our block group was removed, we need a full search. */
 269	if (RB_EMPTY_NODE(&cache->cache_node)) {
 270		const u64 next_bytenr = cache->start + cache->length;
 271
 272		read_unlock(&fs_info->block_group_cache_lock);
 273		btrfs_put_block_group(cache);
 274		return btrfs_lookup_first_block_group(fs_info, next_bytenr);
 275	}
 276	node = rb_next(&cache->cache_node);
 277	btrfs_put_block_group(cache);
 278	if (node) {
 279		cache = rb_entry(node, struct btrfs_block_group, cache_node);
 280		btrfs_get_block_group(cache);
 281	} else
 282		cache = NULL;
 283	read_unlock(&fs_info->block_group_cache_lock);
 284	return cache;
 285}
 286
 287/**
 288 * Check if we can do a NOCOW write for a given extent.
 289 *
 290 * @fs_info:       The filesystem information object.
 291 * @bytenr:        Logical start address of the extent.
 292 *
 293 * Check if we can do a NOCOW write for the given extent, and increments the
 294 * number of NOCOW writers in the block group that contains the extent, as long
 295 * as the block group exists and it's currently not in read-only mode.
 296 *
 297 * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
 298 *          is responsible for calling btrfs_dec_nocow_writers() later.
 299 *
 300 *          Or NULL if we can not do a NOCOW write
 301 */
 302struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
 303						  u64 bytenr)
 304{
 305	struct btrfs_block_group *bg;
 306	bool can_nocow = true;
 307
 308	bg = btrfs_lookup_block_group(fs_info, bytenr);
 309	if (!bg)
 310		return NULL;
 311
 312	spin_lock(&bg->lock);
 313	if (bg->ro)
 314		can_nocow = false;
 315	else
 316		atomic_inc(&bg->nocow_writers);
 317	spin_unlock(&bg->lock);
 318
 319	if (!can_nocow) {
 320		btrfs_put_block_group(bg);
 321		return NULL;
 322	}
 323
 324	/* No put on block group, done by btrfs_dec_nocow_writers(). */
 325	return bg;
 326}
 327
 328/**
 329 * Decrement the number of NOCOW writers in a block group.
 330 *
 331 * @bg:       The block group.
 332 *
 333 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 334 * and on the block group returned by that call. Typically this is called after
 335 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 336 * relocation.
 337 *
 338 * After this call, the caller should not use the block group anymore. It it wants
 339 * to use it, then it should get a reference on it before calling this function.
 340 */
 341void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
 342{
 343	if (atomic_dec_and_test(&bg->nocow_writers))
 344		wake_up_var(&bg->nocow_writers);
 345
 346	/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
 347	btrfs_put_block_group(bg);
 348}
 349
 350void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
 351{
 352	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
 353}
 354
 355void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 356					const u64 start)
 357{
 358	struct btrfs_block_group *bg;
 359
 360	bg = btrfs_lookup_block_group(fs_info, start);
 361	ASSERT(bg);
 362	if (atomic_dec_and_test(&bg->reservations))
 363		wake_up_var(&bg->reservations);
 364	btrfs_put_block_group(bg);
 365}
 366
 367void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
 368{
 369	struct btrfs_space_info *space_info = bg->space_info;
 370
 371	ASSERT(bg->ro);
 372
 373	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
 374		return;
 375
 376	/*
 377	 * Our block group is read only but before we set it to read only,
 378	 * some task might have had allocated an extent from it already, but it
 379	 * has not yet created a respective ordered extent (and added it to a
 380	 * root's list of ordered extents).
 381	 * Therefore wait for any task currently allocating extents, since the
 382	 * block group's reservations counter is incremented while a read lock
 383	 * on the groups' semaphore is held and decremented after releasing
 384	 * the read access on that semaphore and creating the ordered extent.
 385	 */
 386	down_write(&space_info->groups_sem);
 387	up_write(&space_info->groups_sem);
 388
 389	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
 390}
 391
 392struct btrfs_caching_control *btrfs_get_caching_control(
 393		struct btrfs_block_group *cache)
 394{
 395	struct btrfs_caching_control *ctl;
 396
 397	spin_lock(&cache->lock);
 398	if (!cache->caching_ctl) {
 399		spin_unlock(&cache->lock);
 400		return NULL;
 401	}
 402
 403	ctl = cache->caching_ctl;
 404	refcount_inc(&ctl->count);
 405	spin_unlock(&cache->lock);
 406	return ctl;
 407}
 408
 409void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
 410{
 411	if (refcount_dec_and_test(&ctl->count))
 412		kfree(ctl);
 413}
 414
 415/*
 416 * When we wait for progress in the block group caching, its because our
 417 * allocation attempt failed at least once.  So, we must sleep and let some
 418 * progress happen before we try again.
 419 *
 420 * This function will sleep at least once waiting for new free space to show
 421 * up, and then it will check the block group free space numbers for our min
 422 * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 423 * a free extent of a given size, but this is a good start.
 424 *
 425 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 426 * any of the information in this block group.
 427 */
 428void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
 429					   u64 num_bytes)
 430{
 431	struct btrfs_caching_control *caching_ctl;
 432
 433	caching_ctl = btrfs_get_caching_control(cache);
 434	if (!caching_ctl)
 435		return;
 436
 437	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
 438		   (cache->free_space_ctl->free_space >= num_bytes));
 439
 440	btrfs_put_caching_control(caching_ctl);
 441}
 442
 443int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
 444{
 445	struct btrfs_caching_control *caching_ctl;
 446	int ret = 0;
 447
 448	caching_ctl = btrfs_get_caching_control(cache);
 449	if (!caching_ctl)
 450		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 451
 452	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
 453	if (cache->cached == BTRFS_CACHE_ERROR)
 454		ret = -EIO;
 455	btrfs_put_caching_control(caching_ctl);
 456	return ret;
 457}
 458
 459static bool space_cache_v1_done(struct btrfs_block_group *cache)
 460{
 461	bool ret;
 462
 463	spin_lock(&cache->lock);
 464	ret = cache->cached != BTRFS_CACHE_FAST;
 465	spin_unlock(&cache->lock);
 466
 467	return ret;
 468}
 469
 470void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
 471				struct btrfs_caching_control *caching_ctl)
 472{
 473	wait_event(caching_ctl->wait, space_cache_v1_done(cache));
 474}
 475
 476#ifdef CONFIG_BTRFS_DEBUG
 477static void fragment_free_space(struct btrfs_block_group *block_group)
 478{
 479	struct btrfs_fs_info *fs_info = block_group->fs_info;
 480	u64 start = block_group->start;
 481	u64 len = block_group->length;
 482	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 483		fs_info->nodesize : fs_info->sectorsize;
 484	u64 step = chunk << 1;
 485
 486	while (len > chunk) {
 487		btrfs_remove_free_space(block_group, start, chunk);
 488		start += step;
 489		if (len < step)
 490			len = 0;
 491		else
 492			len -= step;
 493	}
 494}
 495#endif
 496
 497/*
 498 * This is only called by btrfs_cache_block_group, since we could have freed
 499 * extents we need to check the pinned_extents for any extents that can't be
 500 * used yet since their free space will be released as soon as the transaction
 501 * commits.
 502 */
 503u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
 504{
 505	struct btrfs_fs_info *info = block_group->fs_info;
 506	u64 extent_start, extent_end, size, total_added = 0;
 507	int ret;
 508
 509	while (start < end) {
 510		ret = find_first_extent_bit(&info->excluded_extents, start,
 511					    &extent_start, &extent_end,
 512					    EXTENT_DIRTY | EXTENT_UPTODATE,
 513					    NULL);
 514		if (ret)
 515			break;
 516
 517		if (extent_start <= start) {
 518			start = extent_end + 1;
 519		} else if (extent_start > start && extent_start < end) {
 520			size = extent_start - start;
 521			total_added += size;
 522			ret = btrfs_add_free_space_async_trimmed(block_group,
 523								 start, size);
 524			BUG_ON(ret); /* -ENOMEM or logic error */
 525			start = extent_end + 1;
 526		} else {
 527			break;
 528		}
 529	}
 530
 531	if (start < end) {
 532		size = end - start;
 533		total_added += size;
 534		ret = btrfs_add_free_space_async_trimmed(block_group, start,
 535							 size);
 536		BUG_ON(ret); /* -ENOMEM or logic error */
 537	}
 538
 539	return total_added;
 540}
 541
 542static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 543{
 544	struct btrfs_block_group *block_group = caching_ctl->block_group;
 545	struct btrfs_fs_info *fs_info = block_group->fs_info;
 546	struct btrfs_root *extent_root;
 547	struct btrfs_path *path;
 548	struct extent_buffer *leaf;
 549	struct btrfs_key key;
 550	u64 total_found = 0;
 551	u64 last = 0;
 552	u32 nritems;
 553	int ret;
 554	bool wakeup = true;
 555
 556	path = btrfs_alloc_path();
 557	if (!path)
 558		return -ENOMEM;
 559
 560	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
 561	extent_root = btrfs_extent_root(fs_info, last);
 562
 563#ifdef CONFIG_BTRFS_DEBUG
 564	/*
 565	 * If we're fragmenting we don't want to make anybody think we can
 566	 * allocate from this block group until we've had a chance to fragment
 567	 * the free space.
 568	 */
 569	if (btrfs_should_fragment_free_space(block_group))
 570		wakeup = false;
 571#endif
 572	/*
 573	 * We don't want to deadlock with somebody trying to allocate a new
 574	 * extent for the extent root while also trying to search the extent
 575	 * root to add free space.  So we skip locking and search the commit
 576	 * root, since its read-only
 577	 */
 578	path->skip_locking = 1;
 579	path->search_commit_root = 1;
 580	path->reada = READA_FORWARD;
 581
 582	key.objectid = last;
 583	key.offset = 0;
 584	key.type = BTRFS_EXTENT_ITEM_KEY;
 585
 586next:
 587	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 588	if (ret < 0)
 589		goto out;
 590
 591	leaf = path->nodes[0];
 592	nritems = btrfs_header_nritems(leaf);
 593
 594	while (1) {
 595		if (btrfs_fs_closing(fs_info) > 1) {
 596			last = (u64)-1;
 597			break;
 598		}
 599
 600		if (path->slots[0] < nritems) {
 601			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 602		} else {
 603			ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
 604			if (ret)
 605				break;
 606
 607			if (need_resched() ||
 608			    rwsem_is_contended(&fs_info->commit_root_sem)) {
 609				if (wakeup)
 610					caching_ctl->progress = last;
 611				btrfs_release_path(path);
 612				up_read(&fs_info->commit_root_sem);
 613				mutex_unlock(&caching_ctl->mutex);
 614				cond_resched();
 615				mutex_lock(&caching_ctl->mutex);
 616				down_read(&fs_info->commit_root_sem);
 617				goto next;
 618			}
 619
 620			ret = btrfs_next_leaf(extent_root, path);
 621			if (ret < 0)
 622				goto out;
 623			if (ret)
 624				break;
 625			leaf = path->nodes[0];
 626			nritems = btrfs_header_nritems(leaf);
 627			continue;
 628		}
 629
 630		if (key.objectid < last) {
 631			key.objectid = last;
 632			key.offset = 0;
 633			key.type = BTRFS_EXTENT_ITEM_KEY;
 634
 635			if (wakeup)
 636				caching_ctl->progress = last;
 637			btrfs_release_path(path);
 638			goto next;
 639		}
 640
 641		if (key.objectid < block_group->start) {
 642			path->slots[0]++;
 643			continue;
 644		}
 645
 646		if (key.objectid >= block_group->start + block_group->length)
 647			break;
 648
 649		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 650		    key.type == BTRFS_METADATA_ITEM_KEY) {
 651			total_found += add_new_free_space(block_group, last,
 652							  key.objectid);
 653			if (key.type == BTRFS_METADATA_ITEM_KEY)
 654				last = key.objectid +
 655					fs_info->nodesize;
 656			else
 657				last = key.objectid + key.offset;
 658
 659			if (total_found > CACHING_CTL_WAKE_UP) {
 660				total_found = 0;
 661				if (wakeup)
 662					wake_up(&caching_ctl->wait);
 663			}
 664		}
 665		path->slots[0]++;
 666	}
 667	ret = 0;
 668
 669	total_found += add_new_free_space(block_group, last,
 670				block_group->start + block_group->length);
 671	caching_ctl->progress = (u64)-1;
 672
 673out:
 674	btrfs_free_path(path);
 675	return ret;
 676}
 677
 678static noinline void caching_thread(struct btrfs_work *work)
 679{
 680	struct btrfs_block_group *block_group;
 681	struct btrfs_fs_info *fs_info;
 682	struct btrfs_caching_control *caching_ctl;
 683	int ret;
 684
 685	caching_ctl = container_of(work, struct btrfs_caching_control, work);
 686	block_group = caching_ctl->block_group;
 687	fs_info = block_group->fs_info;
 688
 689	mutex_lock(&caching_ctl->mutex);
 690	down_read(&fs_info->commit_root_sem);
 691
 692	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 693		ret = load_free_space_cache(block_group);
 694		if (ret == 1) {
 695			ret = 0;
 696			goto done;
 697		}
 698
 699		/*
 700		 * We failed to load the space cache, set ourselves to
 701		 * CACHE_STARTED and carry on.
 702		 */
 703		spin_lock(&block_group->lock);
 704		block_group->cached = BTRFS_CACHE_STARTED;
 705		spin_unlock(&block_group->lock);
 706		wake_up(&caching_ctl->wait);
 707	}
 708
 709	/*
 710	 * If we are in the transaction that populated the free space tree we
 711	 * can't actually cache from the free space tree as our commit root and
 712	 * real root are the same, so we could change the contents of the blocks
 713	 * while caching.  Instead do the slow caching in this case, and after
 714	 * the transaction has committed we will be safe.
 715	 */
 716	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
 717	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
 718		ret = load_free_space_tree(caching_ctl);
 719	else
 720		ret = load_extent_tree_free(caching_ctl);
 721done:
 722	spin_lock(&block_group->lock);
 723	block_group->caching_ctl = NULL;
 724	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 725	spin_unlock(&block_group->lock);
 726
 727#ifdef CONFIG_BTRFS_DEBUG
 728	if (btrfs_should_fragment_free_space(block_group)) {
 729		u64 bytes_used;
 730
 731		spin_lock(&block_group->space_info->lock);
 732		spin_lock(&block_group->lock);
 733		bytes_used = block_group->length - block_group->used;
 734		block_group->space_info->bytes_used += bytes_used >> 1;
 735		spin_unlock(&block_group->lock);
 736		spin_unlock(&block_group->space_info->lock);
 737		fragment_free_space(block_group);
 738	}
 739#endif
 740
 741	caching_ctl->progress = (u64)-1;
 742
 743	up_read(&fs_info->commit_root_sem);
 744	btrfs_free_excluded_extents(block_group);
 745	mutex_unlock(&caching_ctl->mutex);
 746
 747	wake_up(&caching_ctl->wait);
 748
 749	btrfs_put_caching_control(caching_ctl);
 750	btrfs_put_block_group(block_group);
 751}
 752
 753int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
 754{
 755	DEFINE_WAIT(wait);
 756	struct btrfs_fs_info *fs_info = cache->fs_info;
 757	struct btrfs_caching_control *caching_ctl = NULL;
 758	int ret = 0;
 759
 760	/* Allocator for zoned filesystems does not use the cache at all */
 761	if (btrfs_is_zoned(fs_info))
 762		return 0;
 763
 764	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 765	if (!caching_ctl)
 766		return -ENOMEM;
 767
 768	INIT_LIST_HEAD(&caching_ctl->list);
 769	mutex_init(&caching_ctl->mutex);
 770	init_waitqueue_head(&caching_ctl->wait);
 771	caching_ctl->block_group = cache;
 772	caching_ctl->progress = cache->start;
 773	refcount_set(&caching_ctl->count, 2);
 774	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 775
 776	spin_lock(&cache->lock);
 777	if (cache->cached != BTRFS_CACHE_NO) {
 778		kfree(caching_ctl);
 779
 780		caching_ctl = cache->caching_ctl;
 781		if (caching_ctl)
 782			refcount_inc(&caching_ctl->count);
 783		spin_unlock(&cache->lock);
 784		goto out;
 785	}
 786	WARN_ON(cache->caching_ctl);
 787	cache->caching_ctl = caching_ctl;
 788	if (btrfs_test_opt(fs_info, SPACE_CACHE))
 789		cache->cached = BTRFS_CACHE_FAST;
 790	else
 791		cache->cached = BTRFS_CACHE_STARTED;
 792	cache->has_caching_ctl = 1;
 793	spin_unlock(&cache->lock);
 794
 795	write_lock(&fs_info->block_group_cache_lock);
 796	refcount_inc(&caching_ctl->count);
 797	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 798	write_unlock(&fs_info->block_group_cache_lock);
 799
 800	btrfs_get_block_group(cache);
 801
 802	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 803out:
 804	if (load_cache_only && caching_ctl)
 805		btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
 806	if (caching_ctl)
 807		btrfs_put_caching_control(caching_ctl);
 808
 809	return ret;
 810}
 811
 812static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 813{
 814	u64 extra_flags = chunk_to_extended(flags) &
 815				BTRFS_EXTENDED_PROFILE_MASK;
 816
 817	write_seqlock(&fs_info->profiles_lock);
 818	if (flags & BTRFS_BLOCK_GROUP_DATA)
 819		fs_info->avail_data_alloc_bits &= ~extra_flags;
 820	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 821		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
 822	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 823		fs_info->avail_system_alloc_bits &= ~extra_flags;
 824	write_sequnlock(&fs_info->profiles_lock);
 825}
 826
 827/*
 828 * Clear incompat bits for the following feature(s):
 829 *
 830 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 831 *            in the whole filesystem
 832 *
 833 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 834 */
 835static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
 836{
 837	bool found_raid56 = false;
 838	bool found_raid1c34 = false;
 839
 840	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
 841	    (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
 842	    (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
 843		struct list_head *head = &fs_info->space_info;
 844		struct btrfs_space_info *sinfo;
 845
 846		list_for_each_entry_rcu(sinfo, head, list) {
 847			down_read(&sinfo->groups_sem);
 848			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
 849				found_raid56 = true;
 850			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
 851				found_raid56 = true;
 852			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
 853				found_raid1c34 = true;
 854			if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
 855				found_raid1c34 = true;
 856			up_read(&sinfo->groups_sem);
 857		}
 858		if (!found_raid56)
 859			btrfs_clear_fs_incompat(fs_info, RAID56);
 860		if (!found_raid1c34)
 861			btrfs_clear_fs_incompat(fs_info, RAID1C34);
 862	}
 863}
 864
 865static int remove_block_group_item(struct btrfs_trans_handle *trans,
 866				   struct btrfs_path *path,
 867				   struct btrfs_block_group *block_group)
 868{
 869	struct btrfs_fs_info *fs_info = trans->fs_info;
 870	struct btrfs_root *root;
 871	struct btrfs_key key;
 872	int ret;
 873
 874	root = btrfs_block_group_root(fs_info);
 875	key.objectid = block_group->start;
 876	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 877	key.offset = block_group->length;
 878
 879	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 880	if (ret > 0)
 881		ret = -ENOENT;
 882	if (ret < 0)
 883		return ret;
 884
 885	ret = btrfs_del_item(trans, root, path);
 886	return ret;
 887}
 888
 889int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 890			     u64 group_start, struct extent_map *em)
 891{
 892	struct btrfs_fs_info *fs_info = trans->fs_info;
 893	struct btrfs_path *path;
 894	struct btrfs_block_group *block_group;
 895	struct btrfs_free_cluster *cluster;
 896	struct inode *inode;
 897	struct kobject *kobj = NULL;
 898	int ret;
 899	int index;
 900	int factor;
 901	struct btrfs_caching_control *caching_ctl = NULL;
 902	bool remove_em;
 903	bool remove_rsv = false;
 904
 905	block_group = btrfs_lookup_block_group(fs_info, group_start);
 906	BUG_ON(!block_group);
 907	BUG_ON(!block_group->ro);
 908
 909	trace_btrfs_remove_block_group(block_group);
 910	/*
 911	 * Free the reserved super bytes from this block group before
 912	 * remove it.
 913	 */
 914	btrfs_free_excluded_extents(block_group);
 915	btrfs_free_ref_tree_range(fs_info, block_group->start,
 916				  block_group->length);
 917
 918	index = btrfs_bg_flags_to_raid_index(block_group->flags);
 919	factor = btrfs_bg_type_to_factor(block_group->flags);
 920
 921	/* make sure this block group isn't part of an allocation cluster */
 922	cluster = &fs_info->data_alloc_cluster;
 923	spin_lock(&cluster->refill_lock);
 924	btrfs_return_cluster_to_free_space(block_group, cluster);
 925	spin_unlock(&cluster->refill_lock);
 926
 927	/*
 928	 * make sure this block group isn't part of a metadata
 929	 * allocation cluster
 930	 */
 931	cluster = &fs_info->meta_alloc_cluster;
 932	spin_lock(&cluster->refill_lock);
 933	btrfs_return_cluster_to_free_space(block_group, cluster);
 934	spin_unlock(&cluster->refill_lock);
 935
 936	btrfs_clear_treelog_bg(block_group);
 937	btrfs_clear_data_reloc_bg(block_group);
 938
 939	path = btrfs_alloc_path();
 940	if (!path) {
 941		ret = -ENOMEM;
 942		goto out;
 943	}
 944
 945	/*
 946	 * get the inode first so any iput calls done for the io_list
 947	 * aren't the final iput (no unlinks allowed now)
 948	 */
 949	inode = lookup_free_space_inode(block_group, path);
 950
 951	mutex_lock(&trans->transaction->cache_write_mutex);
 952	/*
 953	 * Make sure our free space cache IO is done before removing the
 954	 * free space inode
 955	 */
 956	spin_lock(&trans->transaction->dirty_bgs_lock);
 957	if (!list_empty(&block_group->io_list)) {
 958		list_del_init(&block_group->io_list);
 959
 960		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
 961
 962		spin_unlock(&trans->transaction->dirty_bgs_lock);
 963		btrfs_wait_cache_io(trans, block_group, path);
 964		btrfs_put_block_group(block_group);
 965		spin_lock(&trans->transaction->dirty_bgs_lock);
 966	}
 967
 968	if (!list_empty(&block_group->dirty_list)) {
 969		list_del_init(&block_group->dirty_list);
 970		remove_rsv = true;
 971		btrfs_put_block_group(block_group);
 972	}
 973	spin_unlock(&trans->transaction->dirty_bgs_lock);
 974	mutex_unlock(&trans->transaction->cache_write_mutex);
 975
 976	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
 977	if (ret)
 978		goto out;
 979
 980	write_lock(&fs_info->block_group_cache_lock);
 981	rb_erase_cached(&block_group->cache_node,
 982			&fs_info->block_group_cache_tree);
 983	RB_CLEAR_NODE(&block_group->cache_node);
 984
 985	/* Once for the block groups rbtree */
 986	btrfs_put_block_group(block_group);
 987
 988	write_unlock(&fs_info->block_group_cache_lock);
 989
 990	down_write(&block_group->space_info->groups_sem);
 991	/*
 992	 * we must use list_del_init so people can check to see if they
 993	 * are still on the list after taking the semaphore
 994	 */
 995	list_del_init(&block_group->list);
 996	if (list_empty(&block_group->space_info->block_groups[index])) {
 997		kobj = block_group->space_info->block_group_kobjs[index];
 998		block_group->space_info->block_group_kobjs[index] = NULL;
 999		clear_avail_alloc_bits(fs_info, block_group->flags);
1000	}
1001	up_write(&block_group->space_info->groups_sem);
1002	clear_incompat_bg_bits(fs_info, block_group->flags);
1003	if (kobj) {
1004		kobject_del(kobj);
1005		kobject_put(kobj);
1006	}
1007
1008	if (block_group->has_caching_ctl)
1009		caching_ctl = btrfs_get_caching_control(block_group);
1010	if (block_group->cached == BTRFS_CACHE_STARTED)
1011		btrfs_wait_block_group_cache_done(block_group);
1012	if (block_group->has_caching_ctl) {
1013		write_lock(&fs_info->block_group_cache_lock);
1014		if (!caching_ctl) {
1015			struct btrfs_caching_control *ctl;
1016
1017			list_for_each_entry(ctl,
1018				    &fs_info->caching_block_groups, list)
1019				if (ctl->block_group == block_group) {
1020					caching_ctl = ctl;
1021					refcount_inc(&caching_ctl->count);
1022					break;
1023				}
1024		}
1025		if (caching_ctl)
1026			list_del_init(&caching_ctl->list);
1027		write_unlock(&fs_info->block_group_cache_lock);
1028		if (caching_ctl) {
1029			/* Once for the caching bgs list and once for us. */
1030			btrfs_put_caching_control(caching_ctl);
1031			btrfs_put_caching_control(caching_ctl);
1032		}
1033	}
1034
1035	spin_lock(&trans->transaction->dirty_bgs_lock);
1036	WARN_ON(!list_empty(&block_group->dirty_list));
1037	WARN_ON(!list_empty(&block_group->io_list));
1038	spin_unlock(&trans->transaction->dirty_bgs_lock);
1039
1040	btrfs_remove_free_space_cache(block_group);
1041
1042	spin_lock(&block_group->space_info->lock);
1043	list_del_init(&block_group->ro_list);
1044
1045	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1046		WARN_ON(block_group->space_info->total_bytes
1047			< block_group->length);
1048		WARN_ON(block_group->space_info->bytes_readonly
1049			< block_group->length - block_group->zone_unusable);
1050		WARN_ON(block_group->space_info->bytes_zone_unusable
1051			< block_group->zone_unusable);
1052		WARN_ON(block_group->space_info->disk_total
1053			< block_group->length * factor);
1054	}
1055	block_group->space_info->total_bytes -= block_group->length;
1056	block_group->space_info->bytes_readonly -=
1057		(block_group->length - block_group->zone_unusable);
1058	block_group->space_info->bytes_zone_unusable -=
1059		block_group->zone_unusable;
1060	block_group->space_info->disk_total -= block_group->length * factor;
1061
1062	spin_unlock(&block_group->space_info->lock);
1063
1064	/*
1065	 * Remove the free space for the block group from the free space tree
1066	 * and the block group's item from the extent tree before marking the
1067	 * block group as removed. This is to prevent races with tasks that
1068	 * freeze and unfreeze a block group, this task and another task
1069	 * allocating a new block group - the unfreeze task ends up removing
1070	 * the block group's extent map before the task calling this function
1071	 * deletes the block group item from the extent tree, allowing for
1072	 * another task to attempt to create another block group with the same
1073	 * item key (and failing with -EEXIST and a transaction abort).
1074	 */
1075	ret = remove_block_group_free_space(trans, block_group);
1076	if (ret)
1077		goto out;
1078
1079	ret = remove_block_group_item(trans, path, block_group);
1080	if (ret < 0)
1081		goto out;
1082
1083	spin_lock(&block_group->lock);
1084	block_group->removed = 1;
1085	/*
1086	 * At this point trimming or scrub can't start on this block group,
1087	 * because we removed the block group from the rbtree
1088	 * fs_info->block_group_cache_tree so no one can't find it anymore and
1089	 * even if someone already got this block group before we removed it
1090	 * from the rbtree, they have already incremented block_group->frozen -
1091	 * if they didn't, for the trimming case they won't find any free space
1092	 * entries because we already removed them all when we called
1093	 * btrfs_remove_free_space_cache().
1094	 *
1095	 * And we must not remove the extent map from the fs_info->mapping_tree
1096	 * to prevent the same logical address range and physical device space
1097	 * ranges from being reused for a new block group. This is needed to
1098	 * avoid races with trimming and scrub.
1099	 *
1100	 * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1101	 * completely transactionless, so while it is trimming a range the
1102	 * currently running transaction might finish and a new one start,
1103	 * allowing for new block groups to be created that can reuse the same
1104	 * physical device locations unless we take this special care.
1105	 *
1106	 * There may also be an implicit trim operation if the file system
1107	 * is mounted with -odiscard. The same protections must remain
1108	 * in place until the extents have been discarded completely when
1109	 * the transaction commit has completed.
1110	 */
1111	remove_em = (atomic_read(&block_group->frozen) == 0);
1112	spin_unlock(&block_group->lock);
1113
1114	if (remove_em) {
1115		struct extent_map_tree *em_tree;
1116
1117		em_tree = &fs_info->mapping_tree;
1118		write_lock(&em_tree->lock);
1119		remove_extent_mapping(em_tree, em);
1120		write_unlock(&em_tree->lock);
1121		/* once for the tree */
1122		free_extent_map(em);
1123	}
1124
1125out:
1126	/* Once for the lookup reference */
1127	btrfs_put_block_group(block_group);
1128	if (remove_rsv)
1129		btrfs_delayed_refs_rsv_release(fs_info, 1);
1130	btrfs_free_path(path);
1131	return ret;
1132}
1133
1134struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1135		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1136{
1137	struct btrfs_root *root = btrfs_block_group_root(fs_info);
1138	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1139	struct extent_map *em;
1140	struct map_lookup *map;
1141	unsigned int num_items;
1142
1143	read_lock(&em_tree->lock);
1144	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1145	read_unlock(&em_tree->lock);
1146	ASSERT(em && em->start == chunk_offset);
1147
1148	/*
1149	 * We need to reserve 3 + N units from the metadata space info in order
1150	 * to remove a block group (done at btrfs_remove_chunk() and at
1151	 * btrfs_remove_block_group()), which are used for:
1152	 *
1153	 * 1 unit for adding the free space inode's orphan (located in the tree
1154	 * of tree roots).
1155	 * 1 unit for deleting the block group item (located in the extent
1156	 * tree).
1157	 * 1 unit for deleting the free space item (located in tree of tree
1158	 * roots).
1159	 * N units for deleting N device extent items corresponding to each
1160	 * stripe (located in the device tree).
1161	 *
1162	 * In order to remove a block group we also need to reserve units in the
1163	 * system space info in order to update the chunk tree (update one or
1164	 * more device items and remove one chunk item), but this is done at
1165	 * btrfs_remove_chunk() through a call to check_system_chunk().
1166	 */
1167	map = em->map_lookup;
1168	num_items = 3 + map->num_stripes;
1169	free_extent_map(em);
1170
1171	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
1172}
1173
/*
 * Mark block group @cache read-only, so later write won't happen to block
 * group @cache.
 *
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 * If @force is not set, this function will only mark the block group readonly
 * if the space info can cope with losing the block group's unallocated bytes:
 * for a space info with the DATA flag the used bytes plus those unallocated
 * bytes must still fit in total_bytes, for any other space info the decision
 * is left to btrfs_can_overcommit().
 *
 * If the block group is already read-only, only its ro counter is increased.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 *
 * Return: 0 on success, -ETXTBSY if @cache has swap_extents set, -ENOSPC if
 * the free space check failed.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
	int ret = -ENOSPC;

	/* The space info lock is taken first, the block group lock nests in it */
	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);

	if (cache->swap_extents) {
		ret = -ETXTBSY;
		goto out;
	}

	/* Already read-only: just count one more holder, accounting is done */
	if (cache->ro) {
		cache->ro++;
		ret = 0;
		goto out;
	}

	/* Bytes of the block group not accounted to any other counter */
	num_bytes = cache->length - cache->reserved - cache->pinned -
		    cache->bytes_super - cache->zone_unusable - cache->used;

	/*
	 * Data never overcommits, even in mixed mode, so do just the straight
	 * check of left over space in how much we have allocated.
	 */
	if (force) {
		ret = 0;
	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
		u64 sinfo_used = btrfs_space_info_used(sinfo, true);

		/*
		 * Here we make sure if we mark this bg RO, we still have enough
		 * free space as buffer.
		 */
		if (sinfo_used + num_bytes <= sinfo->total_bytes)
			ret = 0;
	} else {
		/*
		 * We overcommit metadata, so we need to do the
		 * btrfs_can_overcommit check here, and we need to pass in
		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
		 * leeway to allow us to mark this block group as read only.
		 */
		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
					 BTRFS_RESERVE_NO_FLUSH))
			ret = 0;
	}

	if (!ret) {
		/* First holder: move the unallocated bytes to bytes_readonly */
		sinfo->bytes_readonly += num_bytes;
		if (btrfs_is_zoned(cache->fs_info)) {
			/* Migrate zone_unusable bytes to readonly */
			sinfo->bytes_readonly += cache->zone_unusable;
			sinfo->bytes_zone_unusable -= cache->zone_unusable;
			cache->zone_unusable = 0;
		}
		cache->ro++;
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
	}
out:
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
	/* Only the -ENOSPC failure is reported, and only with ENOSPC_DEBUG */
	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
		btrfs_info(cache->fs_info,
			"unable to make block group %llu ro", cache->start);
		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
	}
	return ret;
}
1258
1259static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1260				 struct btrfs_block_group *bg)
1261{
1262	struct btrfs_fs_info *fs_info = bg->fs_info;
1263	struct btrfs_transaction *prev_trans = NULL;
1264	const u64 start = bg->start;
1265	const u64 end = start + bg->length - 1;
1266	int ret;
1267
1268	spin_lock(&fs_info->trans_lock);
1269	if (trans->transaction->list.prev != &fs_info->trans_list) {
1270		prev_trans = list_last_entry(&trans->transaction->list,
1271					     struct btrfs_transaction, list);
1272		refcount_inc(&prev_trans->use_count);
1273	}
1274	spin_unlock(&fs_info->trans_lock);
1275
1276	/*
1277	 * Hold the unused_bg_unpin_mutex lock to avoid racing with
1278	 * btrfs_finish_extent_commit(). If we are at transaction N, another
1279	 * task might be running finish_extent_commit() for the previous
1280	 * transaction N - 1, and have seen a range belonging to the block
1281	 * group in pinned_extents before we were able to clear the whole block
1282	 * group range from pinned_extents. This means that task can lookup for
1283	 * the block group after we unpinned it from pinned_extents and removed
1284	 * it, leading to a BUG_ON() at unpin_extent_range().
1285	 */
1286	mutex_lock(&fs_info->unused_bg_unpin_mutex);
1287	if (prev_trans) {
1288		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
1289					EXTENT_DIRTY);
1290		if (ret)
1291			goto out;
1292	}
1293
1294	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
1295				EXTENT_DIRTY);
1296out:
1297	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1298	if (prev_trans)
1299		btrfs_put_transaction(prev_trans);
1300
1301	return ret == 0;
1302}
1303
/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 *
 * Does nothing if the filesystem is not flagged BTRFS_FS_OPEN or if
 * fs_info->reclaim_bgs_lock cannot be taken without blocking.
 *
 * Every block group taken off fs_info->unused_bgs has its list reference
 * dropped with btrfs_put_block_group(), whether or not it ends up being
 * removed.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	/* Sampled once; compared against the live option further below */
	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	/*
	 * Long running balances can keep us blocked here for eternity, so
	 * simply skip deletion if we're unable to get the mutex.
	 */
	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
		return;

	/* unused_bgs_lock is held at the top of every loop iteration */
	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		/*
		 * Once an earlier iteration left an error in ret, the rest of
		 * the list is only drained. Block groups of a mixed space
		 * info are never removed here.
		 */
		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		spin_unlock(&fs_info->unused_bgs_lock);

		btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);

		/*
		 * Async discard moves the final block group discard to be prior
		 * to the unused_bgs code path.  Therefore, if it's not fully
		 * trimmed, punt it back to the async discard lists.
		 */
		if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
		    !btrfs_is_free_space_trimmed(block_group)) {
			trace_btrfs_skip_unused_block_group(block_group);
			up_write(&space_info->groups_sem);
			/* Requeue if we failed because of async discard */
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto next;
		}

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->pinned ||
		    block_group->used || block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group.  We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&block_group->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			/* Not an error for the loop, just skip this one */
			ret = 0;
			goto next;
		}

		/*
		 * From here on the block group is read-only; every path that
		 * gives up before btrfs_remove_chunk() undoes that with
		 * btrfs_dec_block_group_ro(), except the flip_async one.
		 */
		ret = btrfs_zone_finish(block_group);
		if (ret < 0) {
			btrfs_dec_block_group_ro(block_group);
			/* -EAGAIN is not treated as fatal for the loop */
			if (ret == -EAGAIN)
				ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
						     block_group->start);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		if (!clean_pinned_extents(trans, block_group)) {
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}

		/*
		 * At this point, the block_group is read only and should fail
		 * new allocations.  However, btrfs_finish_extent_commit() can
		 * cause this block_group to be placed back on the discard
		 * lists because now the block_group isn't fully discarded.
		 * Bail here and try again later after discarding everything.
		 */
		spin_lock(&fs_info->discard_ctl.lock);
		if (!list_empty(&block_group->discard_list)) {
			spin_unlock(&fs_info->discard_ctl.lock);
			btrfs_dec_block_group_ro(block_group);
			btrfs_discard_queue_work(&fs_info->discard_ctl,
						 block_group);
			goto end_trans;
		}
		spin_unlock(&fs_info->discard_ctl.lock);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
						     -block_group->pinned);
		space_info->bytes_readonly += block_group->pinned;
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/*
		 * The normal path here is an unused block group is passed here,
		 * then trimming is handled in the transaction commit path.
		 * Async discard interposes before this to do the trimming
		 * before coming down the unused block group path as trimming
		 * will no longer be done later in the transaction commit path.
		 */
		/* DISCARD_ASYNC got enabled after we sampled it at entry */
		if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
			goto flip_async;

		/*
		 * DISCARD can flip during remount. On zoned filesystems, we
		 * need to reset sequential-required zones.
		 */
		trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
				btrfs_is_zoned(fs_info);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_freeze_block_group(block_group);

		/*
		 * Btrfs_remove_chunk will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, block_group->start);

		if (ret) {
			if (trimming)
				btrfs_unfreeze_block_group(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group. Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			/* Reference for the deleted_bgs list */
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans);
next:
		/* Drop the reference that came with the unused_bgs entry */
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	return;

flip_async:
	/*
	 * Reached from inside the loop with unused_bgs_lock not held, so only
	 * the transaction, the mutex and the block group reference have to be
	 * released before handing the list over to the discard code.
	 */
	btrfs_end_transaction(trans);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_put_block_group(block_group);
	btrfs_discard_punt_unused_bgs_list(fs_info);
}
1512
1513void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1514{
1515	struct btrfs_fs_info *fs_info = bg->fs_info;
1516
1517	spin_lock(&fs_info->unused_bgs_lock);
1518	if (list_empty(&bg->bg_list)) {
1519		btrfs_get_block_group(bg);
1520		trace_btrfs_add_unused_block_group(bg);
1521		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1522	}
1523	spin_unlock(&fs_info->unused_bgs_lock);
1524}
1525
1526/*
1527 * We want block groups with a low number of used bytes to be in the beginning
1528 * of the list, so they will get reclaimed first.
1529 */
1530static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
1531			   const struct list_head *b)
1532{
1533	const struct btrfs_block_group *bg1, *bg2;
1534
1535	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1536	bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1537
1538	return bg1->used > bg2->used;
1539}
1540
1541static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
1542{
1543	if (btrfs_is_zoned(fs_info))
1544		return btrfs_zoned_should_reclaim(fs_info);
1545	return true;
1546}
1547
1548void btrfs_reclaim_bgs_work(struct work_struct *work)
1549{
1550	struct btrfs_fs_info *fs_info =
1551		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1552	struct btrfs_block_group *bg;
1553	struct btrfs_space_info *space_info;
1554
1555	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1556		return;
1557
1558	if (!btrfs_should_reclaim(fs_info))
1559		return;
1560
1561	sb_start_write(fs_info->sb);
1562
1563	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
1564		sb_end_write(fs_info->sb);
1565		return;
1566	}
1567
1568	/*
1569	 * Long running balances can keep us blocked here for eternity, so
1570	 * simply skip reclaim if we're unable to get the mutex.
1571	 */
1572	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1573		btrfs_exclop_finish(fs_info);
1574		sb_end_write(fs_info->sb);
1575		return;
1576	}
1577
1578	spin_lock(&fs_info->unused_bgs_lock);
1579	/*
1580	 * Sort happens under lock because we can't simply splice it and sort.
1581	 * The block groups might still be in use and reachable via bg_list,
1582	 * and their presence in the reclaim_bgs list must be preserved.
1583	 */
1584	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
1585	while (!list_empty(&fs_info->reclaim_bgs)) {
1586		u64 zone_unusable;
1587		int ret = 0;
1588
1589		bg = list_first_entry(&fs_info->reclaim_bgs,
1590				      struct btrfs_block_group,
1591				      bg_list);
1592		list_del_init(&bg->bg_list);
1593
1594		space_info = bg->space_info;
1595		spin_unlock(&fs_info->unused_bgs_lock);
1596
1597		/* Don't race with allocators so take the groups_sem */
1598		down_write(&space_info->groups_sem);
1599
1600		spin_lock(&bg->lock);
1601		if (bg->reserved || bg->pinned || bg->ro) {
1602			/*
1603			 * We want to bail if we made new allocations or have
1604			 * outstanding allocations in this block group.  We do
1605			 * the ro check in case balance is currently acting on
1606			 * this block group.
1607			 */
1608			spin_unlock(&bg->lock);
1609			up_write(&space_info->groups_sem);
1610			goto next;
1611		}
1612		spin_unlock(&bg->lock);
1613
1614		/* Get out fast, in case we're unmounting the filesystem */
1615		if (btrfs_fs_closing(fs_info)) {
1616			up_write(&space_info->groups_sem);
1617			goto next;
1618		}
1619
1620		/*
1621		 * Cache the zone_unusable value before turning the block group
1622		 * to read only. As soon as the blog group is read only it's
1623		 * zone_unusable value gets moved to the block group's read-only
1624		 * bytes and isn't available for calculations anymore.
1625		 */
1626		zone_unusable = bg->zone_unusable;
1627		ret = inc_block_group_ro(bg, 0);
1628		up_write(&space_info->groups_sem);
1629		if (ret < 0)
1630			goto next;
1631
1632		btrfs_info(fs_info,
1633			"reclaiming chunk %llu with %llu%% used %llu%% unusable",
1634				bg->start, div_u64(bg->used * 100, bg->length),
1635				div64_u64(zone_unusable * 100, bg->length));
1636		trace_btrfs_reclaim_block_group(bg);
1637		ret = btrfs_relocate_chunk(fs_info, bg->start);
1638		if (ret)
1639			btrfs_err(fs_info, "error relocating chunk %llu",
1640				  bg->start);
1641
1642next:
1643		btrfs_put_block_group(bg);
1644		spin_lock(&fs_info->unused_bgs_lock);
1645	}
1646	spin_unlock(&fs_info->unused_bgs_lock);
1647	mutex_unlock(&fs_info->reclaim_bgs_lock);
1648	btrfs_exclop_finish(fs_info);
1649	sb_end_write(fs_info->sb);
1650}
1651
1652void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
1653{
1654	spin_lock(&fs_info->unused_bgs_lock);
1655	if (!list_empty(&fs_info->reclaim_bgs))
1656		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
1657	spin_unlock(&fs_info->unused_bgs_lock);
1658}
1659
1660void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
1661{
1662	struct btrfs_fs_info *fs_info = bg->fs_info;
1663
1664	spin_lock(&fs_info->unused_bgs_lock);
1665	if (list_empty(&bg->bg_list)) {
1666		btrfs_get_block_group(bg);
1667		trace_btrfs_add_reclaim_block_group(bg);
1668		list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
1669	}
1670	spin_unlock(&fs_info->unused_bgs_lock);
1671}
1672
1673static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
1674			   struct btrfs_path *path)
1675{
1676	struct extent_map_tree *em_tree;
1677	struct extent_map *em;
1678	struct btrfs_block_group_item bg;
1679	struct extent_buffer *leaf;
1680	int slot;
1681	u64 flags;
1682	int ret = 0;
1683
1684	slot = path->slots[0];
1685	leaf = path->nodes[0];
1686
1687	em_tree = &fs_info->mapping_tree;
1688	read_lock(&em_tree->lock);
1689	em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
1690	read_unlock(&em_tree->lock);
1691	if (!em) {
1692		btrfs_err(fs_info,
1693			  "logical %llu len %llu found bg but no related chunk",
1694			  key->objectid, key->offset);
1695		return -ENOENT;
1696	}
1697
1698	if (em->start != key->objectid || em->len != key->offset) {
1699		btrfs_err(fs_info,
1700			"block group %llu len %llu mismatch with chunk %llu len %llu",
1701			key->objectid, key->offset, em->start, em->len);
1702		ret = -EUCLEAN;
1703		goto out_free_em;
1704	}
1705
1706	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
1707			   sizeof(bg));
1708	flags = btrfs_stack_block_group_flags(&bg) &
1709		BTRFS_BLOCK_GROUP_TYPE_MASK;
1710
1711	if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1712		btrfs_err(fs_info,
1713"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1714			  key->objectid, key->offset, flags,
1715			  (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
1716		ret = -EUCLEAN;
1717	}
1718
1719out_free_em:
1720	free_extent_map(em);
1721	return ret;
1722}
1723
1724static int find_first_block_group(struct btrfs_fs_info *fs_info,
1725				  struct btrfs_path *path,
1726				  struct btrfs_key *key)
1727{
1728	struct btrfs_root *root = btrfs_block_group_root(fs_info);
1729	int ret;
1730	struct btrfs_key found_key;
1731
1732	btrfs_for_each_slot(root, key, &found_key, path, ret) {
1733		if (found_key.objectid >= key->objectid &&
1734		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1735			return read_bg_from_eb(fs_info, &found_key, path);
1736		}
1737	}
1738	return ret;
1739}
1740
1741static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1742{
1743	u64 extra_flags = chunk_to_extended(flags) &
1744				BTRFS_EXTENDED_PROFILE_MASK;
1745
1746	write_seqlock(&fs_info->profiles_lock);
1747	if (flags & BTRFS_BLOCK_GROUP_DATA)
1748		fs_info->avail_data_alloc_bits |= extra_flags;
1749	if (flags & BTRFS_BLOCK_GROUP_METADATA)
1750		fs_info->avail_metadata_alloc_bits |= extra_flags;
1751	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1752		fs_info->avail_system_alloc_bits |= extra_flags;
1753	write_sequnlock(&fs_info->profiles_lock);
1754}
1755
/**
 * Map a physical disk address to a list of logical addresses
 *
 * @fs_info:       the filesystem
 * @chunk_start:   logical address of block group
 * @bdev:	   physical device to resolve, can be NULL to indicate any device
 * @physical:	   physical address to map to logical addresses
 * @logical:	   return array of logical addresses which map to @physical
 * @naddrs:	   length of @logical
 * @stripe_len:    size of IO stripe for the given block group
 *
 * Maps a particular @physical disk address to a list of @logical addresses.
 * Used primarily to exclude those portions of a block group that contain super
 * block copies.
 *
 * On success @logical points to a newly allocated array (it is allocated even
 * when @naddrs ends up 0) that the caller has to release with kfree(). On
 * failure none of @logical, @naddrs and @stripe_len is written.
 *
 * Return: 0 on success, -EIO if the chunk map cannot be found, -ENOMEM if the
 * result array cannot be allocated.
 */
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
		     struct block_device *bdev, u64 physical, u64 **logical,
		     int *naddrs, int *stripe_len)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 *buf;
	u64 bytenr;
	u64 data_stripe_length;
	u64 io_stripe_size;
	int i, nr = 0;
	int ret = 0;

	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
	if (IS_ERR(em))
		return -EIO;

	map = em->map_lookup;
	data_stripe_length = em->orig_block_len;
	io_stripe_size = map->stripe_len;
	/* From here on work with the start of the chunk that was found */
	chunk_start = em->start;

	/* For RAID5/6 adjust to a full IO stripe length */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		io_stripe_size = map->stripe_len * nr_data_stripes(map);

	/* At most one logical address per stripe can be produced */
	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
	if (!buf) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		bool already_inserted = false;
		u64 stripe_nr;
		u64 offset;
		int j;

		/* Skip stripes whose device range does not hold @physical */
		if (!in_range(physical, map->stripes[i].physical,
			      data_stripe_length))
			continue;

		if (bdev && map->stripes[i].dev->bdev != bdev)
			continue;

		/*
		 * Split the distance from the stripe's start into a stripe
		 * number and the byte offset inside that stripe.
		 */
		stripe_nr = physical - map->stripes[i].physical;
		stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);

		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
			stripe_nr = stripe_nr * map->num_stripes + i;
			stripe_nr = div_u64(stripe_nr, map->sub_stripes);
		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
			stripe_nr = stripe_nr * map->num_stripes + i;
		}
		/*
		 * The remaining case would be for RAID56, multiply by
		 * nr_data_stripes().  Alternatively, just use io_stripe_size
		 * below instead of map->stripe_len
		 */

		bytenr = chunk_start + stripe_nr * io_stripe_size + offset;

		/* Ensure we don't add duplicate addresses */
		for (j = 0; j < nr; j++) {
			if (buf[j] == bytenr) {
				already_inserted = true;
				break;
			}
		}

		if (!already_inserted)
			buf[nr++] = bytenr;
	}

	*logical = buf;
	*naddrs = nr;
	*stripe_len = io_stripe_size;
out:
	/* Drop the reference obtained from btrfs_get_chunk_map() */
	free_extent_map(em);
	return ret;
}
1852
1853static int exclude_super_stripes(struct btrfs_block_group *cache)
1854{
1855	struct btrfs_fs_info *fs_info = cache->fs_info;
1856	const bool zoned = btrfs_is_zoned(fs_info);
1857	u64 bytenr;
1858	u64 *logical;
1859	int stripe_len;
1860	int i, nr, ret;
1861
1862	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
1863		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
1864		cache->bytes_super += stripe_len;
1865		ret = btrfs_add_excluded_extent(fs_info, cache->start,
1866						stripe_len);
1867		if (ret)
1868			return ret;
1869	}
1870
1871	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1872		bytenr = btrfs_sb_offset(i);
1873		ret = btrfs_rmap_block(fs_info, cache->start, NULL,
1874				       bytenr, &logical, &nr, &stripe_len);
1875		if (ret)
1876			return ret;
1877
1878		/* Shouldn't have super stripes in sequential zones */
1879		if (zoned && nr) {
1880			btrfs_err(fs_info,
1881			"zoned: block group %llu must not contain super block",
1882				  cache->start);
1883			return -EUCLEAN;
1884		}
1885
1886		while (nr--) {
1887			u64 len = min_t(u64, stripe_len,
1888				cache->start + cache->length - logical[nr]);
1889
1890			cache->bytes_super += len;
1891			ret = btrfs_add_excluded_extent(fs_info, logical[nr],
1892							len);
1893			if (ret) {
1894				kfree(logical);
1895				return ret;
1896			}
1897		}
1898
1899		kfree(logical);
1900	}
1901	return 0;
1902}
1903
1904static void link_block_group(struct btrfs_block_group *cache)
1905{
1906	struct btrfs_space_info *space_info = cache->space_info;
1907	int index = btrfs_bg_flags_to_raid_index(cache->flags);
1908
1909	down_write(&space_info->groups_sem);
1910	list_add_tail(&cache->list, &space_info->block_groups[index]);
1911	up_write(&space_info->groups_sem);
1912}
1913
1914static struct btrfs_block_group *btrfs_create_block_group_cache(
1915		struct btrfs_fs_info *fs_info, u64 start)
1916{
1917	struct btrfs_block_group *cache;
1918
1919	cache = kzalloc(sizeof(*cache), GFP_NOFS);
1920	if (!cache)
1921		return NULL;
1922
1923	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
1924					GFP_NOFS);
1925	if (!cache->free_space_ctl) {
1926		kfree(cache);
1927		return NULL;
1928	}
1929
1930	cache->start = start;
1931
1932	cache->fs_info = fs_info;
1933	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1934
1935	cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
1936
1937	refcount_set(&cache->refs, 1);
1938	spin_lock_init(&cache->lock);
1939	init_rwsem(&cache->data_rwsem);
1940	INIT_LIST_HEAD(&cache->list);
1941	INIT_LIST_HEAD(&cache->cluster_list);
1942	INIT_LIST_HEAD(&cache->bg_list);
1943	INIT_LIST_HEAD(&cache->ro_list);
1944	INIT_LIST_HEAD(&cache->discard_list);
1945	INIT_LIST_HEAD(&cache->dirty_list);
1946	INIT_LIST_HEAD(&cache->io_list);
1947	INIT_LIST_HEAD(&cache->active_bg_list);
1948	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
1949	atomic_set(&cache->frozen, 0);
1950	mutex_init(&cache->free_space_lock);
1951	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
1952
1953	return cache;
1954}
1955
1956/*
1957 * Iterate all chunks and verify that each of them has the corresponding block
1958 * group
1959 */
1960static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1961{
1962	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1963	struct extent_map *em;
1964	struct btrfs_block_group *bg;
1965	u64 start = 0;
1966	int ret = 0;
1967
1968	while (1) {
1969		read_lock(&map_tree->lock);
1970		/*
1971		 * lookup_extent_mapping will return the first extent map
1972		 * intersecting the range, so setting @len to 1 is enough to
1973		 * get the first chunk.
1974		 */
1975		em = lookup_extent_mapping(map_tree, start, 1);
1976		read_unlock(&map_tree->lock);
1977		if (!em)
1978			break;
1979
1980		bg = btrfs_lookup_block_group(fs_info, em->start);
1981		if (!bg) {
1982			btrfs_err(fs_info,
1983	"chunk start=%llu len=%llu doesn't have corresponding block group",
1984				     em->start, em->len);
1985			ret = -EUCLEAN;
1986			free_extent_map(em);
1987			break;
1988		}
1989		if (bg->start != em->start || bg->length != em->len ||
1990		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1991		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1992			btrfs_err(fs_info,
1993"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1994				em->start, em->len,
1995				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
1996				bg->start, bg->length,
1997				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1998			ret = -EUCLEAN;
1999			free_extent_map(em);
2000			btrfs_put_block_group(bg);
2001			break;
2002		}
2003		start = em->start + em->len;
2004		free_extent_map(em);
2005		btrfs_put_block_group(bg);
2006	}
2007	return ret;
2008}
2009
2010static int read_one_block_group(struct btrfs_fs_info *info,
2011				struct btrfs_block_group_item *bgi,
2012				const struct btrfs_key *key,
2013				int need_clear)
2014{
2015	struct btrfs_block_group *cache;
2016	struct btrfs_space_info *space_info;
2017	const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
2018	int ret;
2019
2020	ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
2021
2022	cache = btrfs_create_block_group_cache(info, key->objectid);
2023	if (!cache)
2024		return -ENOMEM;
2025
2026	cache->length = key->offset;
2027	cache->used = btrfs_stack_block_group_used(bgi);
2028	cache->flags = btrfs_stack_block_group_flags(bgi);
2029	cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
2030
2031	set_free_space_tree_thresholds(cache);
2032
2033	if (need_clear) {
2034		/*
2035		 * When we mount with old space cache, we need to
2036		 * set BTRFS_DC_CLEAR and set dirty flag.
2037		 *
2038		 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2039		 *    truncate the old free space cache inode and
2040		 *    setup a new one.
2041		 * b) Setting 'dirty flag' makes sure that we flush
2042		 *    the new space cache info onto disk.
2043		 */
2044		if (btrfs_test_opt(info, SPACE_CACHE))
2045			cache->disk_cache_state = BTRFS_DC_CLEAR;
2046	}
2047	if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2048	    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2049			btrfs_err(info,
2050"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2051				  cache->start);
2052			ret = -EINVAL;
2053			goto error;
2054	}
2055
2056	ret = btrfs_load_block_group_zone_info(cache, false);
2057	if (ret) {
2058		btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2059			  cache->start);
2060		goto error;
2061	}
2062
2063	/*
2064	 * We need to exclude the super stripes now so that the space info has
2065	 * super bytes accounted for, otherwise we'll think we have more space
2066	 * than we actually do.
2067	 */
2068	ret = exclude_super_stripes(cache);
2069	if (ret) {
2070		/* We may have excluded something, so call this just in case. */
2071		btrfs_free_excluded_extents(cache);
2072		goto error;
2073	}
2074
2075	/*
2076	 * For zoned filesystem, space after the allocation offset is the only
2077	 * free space for a block group. So, we don't need any caching work.
2078	 * btrfs_calc_zone_unusable() will set the amount of free space and
2079	 * zone_unusable space.
2080	 *
2081	 * For regular filesystem, check for two cases, either we are full, and
2082	 * therefore don't need to bother with the caching work since we won't
2083	 * find any space, or we are empty, and we can just add all the space
2084	 * in and be done with it.  This saves us _a_lot_ of time, particularly
2085	 * in the full case.
2086	 */
2087	if (btrfs_is_zoned(info)) {
2088		btrfs_calc_zone_unusable(cache);
2089		/* Should not have any excluded extents. Just in case, though. */
2090		btrfs_free_excluded_extents(cache);
2091	} else if (cache->length == cache->used) {
2092		cache->last_byte_to_unpin = (u64)-1;
2093		cache->cached = BTRFS_CACHE_FINISHED;
2094		btrfs_free_excluded_extents(cache);
2095	} else if (cache->used == 0) {
2096		cache->last_byte_to_unpin = (u64)-1;
2097		cache->cached = BTRFS_CACHE_FINISHED;
2098		add_new_free_space(cache, cache->start,
2099				   cache->start + cache->length);
2100		btrfs_free_excluded_extents(cache);
2101	}
2102
2103	ret = btrfs_add_block_group_cache(info, cache);
2104	if (ret) {
2105		btrfs_remove_free_space_cache(cache);
2106		goto error;
2107	}
2108	trace_btrfs_add_block_group(info, cache, 0);
2109	btrfs_update_space_info(info, cache->flags, cache->length,
2110				cache->used, cache->bytes_super,
2111				cache->zone_unusable, &space_info);
2112
2113	cache->space_info = space_info;
2114
2115	link_block_group(cache);
2116
2117	set_avail_alloc_bits(info, cache->flags);
2118	if (btrfs_chunk_writeable(info, cache->start)) {
2119		if (cache->used == 0) {
2120			ASSERT(list_empty(&cache->bg_list));
2121			if (btrfs_test_opt(info, DISCARD_ASYNC))
2122				btrfs_discard_queue_work(&info->discard_ctl, cache);
2123			else
2124				btrfs_mark_bg_unused(cache);
2125		}
2126	} else {
2127		inc_block_group_ro(cache, 1);
2128	}
2129
2130	return 0;
2131error:
2132	btrfs_put_block_group(cache);
2133	return ret;
2134}
2135
2136static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2137{
2138	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
2139	struct btrfs_space_info *space_info;
2140	struct rb_node *node;
2141	int ret = 0;
2142
2143	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
2144		struct extent_map *em;
2145		struct map_lookup *map;
2146		struct btrfs_block_group *bg;
2147
2148		em = rb_entry(node, struct extent_map, rb_node);
2149		map = em->map_lookup;
2150		bg = btrfs_create_block_group_cache(fs_info, em->start);
2151		if (!bg) {
2152			ret = -ENOMEM;
2153			break;
2154		}
2155
2156		/* Fill dummy cache as FULL */
2157		bg->length = em->len;
2158		bg->flags = map->type;
2159		bg->last_byte_to_unpin = (u64)-1;
2160		bg->cached = BTRFS_CACHE_FINISHED;
2161		bg->used = em->len;
2162		bg->flags = map->type;
2163		ret = btrfs_add_block_group_cache(fs_info, bg);
2164		/*
2165		 * We may have some valid block group cache added already, in
2166		 * that case we skip to the next one.
2167		 */
2168		if (ret == -EEXIST) {
2169			ret = 0;
2170			btrfs_put_block_group(bg);
2171			continue;
2172		}
2173
2174		if (ret) {
2175			btrfs_remove_free_space_cache(bg);
2176			btrfs_put_block_group(bg);
2177			break;
2178		}
2179
2180		btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
2181					0, 0, &space_info);
2182		bg->space_info = space_info;
2183		link_block_group(bg);
2184
2185		set_avail_alloc_bits(fs_info, bg->flags);
2186	}
2187	if (!ret)
2188		btrfs_init_global_block_rsv(fs_info);
2189	return ret;
2190}
2191
2192int btrfs_read_block_groups(struct btrfs_fs_info *info)
2193{
2194	struct btrfs_root *root = btrfs_block_group_root(info);
2195	struct btrfs_path *path;
2196	int ret;
2197	struct btrfs_block_group *cache;
2198	struct btrfs_space_info *space_info;
2199	struct btrfs_key key;
2200	int need_clear = 0;
2201	u64 cache_gen;
2202
2203	if (!root)
2204		return fill_dummy_bgs(info);
2205
2206	key.objectid = 0;
2207	key.offset = 0;
2208	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2209	path = btrfs_alloc_path();
2210	if (!path)
2211		return -ENOMEM;
2212
2213	cache_gen = btrfs_super_cache_generation(info->super_copy);
2214	if (btrfs_test_opt(info, SPACE_CACHE) &&
2215	    btrfs_super_generation(info->super_copy) != cache_gen)
2216		need_clear = 1;
2217	if (btrfs_test_opt(info, CLEAR_CACHE))
2218		need_clear = 1;
2219
2220	while (1) {
2221		struct btrfs_block_group_item bgi;
2222		struct extent_buffer *leaf;
2223		int slot;
2224
2225		ret = find_first_block_group(info, path, &key);
2226		if (ret > 0)
2227			break;
2228		if (ret != 0)
2229			goto error;
2230
2231		leaf = path->nodes[0];
2232		slot = path->slots[0];
2233
2234		read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2235				   sizeof(bgi));
2236
2237		btrfs_item_key_to_cpu(leaf, &key, slot);
2238		btrfs_release_path(path);
2239		ret = read_one_block_group(info, &bgi, &key, need_clear);
2240		if (ret < 0)
2241			goto error;
2242		key.objectid += key.offset;
2243		key.offset = 0;
2244	}
2245	btrfs_release_path(path);
2246
2247	list_for_each_entry(space_info, &info->space_info, list) {
2248		int i;
2249
2250		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2251			if (list_empty(&space_info->block_groups[i]))
2252				continue;
2253			cache = list_first_entry(&space_info->block_groups[i],
2254						 struct btrfs_block_group,
2255						 list);
2256			btrfs_sysfs_add_block_group_type(cache);
2257		}
2258
2259		if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2260		      (BTRFS_BLOCK_GROUP_RAID10 |
2261		       BTRFS_BLOCK_GROUP_RAID1_MASK |
2262		       BTRFS_BLOCK_GROUP_RAID56_MASK |
2263		       BTRFS_BLOCK_GROUP_DUP)))
2264			continue;
2265		/*
2266		 * Avoid allocating from un-mirrored block group if there are
2267		 * mirrored block groups.
2268		 */
2269		list_for_each_entry(cache,
2270				&space_info->block_groups[BTRFS_RAID_RAID0],
2271				list)
2272			inc_block_group_ro(cache, 1);
2273		list_for_each_entry(cache,
2274				&space_info->block_groups[BTRFS_RAID_SINGLE],
2275				list)
2276			inc_block_group_ro(cache, 1);
2277	}
2278
2279	btrfs_init_global_block_rsv(info);
2280	ret = check_chunk_block_group_mappings(info);
2281error:
2282	btrfs_free_path(path);
2283	/*
2284	 * We've hit some error while reading the extent tree, and have
2285	 * rescue=ibadroots mount option.
2286	 * Try to fill the tree using dummy block groups so that the user can
2287	 * continue to mount and grab their data.
2288	 */
2289	if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2290		ret = fill_dummy_bgs(info);
2291	return ret;
2292}
2293
2294/*
2295 * This function, insert_block_group_item(), belongs to the phase 2 of chunk
2296 * allocation.
2297 *
2298 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2299 * phases.
2300 */
2301static int insert_block_group_item(struct btrfs_trans_handle *trans,
2302				   struct btrfs_block_group *block_group)
2303{
2304	struct btrfs_fs_info *fs_info = trans->fs_info;
2305	struct btrfs_block_group_item bgi;
2306	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2307	struct btrfs_key key;
2308
2309	spin_lock(&block_group->lock);
2310	btrfs_set_stack_block_group_used(&bgi, block_group->used);
2311	btrfs_set_stack_block_group_chunk_objectid(&bgi,
2312						   block_group->global_root_id);
2313	btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2314	key.objectid = block_group->start;
2315	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2316	key.offset = block_group->length;
2317	spin_unlock(&block_group->lock);
2318
2319	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2320}
2321
2322static int insert_dev_extent(struct btrfs_trans_handle *trans,
2323			    struct btrfs_device *device, u64 chunk_offset,
2324			    u64 start, u64 num_bytes)
2325{
2326	struct btrfs_fs_info *fs_info = device->fs_info;
2327	struct btrfs_root *root = fs_info->dev_root;
2328	struct btrfs_path *path;
2329	struct btrfs_dev_extent *extent;
2330	struct extent_buffer *leaf;
2331	struct btrfs_key key;
2332	int ret;
2333
2334	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2335	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2336	path = btrfs_alloc_path();
2337	if (!path)
2338		return -ENOMEM;
2339
2340	key.objectid = device->devid;
2341	key.type = BTRFS_DEV_EXTENT_KEY;
2342	key.offset = start;
2343	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2344	if (ret)
2345		goto out;
2346
2347	leaf = path->nodes[0];
2348	extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2349	btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2350	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2351					    BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2352	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2353
2354	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
2355	btrfs_mark_buffer_dirty(leaf);
2356out:
2357	btrfs_free_path(path);
2358	return ret;
2359}
2360
2361/*
2362 * This function belongs to phase 2.
2363 *
2364 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2365 * phases.
2366 */
2367static int insert_dev_extents(struct btrfs_trans_handle *trans,
2368				   u64 chunk_offset, u64 chunk_size)
2369{
2370	struct btrfs_fs_info *fs_info = trans->fs_info;
2371	struct btrfs_device *device;
2372	struct extent_map *em;
2373	struct map_lookup *map;
2374	u64 dev_offset;
2375	u64 stripe_size;
2376	int i;
2377	int ret = 0;
2378
2379	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2380	if (IS_ERR(em))
2381		return PTR_ERR(em);
2382
2383	map = em->map_lookup;
2384	stripe_size = em->orig_block_len;
2385
2386	/*
2387	 * Take the device list mutex to prevent races with the final phase of
2388	 * a device replace operation that replaces the device object associated
2389	 * with the map's stripes, because the device object's id can change
2390	 * at any time during that final phase of the device replace operation
2391	 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2392	 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2393	 * resulting in persisting a device extent item with such ID.
2394	 */
2395	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2396	for (i = 0; i < map->num_stripes; i++) {
2397		device = map->stripes[i].dev;
2398		dev_offset = map->stripes[i].physical;
2399
2400		ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
2401				       stripe_size);
2402		if (ret)
2403			break;
2404	}
2405	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2406
2407	free_extent_map(em);
2408	return ret;
2409}
2410
2411/*
2412 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2413 * chunk allocation.
2414 *
2415 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2416 * phases.
2417 */
2418void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2419{
2420	struct btrfs_fs_info *fs_info = trans->fs_info;
2421	struct btrfs_block_group *block_group;
2422	int ret = 0;
2423
2424	while (!list_empty(&trans->new_bgs)) {
2425		int index;
2426
2427		block_group = list_first_entry(&trans->new_bgs,
2428					       struct btrfs_block_group,
2429					       bg_list);
2430		if (ret)
2431			goto next;
2432
2433		index = btrfs_bg_flags_to_raid_index(block_group->flags);
2434
2435		ret = insert_block_group_item(trans, block_group);
2436		if (ret)
2437			btrfs_abort_transaction(trans, ret);
2438		if (!block_group->chunk_item_inserted) {
2439			mutex_lock(&fs_info->chunk_mutex);
2440			ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2441			mutex_unlock(&fs_info->chunk_mutex);
2442			if (ret)
2443				btrfs_abort_transaction(trans, ret);
2444		}
2445		ret = insert_dev_extents(trans, block_group->start,
2446					 block_group->length);
2447		if (ret)
2448			btrfs_abort_transaction(trans, ret);
2449		add_block_group_free_space(trans, block_group);
2450
2451		/*
2452		 * If we restriped during balance, we may have added a new raid
2453		 * type, so now add the sysfs entries when it is safe to do so.
2454		 * We don't have to worry about locking here as it's handled in
2455		 * btrfs_sysfs_add_block_group_type.
2456		 */
2457		if (block_group->space_info->block_group_kobjs[index] == NULL)
2458			btrfs_sysfs_add_block_group_type(block_group);
2459
2460		/* Already aborted the transaction if it failed. */
2461next:
2462		btrfs_delayed_refs_rsv_release(fs_info, 1);
2463		list_del_init(&block_group->bg_list);
2464	}
2465	btrfs_trans_release_chunk_metadata(trans);
2466}
2467
2468/*
2469 * For extent tree v2 we use the block_group_item->chunk_offset to point at our
2470 * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2471 */
2472static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
2473{
2474	u64 div = SZ_1G;
2475	u64 index;
2476
2477	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2478		return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2479
2480	/* If we have a smaller fs index based on 128MiB. */
2481	if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
2482		div = SZ_128M;
2483
2484	offset = div64_u64(offset, div);
2485	div64_u64_rem(offset, fs_info->nr_global_roots, &index);
2486	return index;
2487}
2488
2489struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
2490						 u64 bytes_used, u64 type,
2491						 u64 chunk_offset, u64 size)
2492{
2493	struct btrfs_fs_info *fs_info = trans->fs_info;
2494	struct btrfs_block_group *cache;
2495	int ret;
2496
2497	btrfs_set_log_full_commit(trans);
2498
2499	cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2500	if (!cache)
2501		return ERR_PTR(-ENOMEM);
2502
2503	cache->length = size;
2504	set_free_space_tree_thresholds(cache);
2505	cache->used = bytes_used;
2506	cache->flags = type;
2507	cache->last_byte_to_unpin = (u64)-1;
2508	cache->cached = BTRFS_CACHE_FINISHED;
2509	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
2510
2511	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2512		cache->needs_free_space = 1;
2513
2514	ret = btrfs_load_block_group_zone_info(cache, true);
2515	if (ret) {
2516		btrfs_put_block_group(cache);
2517		return ERR_PTR(ret);
2518	}
2519
2520	ret = exclude_super_stripes(cache);
2521	if (ret) {
2522		/* We may have excluded something, so call this just in case */
2523		btrfs_free_excluded_extents(cache);
2524		btrfs_put_block_group(cache);
2525		return ERR_PTR(ret);
2526	}
2527
2528	add_new_free_space(cache, chunk_offset, chunk_offset + size);
2529
2530	btrfs_free_excluded_extents(cache);
2531
2532#ifdef CONFIG_BTRFS_DEBUG
2533	if (btrfs_should_fragment_free_space(cache)) {
2534		u64 new_bytes_used = size - bytes_used;
2535
2536		bytes_used += new_bytes_used >> 1;
2537		fragment_free_space(cache);
2538	}
2539#endif
2540	/*
2541	 * Ensure the corresponding space_info object is created and
2542	 * assigned to our block group. We want our bg to be added to the rbtree
2543	 * with its ->space_info set.
2544	 */
2545	cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2546	ASSERT(cache->space_info);
2547
2548	ret = btrfs_add_block_group_cache(fs_info, cache);
2549	if (ret) {
2550		btrfs_remove_free_space_cache(cache);
2551		btrfs_put_block_group(cache);
2552		return ERR_PTR(ret);
2553	}
2554
2555	/*
2556	 * Now that our block group has its ->space_info set and is inserted in
2557	 * the rbtree, update the space info's counters.
2558	 */
2559	trace_btrfs_add_block_group(fs_info, cache, 1);
2560	btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
2561				cache->bytes_super, cache->zone_unusable,
2562				&cache->space_info);
2563	btrfs_update_global_block_rsv(fs_info);
2564
2565	link_block_group(cache);
2566
2567	list_add_tail(&cache->bg_list, &trans->new_bgs);
2568	trans->delayed_ref_updates++;
2569	btrfs_update_delayed_refs_rsv(trans);
2570
2571	set_avail_alloc_bits(fs_info, type);
2572	return cache;
2573}
2574
/*
 * Mark one block group RO, can be called several times for the same block
 * group.
 *
 * @cache:		the destination block group
 * @do_chunk_alloc:	whether need to do chunk pre-allocation, this is to
 *			ensure we still have some free space after marking this
 *			block group RO.
 *
 * On a read-only mounted filesystem no transaction is joined and no chunk is
 * allocated; the block group is only marked RO under ro_block_group_mutex.
 *
 * Return the result of marking the block group RO, or a negative errno if
 * joining the transaction, waiting for a commit or allocating a chunk failed.
 */
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
			     bool do_chunk_alloc)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = btrfs_block_group_root(fs_info);
	u64 alloc_flags;
	int ret;
	bool dirty_bg_running;

	/*
	 * This can only happen when we are doing read-only scrub on read-only
	 * mount.
	 * In that case we should not start a new transaction on read-only fs.
	 * Thus here we skip all chunk allocations.
	 */
	if (sb_rdonly(fs_info->sb)) {
		mutex_lock(&fs_info->ro_block_group_mutex);
		ret = inc_block_group_ro(cache, 0);
		mutex_unlock(&fs_info->ro_block_group_mutex);
		return ret;
	}

	/*
	 * Join a transaction whose dirty block group writeout has not started
	 * yet.  The loop is left with ro_block_group_mutex held; it is only
	 * dropped inside the loop when we have to wait for a commit and retry.
	 */
	do {
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		dirty_bg_running = false;

		/*
		 * We're not allowed to set block groups readonly after the dirty
		 * block group cache has started writing.  If it already started,
		 * back off and let this transaction commit.
		 */
		mutex_lock(&fs_info->ro_block_group_mutex);
		if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
			u64 transid = trans->transid;

			mutex_unlock(&fs_info->ro_block_group_mutex);
			btrfs_end_transaction(trans);

			ret = btrfs_wait_for_commit(fs_info, transid);
			if (ret)
				return ret;
			dirty_bg_running = true;
		}
	} while (dirty_bg_running);

	if (do_chunk_alloc) {
		/*
		 * If we are changing raid levels, try to allocate a
		 * corresponding block group with the new raid level.
		 */
		alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
		if (alloc_flags != cache->flags) {
			ret = btrfs_chunk_alloc(trans, alloc_flags,
						CHUNK_ALLOC_FORCE);
			/*
			 * ENOSPC is allowed here, we may have enough space
			 * already allocated at the new raid level to carry on
			 */
			if (ret == -ENOSPC)
				ret = 0;
			if (ret < 0)
				goto out;
		}
	}

	/*
	 * First attempt.  Without chunk allocation, or on -ETXTBSY, the result
	 * is returned as is and the system chunk check below is skipped.
	 */
	ret = inc_block_group_ro(cache, 0);
	if (!do_chunk_alloc || ret == -ETXTBSY)
		goto unlock_out;
	if (!ret)
		goto out;
	/*
	 * The first attempt failed: force the allocation of a chunk with the
	 * profile of the block group's space_info and try one more time.
	 */
	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
	if (ret < 0)
		goto out;
	ret = inc_block_group_ro(cache, 0);
	if (ret == -ETXTBSY)
		goto unlock_out;
out:
	/* For a SYSTEM block group, run the system chunk check under chunk_mutex. */
	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
		alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
		mutex_lock(&fs_info->chunk_mutex);
		check_system_chunk(trans, alloc_flags);
		mutex_unlock(&fs_info->chunk_mutex);
	}
unlock_out:
	/* Pairs with the mutex_lock() of the last iteration of the loop above. */
	mutex_unlock(&fs_info->ro_block_group_mutex);

	btrfs_end_transaction(trans);
	return ret;
}
2678
2679void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
2680{
2681	struct btrfs_space_info *sinfo = cache->space_info;
2682	u64 num_bytes;
2683
2684	BUG_ON(!cache->ro);
2685
2686	spin_lock(&sinfo->lock);
2687	spin_lock(&cache->lock);
2688	if (!--cache->ro) {
2689		if (btrfs_is_zoned(cache->fs_info)) {
2690			/* Migrate zone_unusable bytes back */
2691			cache->zone_unusable =
2692				(cache->alloc_offset - cache->used) +
2693				(cache->length - cache->zone_capacity);
2694			sinfo->bytes_zone_unusable += cache->zone_unusable;
2695			sinfo->bytes_readonly -= cache->zone_unusable;
2696		}
2697		num_bytes = cache->length - cache->reserved -
2698			    cache->pinned - cache->bytes_super -
2699			    cache->zone_unusable - cache->used;
2700		sinfo->bytes_readonly -= num_bytes;
2701		list_del_init(&cache->ro_list);
2702	}
2703	spin_unlock(&cache->lock);
2704	spin_unlock(&sinfo->lock);
2705}
2706
2707static int update_block_group_item(struct btrfs_trans_handle *trans,
2708				   struct btrfs_path *path,
2709				   struct btrfs_block_group *cache)
2710{
2711	struct btrfs_fs_info *fs_info = trans->fs_info;
2712	int ret;
2713	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2714	unsigned long bi;
2715	struct extent_buffer *leaf;
2716	struct btrfs_block_group_item bgi;
2717	struct btrfs_key key;
2718
2719	key.objectid = cache->start;
2720	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2721	key.offset = cache->length;
2722
2723	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2724	if (ret) {
2725		if (ret > 0)
2726			ret = -ENOENT;
2727		goto fail;
2728	}
2729
2730	leaf = path->nodes[0];
2731	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2732	btrfs_set_stack_block_group_used(&bgi, cache->used);
2733	btrfs_set_stack_block_group_chunk_objectid(&bgi,
2734						   cache->global_root_id);
2735	btrfs_set_stack_block_group_flags(&bgi, cache->flags);
2736	write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
2737	btrfs_mark_buffer_dirty(leaf);
2738fail:
2739	btrfs_release_path(path);
2740	return ret;
2741
2742}
2743
/*
 * Prepare the free space cache inode of @block_group for this transaction.
 *
 * Looks up the free space inode (creating it if it does not exist and the
 * block group is not read-only), resets its generation to 0, truncates any
 * existing contents and preallocates space for the cache.
 *
 * On exit through the common path, the block group's disk_cache_state is set
 * to the outcome: BTRFS_DC_SETUP when the cache is ready for this transaction,
 * BTRFS_DC_WRITTEN when there is nothing to write, BTRFS_DC_ERROR otherwise.
 * The early returns below only touch it for small block groups.
 *
 * @path is used for the inode lookup/creation and is released before
 * returning through the common exit path.
 *
 * Return 0 on success or when there is nothing to do, a negative errno
 * otherwise.
 */
static int cache_save_setup(struct btrfs_block_group *block_group,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *root = fs_info->tree_root;
	struct inode *inode = NULL;
	struct extent_changeset *data_reserved = NULL;
	u64 alloc_hint = 0;
	/* State stored in the block group at exit unless changed below. */
	int dcs = BTRFS_DC_ERROR;
	u64 cache_size = 0;
	int retries = 0;
	int ret = 0;

	if (!btrfs_test_opt(fs_info, SPACE_CACHE))
		return 0;

	/*
	 * If this block group is smaller than 100 megs don't bother caching the
	 * block group.
	 */
	if (block_group->length < (100 * SZ_1M)) {
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		spin_unlock(&block_group->lock);
		return 0;
	}

	if (TRANS_ABORTED(trans))
		return 0;
again:
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
		ret = PTR_ERR(inode);
		btrfs_release_path(path);
		goto out;
	}

	/* Only -ENOENT gets here: create the inode and look it up once more. */
	if (IS_ERR(inode)) {
		/* The lookup must not fail again right after the inode was created. */
		BUG_ON(retries);
		retries++;

		if (block_group->ro)
			goto out_free;

		ret = create_free_space_inode(trans, block_group, path);
		if (ret)
			goto out_free;
		goto again;
	}

	/*
	 * We want to set the generation to 0, that way if anything goes wrong
	 * from here on out we know not to trust this cache when we load up next
	 * time.
	 */
	BTRFS_I(inode)->generation = 0;
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	if (ret) {
		/*
		 * So theoretically we could recover from this, simply set the
		 * super cache generation to 0 so we know to invalidate the
		 * cache, but then we'd have to keep track of the block groups
		 * that fail this way so we know we _have_ to reset this cache
		 * before the next commit or risk reading stale cache.  So to
		 * limit our exposure to horrible edge cases lets just abort the
		 * transaction, this only happens in really bad situations
		 * anyway.
		 */
		btrfs_abort_transaction(trans, ret);
		goto out_put;
	}
	/* ret is always 0 here, a non-zero value was handled just above. */
	WARN_ON(ret);

	/* We've already setup this transaction, go ahead and exit */
	if (block_group->cache_generation == trans->transid &&
	    i_size_read(inode)) {
		dcs = BTRFS_DC_SETUP;
		goto out_put;
	}

	/* Drop the contents of an existing cache file before setting up a new one. */
	if (i_size_read(inode) > 0) {
		ret = btrfs_check_trunc_cache_free_space(fs_info,
					&fs_info->global_block_rsv);
		if (ret)
			goto out_put;

		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
		if (ret)
			goto out_put;
	}

	spin_lock(&block_group->lock);
	if (block_group->cached != BTRFS_CACHE_FINISHED ||
	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
		/*
		 * don't bother trying to write stuff out _if_
		 * a) we're not cached,
		 * b) we're with nospace_cache mount option,
		 * c) we're with v2 space_cache (FREE_SPACE_TREE).
		 */
		dcs = BTRFS_DC_WRITTEN;
		spin_unlock(&block_group->lock);
		goto out_put;
	}
	spin_unlock(&block_group->lock);

	/*
	 * We hit an ENOSPC when setting up the cache in this transaction, just
	 * skip doing the setup, we've already cleared the cache so we're safe.
	 */
	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
		ret = -ENOSPC;
		goto out_put;
	}

	/*
	 * Try to preallocate enough space based on how big the block group is.
	 * Keep in mind this has to include any pinned space which could end up
	 * taking up quite a bit since it's not folded into the other space
	 * cache.
	 */
	/* 16 sectors per started 256MiB of block group, at least 16 sectors. */
	cache_size = div_u64(block_group->length, SZ_256M);
	if (!cache_size)
		cache_size = 1;

	cache_size *= 16;
	cache_size *= fs_info->sectorsize;

	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
					  cache_size);
	if (ret)
		goto out_put;

	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
					      cache_size, cache_size,
					      &alloc_hint);
	/*
	 * Our cache requires contiguous chunks so that we don't modify a bunch
	 * of metadata or split extents when writing the cache out, which means
	 * we can enospc if we are heavily fragmented in addition to just normal
	 * out of space conditions.  So if we hit this just skip setting up any
	 * other block groups for this transaction, maybe we'll unpin enough
	 * space the next time around.
	 */
	if (!ret)
		dcs = BTRFS_DC_SETUP;
	else if (ret == -ENOSPC)
		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);

out_put:
	iput(inode);
out_free:
	btrfs_release_path(path);
out:
	/* Record the outcome; the generation is only stamped on a successful setup. */
	spin_lock(&block_group->lock);
	if (!ret && dcs == BTRFS_DC_SETUP)
		block_group->cache_generation = trans->transid;
	block_group->disk_cache_state = dcs;
	spin_unlock(&block_group->lock);

	extent_changeset_free(data_reserved);
	return ret;
}
2908
2909int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2910{
2911	struct btrfs_fs_info *fs_info = trans->fs_info;
2912	struct btrfs_block_group *cache, *tmp;
2913	struct btrfs_transaction *cur_trans = trans->transaction;
2914	struct btrfs_path *path;
2915
2916	if (list_empty(&cur_trans->dirty_bgs) ||
2917	    !btrfs_test_opt(fs_info, SPACE_CACHE))
2918		return 0;
2919
2920	path = btrfs_alloc_path();
2921	if (!path)
2922		return -ENOMEM;
2923
2924	/* Could add new block groups, use _safe just in case */
2925	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2926				 dirty_list) {
2927		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2928			cache_save_setup(cache, trans, path);
2929	}
2930
2931	btrfs_free_path(path);
2932	return 0;
2933}
2934
/*
 * Transaction commit does final block group cache writeback during a critical
 * section where nothing is allowed to change the FS.  This is required in
 * order for the cache to actually match the block group, but can introduce a
 * lot of latency into the commit.
 *
 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
 * There's a chance we'll have to redo some of it if the block group changes
 * again during the commit, but it greatly reduces the commit latency by
 * getting rid of the easy block groups while we're still allowing others to
 * join the commit.
 *
 * Returns 0 on success or an error code.  When the result is negative, the
 * block groups still sitting on the local list are spliced back onto the
 * transaction's dirty list and btrfs_cleanup_dirty_bgs() is called.
 */
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *cache;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int ret = 0;
	int should_put;
	struct btrfs_path *path = NULL;
	LIST_HEAD(dirty);
	struct list_head *io = &cur_trans->io_bgs;
	int loops = 0;

	/* Detach the transaction's current dirty list onto our local list. */
	spin_lock(&cur_trans->dirty_bgs_lock);
	if (list_empty(&cur_trans->dirty_bgs)) {
		spin_unlock(&cur_trans->dirty_bgs_lock);
		return 0;
	}
	list_splice_init(&cur_trans->dirty_bgs, &dirty);
	spin_unlock(&cur_trans->dirty_bgs_lock);

again:
	/* Make sure all the block groups on our dirty list actually exist */
	btrfs_create_pending_block_groups(trans);

	/* The path is allocated once and reused by the second pass. */
	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	/*
	 * cache_write_mutex is here only to save us from balance or automatic
	 * removal of empty block groups deleting this block group while we are
	 * writing out the cache
	 */
	mutex_lock(&trans->transaction->cache_write_mutex);
	while (!list_empty(&dirty)) {
		/*
		 * Cleared only when the block group is put back on the dirty
		 * list below, in which case the delayed refs reservation is
		 * not released here.
		 */
		bool drop_reserve = true;

		cache = list_first_entry(&dirty, struct btrfs_block_group,
					 dirty_list);
		/*
		 * This can happen if something re-dirties a block group that
		 * is already under IO.  Just wait for it to finish and then do
		 * it all again
		 */
		if (!list_empty(&cache->io_list)) {
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(trans, cache, path);
			btrfs_put_block_group(cache);
		}


		/*
		 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
		 * it should update the cache_state.  Don't delete until after
		 * we wait.
		 *
		 * Since we're not running in the commit critical section
		 * we need the dirty_bgs_lock to protect from update_block_group
		 */
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);

		/*
		 * Drop our reference at the end of this iteration unless the
		 * block group gets queued on the io list below.
		 */
		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				should_put = 0;

				/*
				 * The cache_write_mutex is protecting the
				 * io_list, also refer to the definition of
				 * btrfs_transaction::io_bgs for more details
				 */
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * If we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = update_block_group_item(trans, path, cache);
			/*
			 * Our block group might still be attached to the list
			 * of new block groups in the transaction handle of some
			 * other task (struct btrfs_trans_handle->new_bgs). This
			 * means its block group item isn't yet in the extent
			 * tree. If this happens ignore the error, as we will
			 * try again later in the critical section of the
			 * transaction commit.
			 */
			if (ret == -ENOENT) {
				ret = 0;
				spin_lock(&cur_trans->dirty_bgs_lock);
				if (list_empty(&cache->dirty_list)) {
					list_add_tail(&cache->dirty_list,
						      &cur_trans->dirty_bgs);
					btrfs_get_block_group(cache);
					drop_reserve = false;
				}
				spin_unlock(&cur_trans->dirty_bgs_lock);
			} else if (ret) {
				btrfs_abort_transaction(trans, ret);
			}
		}

		/* If it's not on the io list, we need to put the block group */
		if (should_put)
			btrfs_put_block_group(cache);
		if (drop_reserve)
			btrfs_delayed_refs_rsv_release(fs_info, 1);
		/*
		 * Avoid blocking other tasks for too long. It might even save
		 * us from writing caches for block groups that are going to be
		 * removed.
		 */
		mutex_unlock(&trans->transaction->cache_write_mutex);
		/* Bail out with the mutex already released. */
		if (ret)
			goto out;
		mutex_lock(&trans->transaction->cache_write_mutex);
	}
	mutex_unlock(&trans->transaction->cache_write_mutex);

	/*
	 * Go through delayed refs for all the stuff we've just kicked off
	 * and then loop back (just once)
	 */
	if (!ret)
		ret = btrfs_run_delayed_refs(trans, 0);
	if (!ret && loops == 0) {
		loops++;
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_splice_init(&cur_trans->dirty_bgs, &dirty);
		/*
		 * dirty_bgs_lock protects us from concurrent block group
		 * deletes too (not just cache_write_mutex).
		 */
		if (!list_empty(&dirty)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			goto again;
		}
		spin_unlock(&cur_trans->dirty_bgs_lock);
	}
out:
	if (ret < 0) {
		/* Hand back whatever we did not get to before cleaning up. */
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_splice_init(&dirty, &cur_trans->dirty_bgs);
		spin_unlock(&cur_trans->dirty_bgs_lock);
		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
	}

	btrfs_free_path(path);
	return ret;
}
3112
/*
 * Drain the transaction's list of dirty block groups: for each one set up and
 * write out its space cache, update its block group item, and finally wait
 * for all the cache IO queued on the transaction's io_bgs list.
 *
 * Once an error has been recorded in ret the loop keeps draining the dirty
 * list (dropping references and releasing the delayed refs reservation), but
 * running delayed refs, writing the cache and updating the block group item
 * are skipped for the remaining block groups.
 *
 * Returns 0 on success, -ENOMEM if the path can't be allocated, or the first
 * error hit while processing the block groups.
 */
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *cache;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int ret = 0;
	int should_put;
	struct btrfs_path *path;
	struct list_head *io = &cur_trans->io_bgs;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Even though we are in the critical section of the transaction commit,
	 * we can still have concurrent tasks adding elements to this
	 * transaction's list of dirty block groups. These tasks correspond to
	 * endio free space workers started when writeback finishes for a
	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
	 * allocate new block groups as a result of COWing nodes of the root
	 * tree when updating the free space inode. The writeback for the space
	 * caches is triggered by an earlier call to
	 * btrfs_start_dirty_block_groups() and iterations of the following
	 * loop.
	 * Also we want to do the cache_save_setup first and then run the
	 * delayed refs to make sure we have the best chance at doing this all
	 * in one shot.
	 */
	spin_lock(&cur_trans->dirty_bgs_lock);
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group,
					 dirty_list);

		/*
		 * This can happen if cache_save_setup re-dirties a block group
		 * that is already under IO.  Just wait for it to finish and
		 * then do it all again
		 */
		if (!list_empty(&cache->io_list)) {
			/* The lock is dropped around the wait for the IO. */
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(trans, cache, path);
			btrfs_put_block_group(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		/*
		 * Don't remove from the dirty list until after we've waited on
		 * any pending IO
		 */
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);
		/*
		 * Drop our reference at the end of this iteration unless the
		 * block group gets queued on the io list below.
		 */
		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (!ret)
			ret = btrfs_run_delayed_refs(trans,
						     (unsigned long) -1);

		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				should_put = 0;
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * If we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = update_block_group_item(trans, path, cache);
			/*
			 * One of the free space endio workers might have
			 * created a new block group while updating a free space
			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
			 * and hasn't released its transaction handle yet, in
			 * which case the new block group is still attached to
			 * its transaction handle and its creation has not
			 * finished yet (no block group item in the extent tree
			 * yet, etc). If this is the case, wait for all free
			 * space endio workers to finish and retry. This is a
			 * very rare case so no need for a more efficient and
			 * complex approach.
			 */
			if (ret == -ENOENT) {
				wait_event(cur_trans->writer_wait,
				   atomic_read(&cur_trans->num_writers) == 1);
				ret = update_block_group_item(trans, path, cache);
			}
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}

		/* If it's not on the io list, we need to put the block group */
		if (should_put)
			btrfs_put_block_group(cache);
		btrfs_delayed_refs_rsv_release(fs_info, 1);
		/* Retake the lock for the next list_empty() check. */
		spin_lock(&cur_trans->dirty_bgs_lock);
	}
	spin_unlock(&cur_trans->dirty_bgs_lock);

	/*
	 * Refer to the definition of io_bgs member for details why it's safe
	 * to use it without any locking
	 */
	while (!list_empty(io)) {
		cache = list_first_entry(io, struct btrfs_block_group,
					 io_list);
		list_del_init(&cache->io_list);
		btrfs_wait_cache_io(trans, cache, path);
		btrfs_put_block_group(cache);
	}

	btrfs_free_path(path);
	return ret;
}
3236
3237static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
3238					      u64 bytes_freed)
3239{
3240	const struct btrfs_space_info *space_info = bg->space_info;
3241	const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
3242	const u64 new_val = bg->used;
3243	const u64 old_val = new_val + bytes_freed;
3244	u64 thresh;
3245
3246	if (reclaim_thresh == 0)
3247		return false;
3248
3249	thresh = div_factor_fine(bg->length, reclaim_thresh);
3250
3251	/*
3252	 * If we were below the threshold before don't reclaim, we are likely a
3253	 * brand new block group and we don't want to relocate new block groups.
3254	 */
3255	if (old_val < thresh)
3256		return false;
3257	if (new_val >= thresh)
3258		return false;
3259	return true;
3260}
3261
3262int btrfs_update_block_group(struct btrfs_trans_handle *trans,
3263			     u64 bytenr, u64 num_bytes, bool alloc)
3264{
3265	struct btrfs_fs_info *info = trans->fs_info;
3266	struct btrfs_block_group *cache = NULL;
3267	u64 total = num_bytes;
3268	u64 old_val;
3269	u64 byte_in_group;
3270	int factor;
3271	int ret = 0;
3272
3273	/* Block accounting for super block */
3274	spin_lock(&info->delalloc_root_lock);
3275	old_val = btrfs_super_bytes_used(info->super_copy);
3276	if (alloc)
3277		old_val += num_bytes;
3278	else
3279		old_val -= num_bytes;
3280	btrfs_set_super_bytes_used(info->super_copy, old_val);
3281	spin_unlock(&info->delalloc_root_lock);
3282
3283	while (total) {
3284		bool reclaim;
3285
3286		cache = btrfs_lookup_block_group(info, bytenr);
3287		if (!cache) {
3288			ret = -ENOENT;
3289			break;
3290		}
3291		factor = btrfs_bg_type_to_factor(cache->flags);
3292
3293		/*
3294		 * If this block group has free space cache written out, we
3295		 * need to make sure to load it if we are removing space.  This
3296		 * is because we need the unpinning stage to actually add the
3297		 * space back to the block group, otherwise we will leak space.
3298		 */
3299		if (!alloc && !btrfs_block_group_done(cache))
3300			btrfs_cache_block_group(cache, 1);
3301
3302		byte_in_group = bytenr - cache->start;
3303		WARN_ON(byte_in_group > cache->length);
3304
3305		spin_lock(&cache->space_info->lock);
3306		spin_lock(&cache->lock);
3307
3308		if (btrfs_test_opt(info, SPACE_CACHE) &&
3309		    cache->disk_cache_state < BTRFS_DC_CLEAR)
3310			cache->disk_cache_state = BTRFS_DC_CLEAR;
3311
3312		old_val = cache->used;
3313		num_bytes = min(total, cache->length - byte_in_group);
3314		if (alloc) {
3315			old_val += num_bytes;
3316			cache->used = old_val;
3317			cache->reserved -= num_bytes;
3318			cache->space_info->bytes_reserved -= num_bytes;
3319			cache->space_info->bytes_used += num_bytes;
3320			cache->space_info->disk_used += num_bytes * factor;
3321			spin_unlock(&cache->lock);
3322			spin_unlock(&cache->space_info->lock);
3323		} else {
3324			old_val -= num_bytes;
3325			cache->used = old_val;
3326			cache->pinned += num_bytes;
3327			btrfs_space_info_update_bytes_pinned(info,
3328					cache->space_info, num_bytes);
3329			cache->space_info->bytes_used -= num_bytes;
3330			cache->space_info->disk_used -= num_bytes * factor;
3331
3332			reclaim = should_reclaim_block_group(cache, num_bytes);
3333			spin_unlock(&cache->lock);
3334			spin_unlock(&cache->space_info->lock);
3335
3336			set_extent_dirty(&trans->transaction->pinned_extents,
3337					 bytenr, bytenr + num_bytes - 1,
3338					 GFP_NOFS | __GFP_NOFAIL);
3339		}
3340
3341		spin_lock(&trans->transaction->dirty_bgs_lock);
3342		if (list_empty(&cache->dirty_list)) {
3343			list_add_tail(&cache->dirty_list,
3344				      &trans->transaction->dirty_bgs);
3345			trans->delayed_ref_updates++;
3346			btrfs_get_block_group(cache);
3347		}
3348		spin_unlock(&trans->transaction->dirty_bgs_lock);
3349
3350		/*
3351		 * No longer have used bytes in this block group, queue it for
3352		 * deletion. We do this after adding the block group to the
3353		 * dirty list to avoid races between cleaner kthread and space
3354		 * cache writeout.
3355		 */
3356		if (!alloc && old_val == 0) {
3357			if (!btrfs_test_opt(info, DISCARD_ASYNC))
3358				btrfs_mark_bg_unused(cache);
3359		} else if (!alloc && reclaim) {
3360			btrfs_mark_bg_to_reclaim(cache);
3361		}
3362
3363		btrfs_put_block_group(cache);
3364		total -= num_bytes;
3365		bytenr += num_bytes;
3366	}
3367
3368	/* Modified block groups are accounted for in the delayed_refs_rsv. */
3369	btrfs_update_delayed_refs_rsv(trans);
3370	return ret;
3371}
3372
3373/**
3374 * btrfs_add_reserved_bytes - update the block_group and space info counters
3375 * @cache:	The cache we are manipulating
3376 * @ram_bytes:  The number of bytes of file content, and will be same to
3377 *              @num_bytes except for the compress path.
3378 * @num_bytes:	The number of bytes in question
3379 * @delalloc:   The blocks are allocated for the delalloc write
3380 *
3381 * This is called by the allocator when it reserves space. If this is a
3382 * reservation and the block group has become read only we cannot make the
3383 * reservation and return -EAGAIN, otherwise this function always succeeds.
3384 */
3385int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
3386			     u64 ram_bytes, u64 num_bytes, int delalloc)
3387{
3388	struct btrfs_space_info *space_info = cache->space_info;
3389	int ret = 0;
3390
3391	spin_lock(&space_info->lock);
3392	spin_lock(&cache->lock);
3393	if (cache->ro) {
3394		ret = -EAGAIN;
3395	} else {
3396		cache->reserved += num_bytes;
3397		space_info->bytes_reserved += num_bytes;
3398		trace_btrfs_space_reservation(cache->fs_info, "space_info",
3399					      space_info->flags, num_bytes, 1);
3400		btrfs_space_info_update_bytes_may_use(cache->fs_info,
3401						      space_info, -ram_bytes);
3402		if (delalloc)
3403			cache->delalloc_bytes += num_bytes;
3404
3405		/*
3406		 * Compression can use less space than we reserved, so wake
3407		 * tickets if that happens
3408		 */
3409		if (num_bytes < ram_bytes)
3410			btrfs_try_granting_tickets(cache->fs_info, space_info);
3411	}
3412	spin_unlock(&cache->lock);
3413	spin_unlock(&space_info->lock);
3414	return ret;
3415}
3416
3417/**
3418 * btrfs_free_reserved_bytes - update the block_group and space info counters
3419 * @cache:      The cache we are manipulating
3420 * @num_bytes:  The number of bytes in question
3421 * @delalloc:   The blocks are allocated for the delalloc write
3422 *
3423 * This is called by somebody who is freeing space that was never actually used
3424 * on disk.  For example if you reserve some space for a new leaf in transaction
3425 * A and before transaction A commits you free that leaf, you call this with
3426 * reserve set to 0 in order to clear the reservation.
3427 */
3428void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
3429			       u64 num_bytes, int delalloc)
3430{
3431	struct btrfs_space_info *space_info = cache->space_info;
3432
3433	spin_lock(&space_info->lock);
3434	spin_lock(&cache->lock);
3435	if (cache->ro)
3436		space_info->bytes_readonly += num_bytes;
3437	cache->reserved -= num_bytes;
3438	space_info->bytes_reserved -= num_bytes;
3439	space_info->max_extent_size = 0;
3440
3441	if (delalloc)
3442		cache->delalloc_bytes -= num_bytes;
3443	spin_unlock(&cache->lock);
3444
3445	btrfs_try_granting_tickets(cache->fs_info, space_info);
3446	spin_unlock(&space_info->lock);
3447}
3448
3449static void force_metadata_allocation(struct btrfs_fs_info *info)
3450{
3451	struct list_head *head = &info->space_info;
3452	struct btrfs_space_info *found;
3453
3454	list_for_each_entry(found, head, list) {
3455		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3456			found->force_alloc = CHUNK_ALLOC_FORCE;
3457	}
3458}
3459
3460static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3461			      struct btrfs_space_info *sinfo, int force)
3462{
3463	u64 bytes_used = btrfs_space_info_used(sinfo, false);
3464	u64 thresh;
3465
3466	if (force == CHUNK_ALLOC_FORCE)
3467		return 1;
3468
3469	/*
3470	 * in limited mode, we want to have some free space up to
3471	 * about 1% of the FS size.
3472	 */
3473	if (force == CHUNK_ALLOC_LIMITED) {
3474		thresh = btrfs_super_total_bytes(fs_info->super_copy);
3475		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
3476
3477		if (sinfo->total_bytes - bytes_used < thresh)
3478			return 1;
3479	}
3480
3481	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
3482		return 0;
3483	return 1;
3484}
3485
3486int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3487{
3488	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3489
3490	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3491}
3492
/*
 * Create a new chunk with the given @flags and add its chunk item.
 *
 * If adding the chunk item fails with -ENOSPC, a new system chunk is created,
 * its chunk item is added, and adding the chunk item for the original block
 * group is retried once.  Every failure other than the initial
 * btrfs_create_chunk() call aborts the transaction.
 *
 * btrfs_trans_release_chunk_metadata() is called on all exit paths.
 *
 * Returns the new block group with an extra reference taken through
 * btrfs_get_block_group(), or an ERR_PTR() on failure.
 */
static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
	struct btrfs_block_group *bg;
	int ret;

	/*
	 * Check if we have enough space in the system space info because we
	 * will need to update device items in the chunk btree and insert a new
	 * chunk item in the chunk btree as well. This will allocate a new
	 * system block group if needed.
	 */
	check_system_chunk(trans, flags);

	bg = btrfs_create_chunk(trans, flags);
	if (IS_ERR(bg)) {
		ret = PTR_ERR(bg);
		goto out;
	}

	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
	/*
	 * Normally we are not expected to fail with -ENOSPC here, since we have
	 * previously reserved space in the system space_info and allocated one
	 * new system chunk if necessary. However there are three exceptions:
	 *
	 * 1) We may have enough free space in the system space_info but all the
	 *    existing system block groups have a profile which can not be used
	 *    for extent allocation.
	 *
	 *    This happens when mounting in degraded mode. For example we have a
	 *    RAID1 filesystem with 2 devices, lose one device and mount the fs
	 *    using the other device in degraded mode. If we then allocate a chunk,
	 *    we may have enough free space in the existing system space_info, but
	 *    none of the block groups can be used for extent allocation since they
	 *    have a RAID1 profile, and because we are in degraded mode with a
	 *    single device, we are forced to allocate a new system chunk with a
	 *    SINGLE profile. Making check_system_chunk() iterate over all system
	 *    block groups and check if they have a usable profile and enough space
	 *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
	 *    try again after forcing allocation of a new system chunk. Like this
	 *    we avoid paying the cost of that search in normal circumstances, when
	 *    we were not mounted in degraded mode;
	 *
	 * 2) We had enough free space in the system space_info, and one suitable
	 *    block group to allocate from when we called check_system_chunk()
	 *    above. However right after we called it, the only system block group
	 *    with enough free space got turned into RO mode by a running scrub,
	 *    and in this case we have to allocate a new one and retry. We only
	 *    need to do this allocate and retry once, since we have a transaction
	 *    handle and scrub uses the commit root to search for block groups;
	 *
	 * 3) We had one system block group with enough free space when we called
	 *    check_system_chunk(), but after that, right before we tried to
	 *    allocate the last extent buffer we needed, a discard operation came
	 *    in and it temporarily removed the last free space entry from the
	 *    block group (discard removes a free space entry, discards it, and
	 *    then adds back the entry to the block group cache).
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_create_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		/* Retry the insertion that originally failed with -ENOSPC. */
		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
out:
	btrfs_trans_release_chunk_metadata(trans);

	if (ret)
		return ERR_PTR(ret);

	/* Extra reference for the caller, which receives the block group. */
	btrfs_get_block_group(bg);
	return bg;
}
3586
3587/*
3588 * Chunk allocation is done in 2 phases:
3589 *
3590 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
3591 *    the chunk, the chunk mapping, create its block group and add the items
3592 *    that belong in the chunk btree to it - more specifically, we need to
3593 *    update device items in the chunk btree and add a new chunk item to it.
3594 *
3595 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
3596 *    group item to the extent btree and the device extent items to the devices
3597 *    btree.
3598 *
3599 * This is done to prevent deadlocks. For example when COWing a node from the
3600 * extent btree we are holding a write lock on the node's parent and if we
3601 * trigger chunk allocation and attempted to insert the new block group item
 * in the extent btree right away, we could deadlock because the path for the
3603 * insertion can include that parent node. At first glance it seems impossible
3604 * to trigger chunk allocation after starting a transaction since tasks should
3605 * reserve enough transaction units (metadata space), however while that is true
3606 * most of the time, chunk allocation may still be triggered for several reasons:
3607 *
3608 * 1) When reserving metadata, we check if there is enough free space in the
3609 *    metadata space_info and therefore don't trigger allocation of a new chunk.
3610 *    However later when the task actually tries to COW an extent buffer from
3611 *    the extent btree or from the device btree for example, it is forced to
3612 *    allocate a new block group (chunk) because the only one that had enough
3613 *    free space was just turned to RO mode by a running scrub for example (or
3614 *    device replace, block group reclaim thread, etc), so we can not use it
3615 *    for allocating an extent and end up being forced to allocate a new one;
3616 *
3617 * 2) Because we only check that the metadata space_info has enough free bytes,
3618 *    we end up not allocating a new metadata chunk in that case. However if
3619 *    the filesystem was mounted in degraded mode, none of the existing block
3620 *    groups might be suitable for extent allocation due to their incompatible
3621 *    profile (for e.g. mounting a 2 devices filesystem, where all block groups
3622 *    use a RAID1 profile, in degraded mode using a single device). In this case
3623 *    when the task attempts to COW some extent buffer of the extent btree for
3624 *    example, it will trigger allocation of a new metadata block group with a
3625 *    suitable profile (SINGLE profile in the example of the degraded mount of
3626 *    the RAID1 filesystem);
3627 *
3628 * 3) The task has reserved enough transaction units / metadata space, but when
3629 *    it attempts to COW an extent buffer from the extent or device btree for
3630 *    example, it does not find any free extent in any metadata block group,
3631 *    therefore forced to try to allocate a new metadata block group.
3632 *    This is because some other task allocated all available extents in the
3633 *    meanwhile - this typically happens with tasks that don't reserve space
3634 *    properly, either intentionally or as a bug. One example where this is
3635 *    done intentionally is fsync, as it does not reserve any transaction units
3636 *    and ends up allocating a variable number of metadata extents for log
3637 *    tree extent buffers;
3638 *
3639 * 4) The task has reserved enough transaction units / metadata space, but right
3640 *    before it tries to allocate the last extent buffer it needs, a discard
3641 *    operation comes in and, temporarily, removes the last free space entry from
3642 *    the only metadata block group that had free space (discard starts by
3643 *    removing a free space entry from a block group, then does the discard
3644 *    operation and, once it's done, it adds back the free space entry to the
3645 *    block group).
3646 *
3647 * We also need this 2 phases setup when adding a device to a filesystem with
3648 * a seed device - we must create new metadata and system chunks without adding
3649 * any of the block group items to the chunk, extent and device btrees. If we
3650 * did not do it this way, we would get ENOSPC when attempting to update those
3651 * btrees, since all the chunks from the seed device are read-only.
3652 *
3653 * Phase 1 does the updates and insertions to the chunk btree because if we had
3654 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
3655 * parallel, we risk having too many system chunks allocated by many tasks if
3656 * many tasks reach phase 1 without the previous ones completing phase 2. In the
3657 * extreme case this leads to exhaustion of the system chunk array in the
3658 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
3659 * and with RAID filesystems (so we have more device items in the chunk btree).
3660 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
3661 * the system chunk array due to concurrent allocations") provides more details.
3662 *
3663 * Allocation of system chunks does not happen through this function. A task that
3664 * needs to update the chunk btree (the only btree that uses system chunks), must
3665 * preallocate chunk space by calling either check_system_chunk() or
3666 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
 * metadata chunk or when removing a chunk, while the latter is used before
 * doing a modification to the chunk btree - use cases for the latter are
 * adding, removing and resizing a device as well as relocation of a system
 * chunk.
3670 * See the comment below for more details.
3671 *
3672 * The reservation of system space, done through check_system_chunk(), as well
3673 * as all the updates and insertions into the chunk btree must be done while
3674 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
3675 * an extent buffer from the chunks btree we never trigger allocation of a new
3676 * system chunk, which would result in a deadlock (trying to lock twice an
3677 * extent buffer of the chunk btree, first time before triggering the chunk
3678 * allocation and the second time during chunk allocation while attempting to
3679 * update the chunks btree). The system chunk array is also updated while holding
3680 * that mutex. The same logic applies to removing chunks - we must reserve system
3681 * space, update the chunk btree and the system chunk array in the superblock
3682 * while holding fs_info->chunk_mutex.
3683 *
3684 * This function, btrfs_chunk_alloc(), belongs to phase 1.
3685 *
3686 * If @force is CHUNK_ALLOC_FORCE:
3687 *    - return 1 if it successfully allocates a chunk,
3688 *    - return errors including -ENOSPC otherwise.
3689 * If @force is NOT CHUNK_ALLOC_FORCE:
3690 *    - return 0 if it doesn't need to allocate a new chunk,
3691 *    - return 1 if it successfully allocates a chunk,
3692 *    - return errors including -ENOSPC otherwise.
3693 */
3694int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3695		      enum btrfs_chunk_alloc_enum force)
3696{
3697	struct btrfs_fs_info *fs_info = trans->fs_info;
3698	struct btrfs_space_info *space_info;
3699	struct btrfs_block_group *ret_bg;
3700	bool wait_for_alloc = false;
3701	bool should_alloc = false;
3702	bool from_extent_allocation = false;
3703	int ret = 0;
3704
3705	if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
3706		from_extent_allocation = true;
3707		force = CHUNK_ALLOC_FORCE;
3708	}
3709
3710	/* Don't re-enter if we're already allocating a chunk */
3711	if (trans->allocating_chunk)
3712		return -ENOSPC;
3713	/*
3714	 * Allocation of system chunks can not happen through this path, as we
3715	 * could end up in a deadlock if we are allocating a data or metadata
3716	 * chunk and there is another task modifying the chunk btree.
3717	 *
3718	 * This is because while we are holding the chunk mutex, we will attempt
3719	 * to add the new chunk item to the chunk btree or update an existing
3720	 * device item in the chunk btree, while the other task that is modifying
3721	 * the chunk btree is attempting to COW an extent buffer while holding a
3722	 * lock on it and on its parent - if the COW operation triggers a system
3723	 * chunk allocation, then we can deadlock because we are holding the
3724	 * chunk mutex and we may need to access that extent buffer or its parent
3725	 * in order to add the chunk item or update a device item.
3726	 *
3727	 * Tasks that want to modify the chunk tree should reserve system space
3728	 * before updating the chunk btree, by calling either
3729	 * btrfs_reserve_chunk_metadata() or check_system_chunk().
3730	 * It's possible that after a task reserves the space, it still ends up
3731	 * here - this happens in the cases described above at do_chunk_alloc().
3732	 * The task will have to either retry or fail.
3733	 */
3734	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3735		return -ENOSPC;
3736
3737	space_info = btrfs_find_space_info(fs_info, flags);
3738	ASSERT(space_info);
3739
3740	do {
3741		spin_lock(&space_info->lock);
3742		if (force < space_info->force_alloc)
3743			force = space_info->force_alloc;
3744		should_alloc = should_alloc_chunk(fs_info, space_info, force);
3745		if (space_info->full) {
3746			/* No more free physical space */
3747			if (should_alloc)
3748				ret = -ENOSPC;
3749			else
3750				ret = 0;
3751			spin_unlock(&space_info->lock);
3752			return ret;
3753		} else if (!should_alloc) {
3754			spin_unlock(&space_info->lock);
3755			return 0;
3756		} else if (space_info->chunk_alloc) {
3757			/*
3758			 * Someone is already allocating, so we need to block
3759			 * until this someone is finished and then loop to
3760			 * recheck if we should continue with our allocation
3761			 * attempt.
3762			 */
3763			wait_for_alloc = true;
3764			spin_unlock(&space_info->lock);
3765			mutex_lock(&fs_info->chunk_mutex);
3766			mutex_unlock(&fs_info->chunk_mutex);
3767		} else {
3768			/* Proceed with allocation */
3769			space_info->chunk_alloc = 1;
3770			wait_for_alloc = false;
3771			spin_unlock(&space_info->lock);
3772		}
3773
3774		cond_resched();
3775	} while (wait_for_alloc);
3776
3777	mutex_lock(&fs_info->chunk_mutex);
3778	trans->allocating_chunk = true;
3779
3780	/*
3781	 * If we have mixed data/metadata chunks we want to make sure we keep
3782	 * allocating mixed chunks instead of individual chunks.
3783	 */
3784	if (btrfs_mixed_space_info(space_info))
3785		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3786
3787	/*
3788	 * if we're doing a data chunk, go ahead and make sure that
3789	 * we keep a reasonable number of metadata chunks allocated in the
3790	 * FS as well.
3791	 */
3792	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3793		fs_info->data_chunk_allocations++;
3794		if (!(fs_info->data_chunk_allocations %
3795		      fs_info->metadata_ratio))
3796			force_metadata_allocation(fs_info);
3797	}
3798
3799	ret_bg = do_chunk_alloc(trans, flags);
3800	trans->allocating_chunk = false;
3801
3802	if (IS_ERR(ret_bg)) {
3803		ret = PTR_ERR(ret_bg);
3804	} else if (from_extent_allocation) {
3805		/*
3806		 * New block group is likely to be used soon. Try to activate
3807		 * it now. Failure is OK for now.
3808		 */
3809		btrfs_zone_activate(ret_bg);
3810	}
3811
3812	if (!ret)
3813		btrfs_put_block_group(ret_bg);
3814
3815	spin_lock(&space_info->lock);
3816	if (ret < 0) {
3817		if (ret == -ENOSPC)
3818			space_info->full = 1;
3819		else
3820			goto out;
3821	} else {
3822		ret = 1;
3823		space_info->max_extent_size = 0;
3824	}
3825
3826	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3827out:
3828	space_info->chunk_alloc = 0;
3829	spin_unlock(&space_info->lock);
3830	mutex_unlock(&fs_info->chunk_mutex);
3831
3832	return ret;
3833}
3834
3835static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3836{
3837	u64 num_dev;
3838
3839	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
3840	if (!num_dev)
3841		num_dev = fs_info->fs_devices->rw_devices;
3842
3843	return num_dev;
3844}
3845
3846static void reserve_chunk_space(struct btrfs_trans_handle *trans,
3847				u64 bytes,
3848				u64 type)
3849{
3850	struct btrfs_fs_info *fs_info = trans->fs_info;
3851	struct btrfs_space_info *info;
3852	u64 left;
3853	int ret = 0;
3854
3855	/*
3856	 * Needed because we can end up allocating a system chunk and for an
3857	 * atomic and race free space reservation in the chunk block reserve.
3858	 */
3859	lockdep_assert_held(&fs_info->chunk_mutex);
3860
3861	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3862	spin_lock(&info->lock);
3863	left = info->total_bytes - btrfs_space_info_used(info, true);
3864	spin_unlock(&info->lock);
3865
3866	if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3867		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3868			   left, bytes, type);
3869		btrfs_dump_space_info(fs_info, info, 0, 0);
3870	}
3871
3872	if (left < bytes) {
3873		u64 flags = btrfs_system_alloc_profile(fs_info);
3874		struct btrfs_block_group *bg;
3875
3876		/*
3877		 * Ignore failure to create system chunk. We might end up not
3878		 * needing it, as we might not need to COW all nodes/leafs from
3879		 * the paths we visit in the chunk tree (they were already COWed
3880		 * or created in the current transaction for example).
3881		 */
3882		bg = btrfs_create_chunk(trans, flags);
3883		if (IS_ERR(bg)) {
3884			ret = PTR_ERR(bg);
3885		} else {
3886			/*
3887			 * If we fail to add the chunk item here, we end up
3888			 * trying again at phase 2 of chunk allocation, at
3889			 * btrfs_create_pending_block_groups(). So ignore
3890			 * any error here. An ENOSPC here could happen, due to
3891			 * the cases described at do_chunk_alloc() - the system
3892			 * block group we just created was just turned into RO
3893			 * mode by a scrub for example, or a running discard
3894			 * temporarily removed its free space entries, etc.
3895			 */
3896			btrfs_chunk_alloc_add_chunk_item(trans, bg);
3897		}
3898	}
3899
3900	if (!ret) {
3901		ret = btrfs_block_rsv_add(fs_info,
3902					  &fs_info->chunk_block_rsv,
3903					  bytes, BTRFS_RESERVE_NO_FLUSH);
3904		if (!ret)
3905			trans->chunk_bytes_reserved += bytes;
3906	}
3907}
3908
3909/*
3910 * Reserve space in the system space for allocating or removing a chunk.
3911 * The caller must be holding fs_info->chunk_mutex.
3912 */
3913void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3914{
3915	struct btrfs_fs_info *fs_info = trans->fs_info;
3916	const u64 num_devs = get_profile_num_devs(fs_info, type);
3917	u64 bytes;
3918
3919	/* num_devs device items to update and 1 chunk item to add or remove. */
3920	bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
3921		btrfs_calc_insert_metadata_size(fs_info, 1);
3922
3923	reserve_chunk_space(trans, bytes, type);
3924}
3925
3926/*
3927 * Reserve space in the system space, if needed, for doing a modification to the
3928 * chunk btree.
3929 *
3930 * @trans:		A transaction handle.
3931 * @is_item_insertion:	Indicate if the modification is for inserting a new item
3932 *			in the chunk btree or if it's for the deletion or update
3933 *			of an existing item.
3934 *
3935 * This is used in a context where we need to update the chunk btree outside
3936 * block group allocation and removal, to avoid a deadlock with a concurrent
3937 * task that is allocating a metadata or data block group and therefore needs to
3938 * update the chunk btree while holding the chunk mutex. After the update to the
3939 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
3940 *
3941 */
3942void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
3943				  bool is_item_insertion)
3944{
3945	struct btrfs_fs_info *fs_info = trans->fs_info;
3946	u64 bytes;
3947
3948	if (is_item_insertion)
3949		bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
3950	else
3951		bytes = btrfs_calc_metadata_size(fs_info, 1);
3952
3953	mutex_lock(&fs_info->chunk_mutex);
3954	reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
3955	mutex_unlock(&fs_info->chunk_mutex);
3956}
3957
3958void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3959{
3960	struct btrfs_block_group *block_group;
3961	u64 last = 0;
3962
3963	while (1) {
3964		struct inode *inode;
3965
3966		block_group = btrfs_lookup_first_block_group(info, last);
3967		while (block_group) {
3968			btrfs_wait_block_group_cache_done(block_group);
3969			spin_lock(&block_group->lock);
3970			if (block_group->iref)
3971				break;
3972			spin_unlock(&block_group->lock);
3973			block_group = btrfs_next_block_group(block_group);
3974		}
3975		if (!block_group) {
3976			if (last == 0)
3977				break;
3978			last = 0;
3979			continue;
3980		}
3981
3982		inode = block_group->inode;
3983		block_group->iref = 0;
3984		block_group->inode = NULL;
3985		spin_unlock(&block_group->lock);
3986		ASSERT(block_group->io_ctl.inode == NULL);
3987		iput(inode);
3988		last = block_group->start + block_group->length;
3989		btrfs_put_block_group(block_group);
3990	}
3991}
3992
3993/*
3994 * Must be called only after stopping all workers, since we could have block
3995 * group caching kthreads running, and therefore they could race with us if we
3996 * freed the block groups before stopping them.
3997 */
3998int btrfs_free_block_groups(struct btrfs_fs_info *info)
3999{
4000	struct btrfs_block_group *block_group;
4001	struct btrfs_space_info *space_info;
4002	struct btrfs_caching_control *caching_ctl;
4003	struct rb_node *n;
4004
4005	write_lock(&info->block_group_cache_lock);
4006	while (!list_empty(&info->caching_block_groups)) {
4007		caching_ctl = list_entry(info->caching_block_groups.next,
4008					 struct btrfs_caching_control, list);
4009		list_del(&caching_ctl->list);
4010		btrfs_put_caching_control(caching_ctl);
4011	}
4012	write_unlock(&info->block_group_cache_lock);
4013
4014	spin_lock(&info->unused_bgs_lock);
4015	while (!list_empty(&info->unused_bgs)) {
4016		block_group = list_first_entry(&info->unused_bgs,
4017					       struct btrfs_block_group,
4018					       bg_list);
4019		list_del_init(&block_group->bg_list);
4020		btrfs_put_block_group(block_group);
4021	}
4022
4023	while (!list_empty(&info->reclaim_bgs)) {
4024		block_group = list_first_entry(&info->reclaim_bgs,
4025					       struct btrfs_block_group,
4026					       bg_list);
4027		list_del_init(&block_group->bg_list);
4028		btrfs_put_block_group(block_group);
4029	}
4030	spin_unlock(&info->unused_bgs_lock);
4031
4032	spin_lock(&info->zone_active_bgs_lock);
4033	while (!list_empty(&info->zone_active_bgs)) {
4034		block_group = list_first_entry(&info->zone_active_bgs,
4035					       struct btrfs_block_group,
4036					       active_bg_list);
4037		list_del_init(&block_group->active_bg_list);
4038		btrfs_put_block_group(block_group);
4039	}
4040	spin_unlock(&info->zone_active_bgs_lock);
4041
4042	write_lock(&info->block_group_cache_lock);
4043	while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
4044		block_group = rb_entry(n, struct btrfs_block_group,
4045				       cache_node);
4046		rb_erase_cached(&block_group->cache_node,
4047				&info->block_group_cache_tree);
4048		RB_CLEAR_NODE(&block_group->cache_node);
4049		write_unlock(&info->block_group_cache_lock);
4050
4051		down_write(&block_group->space_info->groups_sem);
4052		list_del(&block_group->list);
4053		up_write(&block_group->space_info->groups_sem);
4054
4055		/*
4056		 * We haven't cached this block group, which means we could
4057		 * possibly have excluded extents on this block group.
4058		 */
4059		if (block_group->cached == BTRFS_CACHE_NO ||
4060		    block_group->cached == BTRFS_CACHE_ERROR)
4061			btrfs_free_excluded_extents(block_group);
4062
4063		btrfs_remove_free_space_cache(block_group);
4064		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
4065		ASSERT(list_empty(&block_group->dirty_list));
4066		ASSERT(list_empty(&block_group->io_list));
4067		ASSERT(list_empty(&block_group->bg_list));
4068		ASSERT(refcount_read(&block_group->refs) == 1);
4069		ASSERT(block_group->swap_extents == 0);
4070		btrfs_put_block_group(block_group);
4071
4072		write_lock(&info->block_group_cache_lock);
4073	}
4074	write_unlock(&info->block_group_cache_lock);
4075
4076	btrfs_release_global_block_rsv(info);
4077
4078	while (!list_empty(&info->space_info)) {
4079		space_info = list_entry(info->space_info.next,
4080					struct btrfs_space_info,
4081					list);
4082
4083		/*
4084		 * Do not hide this behind enospc_debug, this is actually
4085		 * important and indicates a real bug if this happens.
4086		 */
4087		if (WARN_ON(space_info->bytes_pinned > 0 ||
4088			    space_info->bytes_may_use > 0))
4089			btrfs_dump_space_info(info, space_info, 0, 0);
4090
4091		/*
4092		 * If there was a failure to cleanup a log tree, very likely due
4093		 * to an IO failure on a writeback attempt of one or more of its
4094		 * extent buffers, we could not do proper (and cheap) unaccounting
4095		 * of their reserved space, so don't warn on bytes_reserved > 0 in
4096		 * that case.
4097		 */
4098		if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
4099		    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
4100			if (WARN_ON(space_info->bytes_reserved > 0))
4101				btrfs_dump_space_info(info, space_info, 0, 0);
4102		}
4103
4104		WARN_ON(space_info->reclaim_size > 0);
4105		list_del(&space_info->list);
4106		btrfs_sysfs_remove_space_info(space_info);
4107	}
4108	return 0;
4109}
4110
4111void btrfs_freeze_block_group(struct btrfs_block_group *cache)
4112{
4113	atomic_inc(&cache->frozen);
4114}
4115
4116void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
4117{
4118	struct btrfs_fs_info *fs_info = block_group->fs_info;
4119	struct extent_map_tree *em_tree;
4120	struct extent_map *em;
4121	bool cleanup;
4122
4123	spin_lock(&block_group->lock);
4124	cleanup = (atomic_dec_and_test(&block_group->frozen) &&
4125		   block_group->removed);
4126	spin_unlock(&block_group->lock);
4127
4128	if (cleanup) {
4129		em_tree = &fs_info->mapping_tree;
4130		write_lock(&em_tree->lock);
4131		em = lookup_extent_mapping(em_tree, block_group->start,
4132					   1);
4133		BUG_ON(!em); /* logic error, can't happen */
4134		remove_extent_mapping(em_tree, em);
4135		write_unlock(&em_tree->lock);
4136
4137		/* once for us and once for the tree */
4138		free_extent_map(em);
4139		free_extent_map(em);
4140
4141		/*
4142		 * We may have left one free space entry and other possible
4143		 * tasks trimming this block group have left 1 entry each one.
4144		 * Free them if any.
4145		 */
4146		__btrfs_remove_free_space_cache(block_group->free_space_ctl);
4147	}
4148}
4149
4150bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4151{
4152	bool ret = true;
4153
4154	spin_lock(&bg->lock);
4155	if (bg->ro)
4156		ret = false;
4157	else
4158		bg->swap_extents++;
4159	spin_unlock(&bg->lock);
4160
4161	return ret;
4162}
4163
4164void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4165{
4166	spin_lock(&bg->lock);
4167	ASSERT(!bg->ro);
4168	ASSERT(bg->swap_extents >= amount);
4169	bg->swap_extents -= amount;
4170	spin_unlock(&bg->lock);
4171}
Configure Feed

Configure Feed