fs/btrfs/defrag.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / btrfs / defrag.c
at master 42 kB view raw
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include "ctree.h"
   8#include "disk-io.h"
   9#include "transaction.h"
  10#include "locking.h"
  11#include "accessors.h"
  12#include "messages.h"
  13#include "delalloc-space.h"
  14#include "subpage.h"
  15#include "defrag.h"
  16#include "file-item.h"
  17#include "super.h"
  18#include "compression.h"
  19
  20static struct kmem_cache *btrfs_inode_defrag_cachep;
  21
  22/*
  23 * When auto defrag is enabled we queue up these defrag structs to remember
  24 * which inodes need defragging passes.
  25 */
  26struct inode_defrag {
  27	struct rb_node rb_node;
  28	/* Inode number */
  29	u64 ino;
  30	/*
  31	 * Transid where the defrag was added, we search for extents newer than
  32	 * this.
  33	 */
  34	u64 transid;
  35
  36	/* Root objectid */
  37	u64 root;
  38
  39	/*
  40	 * The extent size threshold for autodefrag.
  41	 *
  42	 * This value is different for compressed/non-compressed extents, thus
  43	 * needs to be passed from higher layer.
  44	 * (aka, inode_should_defrag())
  45	 */
  46	u32 extent_thresh;
  47};
  48
  49static int compare_inode_defrag(const struct inode_defrag *defrag1,
  50				const struct inode_defrag *defrag2)
  51{
  52	if (defrag1->root > defrag2->root)
  53		return 1;
  54	else if (defrag1->root < defrag2->root)
  55		return -1;
  56	else if (defrag1->ino > defrag2->ino)
  57		return 1;
  58	else if (defrag1->ino < defrag2->ino)
  59		return -1;
  60	else
  61		return 0;
  62}
  63
  64static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing)
  65{
  66	const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node);
  67	const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node);
  68
  69	return compare_inode_defrag(new_defrag, existing_defrag);
  70}
  71
  72/*
  73 * Insert a record for an inode into the defrag tree.  The lock must be held
  74 * already.
  75 *
  76 * If you're inserting a record for an older transid than an existing record,
  77 * the transid already in the tree is lowered.
  78 */
  79static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
  80				     struct inode_defrag *defrag)
  81{
  82	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  83	struct rb_node *node;
  84
  85	node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp);
  86	if (node) {
  87		struct inode_defrag *entry;
  88
  89		entry = rb_entry(node, struct inode_defrag, rb_node);
  90		/*
  91		 * If we're reinserting an entry for an old defrag run, make
  92		 * sure to lower the transid of our existing record.
  93		 */
  94		if (defrag->transid < entry->transid)
  95			entry->transid = defrag->transid;
  96		entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh);
  97		return -EEXIST;
  98	}
  99	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
 100	return 0;
 101}
 102
 103static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info)
 104{
 105	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
 106		return false;
 107
 108	if (btrfs_fs_closing(fs_info))
 109		return false;
 110
 111	return true;
 112}
 113
 114/*
 115 * Insert a defrag record for this inode if auto defrag is enabled. No errors
 116 * returned as they're not considered fatal.
 117 */
 118void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
 119{
 120	struct btrfs_root *root = inode->root;
 121	struct btrfs_fs_info *fs_info = root->fs_info;
 122	struct inode_defrag *defrag;
 123	int ret;
 124
 125	if (!need_auto_defrag(fs_info))
 126		return;
 127
 128	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
 129		return;
 130
 131	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
 132	if (!defrag)
 133		return;
 134
 135	defrag->ino = btrfs_ino(inode);
 136	defrag->transid = btrfs_get_root_last_trans(root);
 137	defrag->root = btrfs_root_id(root);
 138	defrag->extent_thresh = extent_thresh;
 139
 140	spin_lock(&fs_info->defrag_inodes_lock);
 141	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
 142		/*
 143		 * If we set IN_DEFRAG flag and evict the inode from memory,
 144		 * and then re-read this inode, this new inode doesn't have
 145		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
 146		 */
 147		ret = btrfs_insert_inode_defrag(inode, defrag);
 148		if (ret)
 149			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 150	} else {
 151		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 152	}
 153	spin_unlock(&fs_info->defrag_inodes_lock);
 154}
 155
 156/*
 157 * Pick the defraggable inode that we want, if it doesn't exist, we will get the
 158 * next one.
 159 */
 160static struct inode_defrag *btrfs_pick_defrag_inode(
 161			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
 162{
 163	struct inode_defrag *entry = NULL;
 164	struct inode_defrag tmp;
 165	struct rb_node *p;
 166	struct rb_node *parent = NULL;
 167	int ret;
 168
 169	tmp.ino = ino;
 170	tmp.root = root;
 171
 172	spin_lock(&fs_info->defrag_inodes_lock);
 173	p = fs_info->defrag_inodes.rb_node;
 174	while (p) {
 175		parent = p;
 176		entry = rb_entry(parent, struct inode_defrag, rb_node);
 177
 178		ret = compare_inode_defrag(&tmp, entry);
 179		if (ret < 0)
 180			p = parent->rb_left;
 181		else if (ret > 0)
 182			p = parent->rb_right;
 183		else
 184			goto out;
 185	}
 186
 187	if (parent && compare_inode_defrag(&tmp, entry) > 0) {
 188		parent = rb_next(parent);
 189		entry = rb_entry_safe(parent, struct inode_defrag, rb_node);
 190	}
 191out:
 192	if (entry)
 193		rb_erase(parent, &fs_info->defrag_inodes);
 194	spin_unlock(&fs_info->defrag_inodes_lock);
 195	return entry;
 196}
 197
 198void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 199{
 200	struct inode_defrag *defrag, *next;
 201
 202	spin_lock(&fs_info->defrag_inodes_lock);
 203
 204	rbtree_postorder_for_each_entry_safe(defrag, next,
 205					     &fs_info->defrag_inodes, rb_node)
 206		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 207
 208	fs_info->defrag_inodes = RB_ROOT;
 209
 210	spin_unlock(&fs_info->defrag_inodes_lock);
 211}
 212
 213#define BTRFS_DEFRAG_BATCH	1024
 214
 215static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 216				  struct inode_defrag *defrag,
 217				  struct file_ra_state *ra)
 218{
 219	struct btrfs_root *inode_root;
 220	struct btrfs_inode *inode;
 221	struct btrfs_ioctl_defrag_range_args range;
 222	int ret = 0;
 223	u64 cur = 0;
 224
 225again:
 226	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
 227		goto cleanup;
 228	if (!need_auto_defrag(fs_info))
 229		goto cleanup;
 230
 231	/* Get the inode */
 232	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
 233	if (IS_ERR(inode_root)) {
 234		ret = PTR_ERR(inode_root);
 235		goto cleanup;
 236	}
 237
 238	inode = btrfs_iget(defrag->ino, inode_root);
 239	btrfs_put_root(inode_root);
 240	if (IS_ERR(inode)) {
 241		ret = PTR_ERR(inode);
 242		goto cleanup;
 243	}
 244
 245	if (cur >= i_size_read(&inode->vfs_inode)) {
 246		iput(&inode->vfs_inode);
 247		goto cleanup;
 248	}
 249
 250	/* Do a chunk of defrag */
 251	clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
 252	memset(&range, 0, sizeof(range));
 253	range.len = (u64)-1;
 254	range.start = cur;
 255	range.extent_thresh = defrag->extent_thresh;
 256	file_ra_state_init(ra, inode->vfs_inode.i_mapping);
 257
 258	scoped_guard(super_write, fs_info->sb)
 259		ret = btrfs_defrag_file(inode, ra, &range,
 260					defrag->transid, BTRFS_DEFRAG_BATCH);
 261	iput(&inode->vfs_inode);
 262
 263	if (ret < 0)
 264		goto cleanup;
 265
 266	cur = max(cur + fs_info->sectorsize, range.start);
 267	goto again;
 268
 269cleanup:
 270	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 271	return ret;
 272}
 273
 274/*
 275 * Run through the list of inodes in the FS that need defragging.
 276 */
 277int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 278{
 279	struct inode_defrag *defrag;
 280	u64 first_ino = 0;
 281	u64 root_objectid = 0;
 282
 283	atomic_inc(&fs_info->defrag_running);
 284	while (1) {
 285		struct file_ra_state ra = { 0 };
 286
 287		/* Pause the auto defragger. */
 288		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
 289			break;
 290
 291		if (!need_auto_defrag(fs_info))
 292			break;
 293
 294		/* find an inode to defrag */
 295		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
 296		if (!defrag) {
 297			if (root_objectid || first_ino) {
 298				root_objectid = 0;
 299				first_ino = 0;
 300				continue;
 301			} else {
 302				break;
 303			}
 304		}
 305
 306		first_ino = defrag->ino + 1;
 307		root_objectid = defrag->root;
 308
 309		btrfs_run_defrag_inode(fs_info, defrag, &ra);
 310	}
 311	atomic_dec(&fs_info->defrag_running);
 312
 313	/*
 314	 * During unmount, we use the transaction_wait queue to wait for the
 315	 * defragger to stop.
 316	 */
 317	wake_up(&fs_info->transaction_wait);
 318	return 0;
 319}
 320
 321/*
 322 * Check if two blocks addresses are close, used by defrag.
 323 */
 324static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
 325{
 326	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
 327		return true;
 328	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
 329		return true;
 330	return false;
 331}
 332
 333/*
 334 * Go through all the leaves pointed to by a node and reallocate them so that
 335 * disk order is close to key order.
 336 */
 337static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 338			      struct btrfs_root *root,
 339			      struct extent_buffer *parent,
 340			      int start_slot, u64 *last_ret,
 341			      struct btrfs_key *progress)
 342{
 343	struct btrfs_fs_info *fs_info = root->fs_info;
 344	const u32 blocksize = fs_info->nodesize;
 345	const int end_slot = btrfs_header_nritems(parent) - 1;
 346	u64 search_start = *last_ret;
 347	u64 last_block = 0;
 348	int ret = 0;
 349	bool progress_passed = false;
 350
 351	/*
 352	 * COWing must happen through a running transaction, which always
 353	 * matches the current fs generation (it's a transaction with a state
 354	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
 355	 * into error state to prevent the commit of any transaction.
 356	 */
 357	if (unlikely(trans->transaction != fs_info->running_transaction ||
 358		     trans->transid != fs_info->generation)) {
 359		btrfs_abort_transaction(trans, -EUCLEAN);
 360		btrfs_crit(fs_info,
 361"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
 362			   parent->start, btrfs_root_id(root), trans->transid,
 363			   fs_info->running_transaction->transid,
 364			   fs_info->generation);
 365		return -EUCLEAN;
 366	}
 367
 368	if (btrfs_header_nritems(parent) <= 1)
 369		return 0;
 370
 371	for (int i = start_slot; i <= end_slot; i++) {
 372		struct extent_buffer *cur;
 373		struct btrfs_disk_key disk_key;
 374		u64 blocknr;
 375		u64 other;
 376		bool close = true;
 377
 378		btrfs_node_key(parent, &disk_key, i);
 379		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
 380			continue;
 381
 382		progress_passed = true;
 383		blocknr = btrfs_node_blockptr(parent, i);
 384		if (last_block == 0)
 385			last_block = blocknr;
 386
 387		if (i > 0) {
 388			other = btrfs_node_blockptr(parent, i - 1);
 389			close = close_blocks(blocknr, other, blocksize);
 390		}
 391		if (!close && i < end_slot) {
 392			other = btrfs_node_blockptr(parent, i + 1);
 393			close = close_blocks(blocknr, other, blocksize);
 394		}
 395		if (close) {
 396			last_block = blocknr;
 397			continue;
 398		}
 399
 400		cur = btrfs_read_node_slot(parent, i);
 401		if (IS_ERR(cur))
 402			return PTR_ERR(cur);
 403		if (search_start == 0)
 404			search_start = last_block;
 405
 406		btrfs_tree_lock(cur);
 407		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
 408					    &cur, search_start,
 409					    min(16 * blocksize,
 410						(end_slot - i) * blocksize),
 411					    BTRFS_NESTING_COW);
 412		if (ret) {
 413			btrfs_tree_unlock(cur);
 414			free_extent_buffer(cur);
 415			break;
 416		}
 417		search_start = cur->start;
 418		last_block = cur->start;
 419		*last_ret = search_start;
 420		btrfs_tree_unlock(cur);
 421		free_extent_buffer(cur);
 422	}
 423	return ret;
 424}
 425
 426/*
 427 * Defrag all the leaves in a given btree.
 428 * Read all the leaves and try to get key order to
 429 * better reflect disk order
 430 */
 431
 432static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 433			       struct btrfs_root *root)
 434{
 435	struct btrfs_path *path = NULL;
 436	struct btrfs_key key;
 437	int ret = 0;
 438	int wret;
 439	int level;
 440	int next_key_ret = 0;
 441	u64 last_ret = 0;
 442
 443	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 444		goto out;
 445
 446	path = btrfs_alloc_path();
 447	if (!path) {
 448		ret = -ENOMEM;
 449		goto out;
 450	}
 451
 452	level = btrfs_header_level(root->node);
 453
 454	if (level == 0)
 455		goto out;
 456
 457	if (root->defrag_progress.objectid == 0) {
 458		struct extent_buffer *root_node;
 459		u32 nritems;
 460
 461		root_node = btrfs_lock_root_node(root);
 462		nritems = btrfs_header_nritems(root_node);
 463		root->defrag_max.objectid = 0;
 464		/* from above we know this is not a leaf */
 465		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
 466				      nritems - 1);
 467		btrfs_tree_unlock(root_node);
 468		free_extent_buffer(root_node);
 469		memset(&key, 0, sizeof(key));
 470	} else {
 471		memcpy(&key, &root->defrag_progress, sizeof(key));
 472	}
 473
 474	path->keep_locks = true;
 475
 476	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
 477	if (ret < 0)
 478		goto out;
 479	if (ret > 0) {
 480		ret = 0;
 481		goto out;
 482	}
 483	btrfs_release_path(path);
 484	/*
 485	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
 486	 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
 487	 * a deadlock (attempting to write lock an already write locked leaf).
 488	 */
 489	path->lowest_level = 1;
 490	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 491
 492	if (wret < 0) {
 493		ret = wret;
 494		goto out;
 495	}
 496	if (!path->nodes[1]) {
 497		ret = 0;
 498		goto out;
 499	}
 500	/*
 501	 * The node at level 1 must always be locked when our path has
 502	 * keep_locks set and lowest_level is 1, regardless of the value of
 503	 * path->slots[1].
 504	 */
 505	ASSERT(path->locks[1] != 0);
 506	ret = btrfs_realloc_node(trans, root,
 507				 path->nodes[1], 0,
 508				 &last_ret,
 509				 &root->defrag_progress);
 510	if (ret) {
 511		WARN_ON(ret == -EAGAIN);
 512		goto out;
 513	}
 514	/*
 515	 * Now that we reallocated the node we can find the next key. Note that
 516	 * btrfs_find_next_key() can release our path and do another search
 517	 * without COWing, this is because even with path->keep_locks == true,
 518	 * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
 519	 * node when path->slots[node_level - 1] does not point to the last
 520	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
 521	 * we search for the next key after reallocating our node.
 522	 */
 523	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
 524	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
 525					   BTRFS_OLDEST_GENERATION);
 526	if (next_key_ret == 0) {
 527		memcpy(&root->defrag_progress, &key, sizeof(key));
 528		ret = -EAGAIN;
 529	}
 530out:
 531	btrfs_free_path(path);
 532	if (ret == -EAGAIN) {
 533		if (root->defrag_max.objectid > root->defrag_progress.objectid)
 534			goto done;
 535		if (root->defrag_max.type > root->defrag_progress.type)
 536			goto done;
 537		if (root->defrag_max.offset > root->defrag_progress.offset)
 538			goto done;
 539		ret = 0;
 540	}
 541done:
 542	if (ret != -EAGAIN)
 543		memset(&root->defrag_progress, 0,
 544		       sizeof(root->defrag_progress));
 545
 546	return ret;
 547}
 548
 549/*
 550 * Defrag a given btree.  Every leaf in the btree is read and defragmented.
 551 */
 552int btrfs_defrag_root(struct btrfs_root *root)
 553{
 554	struct btrfs_fs_info *fs_info = root->fs_info;
 555	int ret;
 556
 557	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
 558		return 0;
 559
 560	while (1) {
 561		struct btrfs_trans_handle *trans;
 562
 563		trans = btrfs_start_transaction(root, 0);
 564		if (IS_ERR(trans)) {
 565			ret = PTR_ERR(trans);
 566			break;
 567		}
 568
 569		ret = btrfs_defrag_leaves(trans, root);
 570
 571		btrfs_end_transaction(trans);
 572		btrfs_btree_balance_dirty(fs_info);
 573		cond_resched();
 574
 575		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
 576			break;
 577
 578		if (btrfs_defrag_cancelled(fs_info)) {
 579			btrfs_debug(fs_info, "defrag_root cancelled");
 580			ret = -EAGAIN;
 581			break;
 582		}
 583	}
 584	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
 585	return ret;
 586}
 587
 588/*
 589 * Defrag specific helper to get an extent map.
 590 *
 591 * Differences between this and btrfs_get_extent() are:
 592 *
 593 * - No extent_map will be added to inode->extent_tree
 594 *   To reduce memory usage in the long run.
 595 *
 596 * - Extra optimization to skip file extents older than @newer_than
 597 *   By using btrfs_search_forward() we can skip entire file ranges that
 598 *   have extents created in past transactions, because btrfs_search_forward()
 599 *   will not visit leaves and nodes with a generation smaller than given
 600 *   minimal generation threshold (@newer_than).
 601 *
 602 * Return valid em if we find a file extent matching the requirement.
 603 * Return NULL if we can not find a file extent matching the requirement.
 604 *
 605 * Return ERR_PTR() for error.
 606 */
 607static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
 608					    u64 start, u64 newer_than)
 609{
 610	struct btrfs_root *root = inode->root;
 611	struct btrfs_file_extent_item *fi;
 612	struct btrfs_path path = { 0 };
 613	struct extent_map *em;
 614	struct btrfs_key key;
 615	u64 ino = btrfs_ino(inode);
 616	int ret;
 617
 618	em = btrfs_alloc_extent_map();
 619	if (!em) {
 620		ret = -ENOMEM;
 621		goto err;
 622	}
 623
 624	key.objectid = ino;
 625	key.type = BTRFS_EXTENT_DATA_KEY;
 626	key.offset = start;
 627
 628	if (newer_than) {
 629		ret = btrfs_search_forward(root, &key, &path, newer_than);
 630		if (ret < 0)
 631			goto err;
 632		/* Can't find anything newer */
 633		if (ret > 0)
 634			goto not_found;
 635	} else {
 636		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
 637		if (ret < 0)
 638			goto err;
 639	}
 640	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
 641		/*
 642		 * If btrfs_search_slot() makes path to point beyond nritems,
 643		 * we should not have an empty leaf, as this inode must at
 644		 * least have its INODE_ITEM.
 645		 */
 646		ASSERT(btrfs_header_nritems(path.nodes[0]));
 647		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
 648	}
 649	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
 650	/* Perfect match, no need to go one slot back */
 651	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
 652	    key.offset == start)
 653		goto iterate;
 654
 655	/* We didn't find a perfect match, needs to go one slot back */
 656	if (path.slots[0] > 0) {
 657		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
 658		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
 659			path.slots[0]--;
 660	}
 661
 662iterate:
 663	/* Iterate through the path to find a file extent covering @start */
 664	while (true) {
 665		u64 extent_end;
 666
 667		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
 668			goto next;
 669
 670		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
 671
 672		/*
 673		 * We may go one slot back to INODE_REF/XATTR item, then
 674		 * need to go forward until we reach an EXTENT_DATA.
 675		 * But we should still has the correct ino as key.objectid.
 676		 */
 677		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
 678			goto next;
 679
 680		/* It's beyond our target range, definitely not extent found */
 681		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
 682			goto not_found;
 683
 684		/*
 685		 *	|	|<- File extent ->|
 686		 *	\- start
 687		 *
 688		 * This means there is a hole between start and key.offset.
 689		 */
 690		if (key.offset > start) {
 691			em->start = start;
 692			em->disk_bytenr = EXTENT_MAP_HOLE;
 693			em->disk_num_bytes = 0;
 694			em->ram_bytes = 0;
 695			em->offset = 0;
 696			em->len = key.offset - start;
 697			break;
 698		}
 699
 700		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
 701				    struct btrfs_file_extent_item);
 702		extent_end = btrfs_file_extent_end(&path);
 703
 704		/*
 705		 *	|<- file extent ->|	|
 706		 *				\- start
 707		 *
 708		 * We haven't reached start, search next slot.
 709		 */
 710		if (extent_end <= start)
 711			goto next;
 712
 713		/* Now this extent covers @start, convert it to em */
 714		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
 715		break;
 716next:
 717		ret = btrfs_next_item(root, &path);
 718		if (ret < 0)
 719			goto err;
 720		if (ret > 0)
 721			goto not_found;
 722	}
 723	btrfs_release_path(&path);
 724	return em;
 725
 726not_found:
 727	btrfs_release_path(&path);
 728	btrfs_free_extent_map(em);
 729	return NULL;
 730
 731err:
 732	btrfs_release_path(&path);
 733	btrfs_free_extent_map(em);
 734	return ERR_PTR(ret);
 735}
 736
 737static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 738					       u64 newer_than, bool locked)
 739{
 740	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 741	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 742	struct extent_map *em;
 743	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
 744
 745	/*
 746	 * Hopefully we have this extent in the tree already, try without the
 747	 * full extent lock.
 748	 */
 749	read_lock(&em_tree->lock);
 750	em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize);
 751	read_unlock(&em_tree->lock);
 752
 753	/*
 754	 * We can get a merged extent, in that case, we need to re-search
 755	 * tree to get the original em for defrag.
 756	 *
 757	 * This is because even if we have adjacent extents that are contiguous
 758	 * and compatible (same type and flags), we still want to defrag them
 759	 * so that we use less metadata (extent items in the extent tree and
 760	 * file extent items in the inode's subvolume tree).
 761	 */
 762	if (em && (em->flags & EXTENT_FLAG_MERGED)) {
 763		btrfs_free_extent_map(em);
 764		em = NULL;
 765	}
 766
 767	if (!em) {
 768		struct extent_state *cached = NULL;
 769		u64 end = start + sectorsize - 1;
 770
 771		/* Get the big lock and read metadata off disk. */
 772		if (!locked)
 773			btrfs_lock_extent(io_tree, start, end, &cached);
 774		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
 775		if (!locked)
 776			btrfs_unlock_extent(io_tree, start, end, &cached);
 777
 778		if (IS_ERR(em))
 779			return NULL;
 780	}
 781
 782	return em;
 783}
 784
 785static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
 786				   const struct extent_map *em)
 787{
 788	if (btrfs_extent_map_is_compressed(em))
 789		return BTRFS_MAX_COMPRESSED;
 790	return fs_info->max_extent_size;
 791}
 792
 793static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
 794				     u32 extent_thresh, u64 newer_than, bool locked)
 795{
 796	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 797	struct extent_map *next;
 798	bool ret = false;
 799
 800	/* This is the last extent */
 801	if (em->start + em->len >= i_size_read(inode))
 802		return false;
 803
 804	/*
 805	 * Here we need to pass @newer_then when checking the next extent, or
 806	 * we will hit a case we mark current extent for defrag, but the next
 807	 * one will not be a target.
 808	 * This will just cause extra IO without really reducing the fragments.
 809	 */
 810	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
 811	/* No more em or hole */
 812	if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
 813		goto out;
 814	if (next->flags & EXTENT_FLAG_PREALLOC)
 815		goto out;
 816	/*
 817	 * If the next extent is at its max capacity, defragging current extent
 818	 * makes no sense, as the total number of extents won't change.
 819	 */
 820	if (next->len >= get_extent_max_capacity(fs_info, em))
 821		goto out;
 822	/* Skip older extent */
 823	if (next->generation < newer_than)
 824		goto out;
 825	/* Also check extent size */
 826	if (next->len >= extent_thresh)
 827		goto out;
 828
 829	ret = true;
 830out:
 831	btrfs_free_extent_map(next);
 832	return ret;
 833}
 834
 835/*
 836 * Prepare one page to be defragged.
 837 *
 838 * This will ensure:
 839 *
 840 * - Returned page is locked and has been set up properly.
 841 * - No ordered extent exists in the page.
 842 * - The page is uptodate.
 843 *
 844 * NOTE: Caller should also wait for page writeback after the cluster is
 845 * prepared, here we don't do writeback wait for each page.
 846 */
 847static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
 848{
 849	struct address_space *mapping = inode->vfs_inode.i_mapping;
 850	gfp_t mask = btrfs_alloc_write_mask(mapping);
 851	u64 lock_start;
 852	u64 lock_end;
 853	struct extent_state *cached_state = NULL;
 854	struct folio *folio;
 855	int ret;
 856
 857again:
 858	/* TODO: Add order fgp order flags when large folios are fully enabled. */
 859	folio = __filemap_get_folio(mapping, index,
 860				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
 861	if (IS_ERR(folio))
 862		return folio;
 863
 864	/*
 865	 * Since we can defragment files opened read-only, we can encounter
 866	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
 867	 *
 868	 * The IO for such large folios is not fully tested, thus return
 869	 * an error to reject such folios unless it's an experimental build.
 870	 *
 871	 * Filesystem transparent huge pages are typically only used for
 872	 * executables that explicitly enable them, so this isn't very
 873	 * restrictive.
 874	 */
 875	if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
 876		folio_unlock(folio);
 877		folio_put(folio);
 878		return ERR_PTR(-ETXTBSY);
 879	}
 880
 881	ret = set_folio_extent_mapped(folio);
 882	if (ret < 0) {
 883		folio_unlock(folio);
 884		folio_put(folio);
 885		return ERR_PTR(ret);
 886	}
 887
 888	lock_start = folio_pos(folio);
 889	lock_end = folio_next_pos(folio) - 1;
 890	/* Wait for any existing ordered extent in the range */
 891	while (1) {
 892		struct btrfs_ordered_extent *ordered;
 893
 894		btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
 895		ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio));
 896		btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
 897		if (!ordered)
 898			break;
 899
 900		folio_unlock(folio);
 901		btrfs_start_ordered_extent(ordered);
 902		btrfs_put_ordered_extent(ordered);
 903		folio_lock(folio);
 904		/*
 905		 * We unlocked the folio above, so we need check if it was
 906		 * released or not.
 907		 */
 908		if (folio->mapping != mapping || !folio->private) {
 909			folio_unlock(folio);
 910			folio_put(folio);
 911			goto again;
 912		}
 913	}
 914
 915	/*
 916	 * Now the page range has no ordered extent any more.  Read the page to
 917	 * make it uptodate.
 918	 */
 919	if (!folio_test_uptodate(folio)) {
 920		btrfs_read_folio(NULL, folio);
 921		folio_lock(folio);
 922		if (folio->mapping != mapping || !folio->private) {
 923			folio_unlock(folio);
 924			folio_put(folio);
 925			goto again;
 926		}
 927		if (unlikely(!folio_test_uptodate(folio))) {
 928			folio_unlock(folio);
 929			folio_put(folio);
 930			return ERR_PTR(-EIO);
 931		}
 932	}
 933	return folio;
 934}
 935
 936struct defrag_target_range {
 937	struct list_head list;
 938	u64 start;
 939	u64 len;
 940};
 941
 942/*
 943 * Collect all valid target extents.
 944 *
 945 * @start:	   file offset to lookup
 946 * @len:	   length to lookup
 947 * @extent_thresh: file extent size threshold, any extent size >= this value
 948 *		   will be ignored
 949 * @newer_than:    only defrag extents newer than this value
 950 * @do_compress:   whether the defrag is doing compression or no-compression
 951 *		   if true, @extent_thresh will be ignored and all regular
 952 *		   file extents meeting @newer_than will be targets.
 953 * @locked:	   if the range has already held extent lock
 954 * @target_list:   list of targets file extents
 955 */
 956static int defrag_collect_targets(struct btrfs_inode *inode,
 957				  u64 start, u64 len, u32 extent_thresh,
 958				  u64 newer_than, bool do_compress,
 959				  bool locked, struct list_head *target_list,
 960				  u64 *last_scanned_ret)
 961{
 962	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 963	bool last_is_target = false;
 964	u64 cur = start;
 965	int ret = 0;
 966
 967	while (cur < start + len) {
 968		struct extent_map *em;
 969		struct defrag_target_range *new;
 970		bool next_mergeable = true;
 971		u64 range_len;
 972
 973		last_is_target = false;
 974		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
 975		if (!em)
 976			break;
 977
 978		/*
 979		 * If the file extent is an inlined one, we may still want to
 980		 * defrag it (fallthrough) if it will cause a regular extent.
 981		 * This is for users who want to convert inline extents to
 982		 * regular ones through max_inline= mount option.
 983		 */
 984		if (em->disk_bytenr == EXTENT_MAP_INLINE &&
 985		    em->len <= inode->root->fs_info->max_inline)
 986			goto next;
 987
 988		/* Skip holes and preallocated extents. */
 989		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
 990		    (em->flags & EXTENT_FLAG_PREALLOC))
 991			goto next;
 992
 993		/* Skip older extent */
 994		if (em->generation < newer_than)
 995			goto next;
 996
 997		/* This em is under writeback, no need to defrag */
 998		if (em->generation == (u64)-1)
 999			goto next;
1000
1001		/*
1002		 * Our start offset might be in the middle of an existing extent
1003		 * map, so take that into account.
1004		 */
1005		range_len = em->len - (cur - em->start);
1006		/*
1007		 * If this range of the extent map is already flagged for delalloc,
1008		 * skip it, because:
1009		 *
1010		 * 1) We could deadlock later, when trying to reserve space for
1011		 *    delalloc, because in case we can't immediately reserve space
1012		 *    the flusher can start delalloc and wait for the respective
1013		 *    ordered extents to complete. The deadlock would happen
1014		 *    because we do the space reservation while holding the range
1015		 *    locked, and starting writeback, or finishing an ordered
1016		 *    extent, requires locking the range;
1017		 *
1018		 * 2) If there's delalloc there, it means there's dirty pages for
1019		 *    which writeback has not started yet (we clean the delalloc
1020		 *    flag when starting writeback and after creating an ordered
1021		 *    extent). If we mark pages in an adjacent range for defrag,
1022		 *    then we will have a larger contiguous range for delalloc,
1023		 *    very likely resulting in a larger extent after writeback is
1024		 *    triggered (except in a case of free space fragmentation).
1025		 */
1026		if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
1027						EXTENT_DELALLOC))
1028			goto next;
1029
1030		/*
1031		 * For do_compress case, we want to compress all valid file
1032		 * extents, thus no @extent_thresh or mergeable check.
1033		 */
1034		if (do_compress)
1035			goto add;
1036
1037		/* Skip too large extent */
1038		if (em->len >= extent_thresh)
1039			goto next;
1040
1041		/*
1042		 * Skip extents already at its max capacity, this is mostly for
1043		 * compressed extents, which max cap is only 128K.
1044		 */
1045		if (em->len >= get_extent_max_capacity(fs_info, em))
1046			goto next;
1047
1048		/*
1049		 * Normally there are no more extents after an inline one, thus
1050		 * @next_mergeable will normally be false and not defragged.
1051		 * So if an inline extent passed all above checks, just add it
1052		 * for defrag, and be converted to regular extents.
1053		 */
1054		if (em->disk_bytenr == EXTENT_MAP_INLINE)
1055			goto add;
1056
1057		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
1058						extent_thresh, newer_than, locked);
1059		if (!next_mergeable) {
1060			struct defrag_target_range *last;
1061
1062			/* Empty target list, no way to merge with last entry */
1063			if (list_empty(target_list))
1064				goto next;
1065			last = list_last_entry(target_list,
1066					       struct defrag_target_range, list);
1067			/* Not mergeable with last entry */
1068			if (last->start + last->len != cur)
1069				goto next;
1070
1071			/* Mergeable, fall through to add it to @target_list. */
1072		}
1073
1074add:
1075		last_is_target = true;
1076		range_len = min(btrfs_extent_map_end(em), start + len) - cur;
1077		/*
1078		 * This one is a good target, check if it can be merged into
1079		 * last range of the target list.
1080		 */
1081		if (!list_empty(target_list)) {
1082			struct defrag_target_range *last;
1083
1084			last = list_last_entry(target_list,
1085					       struct defrag_target_range, list);
1086			ASSERT(last->start + last->len <= cur);
1087			if (last->start + last->len == cur) {
1088				/* Mergeable, enlarge the last entry */
1089				last->len += range_len;
1090				goto next;
1091			}
1092			/* Fall through to allocate a new entry */
1093		}
1094
1095		/* Allocate new defrag_target_range */
1096		new = kmalloc(sizeof(*new), GFP_NOFS);
1097		if (!new) {
1098			btrfs_free_extent_map(em);
1099			ret = -ENOMEM;
1100			break;
1101		}
1102		new->start = cur;
1103		new->len = range_len;
1104		list_add_tail(&new->list, target_list);
1105
1106next:
1107		cur = btrfs_extent_map_end(em);
1108		btrfs_free_extent_map(em);
1109	}
1110	if (ret < 0) {
1111		struct defrag_target_range *entry;
1112		struct defrag_target_range *tmp;
1113
1114		list_for_each_entry_safe(entry, tmp, target_list, list) {
1115			list_del_init(&entry->list);
1116			kfree(entry);
1117		}
1118	}
1119	if (!ret && last_scanned_ret) {
1120		/*
1121		 * If the last extent is not a target, the caller can skip to
1122		 * the end of that extent.
1123		 * Otherwise, we can only go the end of the specified range.
1124		 */
1125		if (!last_is_target)
1126			*last_scanned_ret = max(cur, *last_scanned_ret);
1127		else
1128			*last_scanned_ret = max(start + len, *last_scanned_ret);
1129	}
1130	return ret;
1131}
1132
1133#define CLUSTER_SIZE	(SZ_256K)
1134static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
1135
1136/*
1137 * Defrag one contiguous target range.
1138 *
1139 * @inode:	target inode
1140 * @target:	target range to defrag
1141 * @pages:	locked pages covering the defrag range
1142 * @nr_pages:	number of locked pages
1143 *
1144 * Caller should ensure:
1145 *
1146 * - Pages are prepared
1147 *   Pages should be locked, no ordered extent in the pages range,
1148 *   no writeback.
1149 *
1150 * - Extent bits are locked
1151 */
1152static int defrag_one_locked_target(struct btrfs_inode *inode,
1153				    struct defrag_target_range *target,
1154				    struct folio **folios, int nr_pages,
1155				    struct extent_state **cached_state)
1156{
1157	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1158	struct extent_changeset *data_reserved = NULL;
1159	const u64 start = target->start;
1160	const u64 len = target->len;
1161	int ret = 0;
1162
1163	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
1164	if (ret < 0)
1165		return ret;
1166	btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1,
1167			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1168			       EXTENT_DEFRAG, cached_state);
1169	btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1,
1170			     EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
1171
1172	/*
1173	 * Update the page status.
1174	 * Due to possible large folios, we have to check all folios one by one.
1175	 */
1176	for (int i = 0; i < nr_pages && folios[i]; i++) {
1177		struct folio *folio = folios[i];
1178
1179		if (!folio)
1180			break;
1181		if (start >= folio_next_pos(folio) ||
1182		    start + len <= folio_pos(folio))
1183			continue;
1184		btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
1185		btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
1186	}
1187	btrfs_delalloc_release_extents(inode, len);
1188	extent_changeset_free(data_reserved);
1189
1190	return ret;
1191}
1192
1193static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
1194			    u32 extent_thresh, u64 newer_than, bool do_compress,
1195			    u64 *last_scanned_ret)
1196{
1197	struct extent_state *cached_state = NULL;
1198	struct defrag_target_range *entry;
1199	struct defrag_target_range *tmp;
1200	LIST_HEAD(target_list);
1201	struct folio **folios;
1202	const u32 sectorsize = inode->root->fs_info->sectorsize;
1203	u64 cur = start;
1204	const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
1205				      (start >> PAGE_SHIFT) + 1;
1206	int ret = 0;
1207
1208	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
1209	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
1210
1211	folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
1212	if (!folios)
1213		return -ENOMEM;
1214
1215	/* Prepare all pages */
1216	for (int i = 0; cur < start + len && i < nr_pages; i++) {
1217		folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT);
1218		if (IS_ERR(folios[i])) {
1219			ret = PTR_ERR(folios[i]);
1220			folios[i] = NULL;
1221			goto free_folios;
1222		}
1223		cur = folio_next_pos(folios[i]);
1224	}
1225	for (int i = 0; i < nr_pages; i++) {
1226		if (!folios[i])
1227			break;
1228		folio_wait_writeback(folios[i]);
1229	}
1230
1231	/* We should get at least one folio. */
1232	ASSERT(folios[0]);
1233	/* Lock the pages range */
1234	btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
1235	/*
1236	 * Now we have a consistent view about the extent map, re-check
1237	 * which range really needs to be defragged.
1238	 *
1239	 * And this time we have extent locked already, pass @locked = true
1240	 * so that we won't relock the extent range and cause deadlock.
1241	 */
1242	ret = defrag_collect_targets(inode, start, len, extent_thresh,
1243				     newer_than, do_compress, true,
1244				     &target_list, last_scanned_ret);
1245	if (ret < 0)
1246		goto unlock_extent;
1247
1248	list_for_each_entry(entry, &target_list, list) {
1249		ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
1250					       &cached_state);
1251		if (ret < 0)
1252			break;
1253	}
1254
1255	list_for_each_entry_safe(entry, tmp, &target_list, list) {
1256		list_del_init(&entry->list);
1257		kfree(entry);
1258	}
1259unlock_extent:
1260	btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
1261free_folios:
1262	for (int i = 0; i < nr_pages; i++) {
1263		if (!folios[i])
1264			break;
1265		folio_unlock(folios[i]);
1266		folio_put(folios[i]);
1267	}
1268	kfree(folios);
1269	return ret;
1270}
1271
1272static int defrag_one_cluster(struct btrfs_inode *inode,
1273			      struct file_ra_state *ra,
1274			      u64 start, u32 len, u32 extent_thresh,
1275			      u64 newer_than, bool do_compress,
1276			      unsigned long *sectors_defragged,
1277			      unsigned long max_sectors,
1278			      u64 *last_scanned_ret)
1279{
1280	const u32 sectorsize = inode->root->fs_info->sectorsize;
1281	struct defrag_target_range *entry;
1282	struct defrag_target_range *tmp;
1283	LIST_HEAD(target_list);
1284	int ret;
1285
1286	ret = defrag_collect_targets(inode, start, len, extent_thresh,
1287				     newer_than, do_compress, false,
1288				     &target_list, NULL);
1289	if (ret < 0)
1290		goto out;
1291
1292	list_for_each_entry(entry, &target_list, list) {
1293		u32 range_len = entry->len;
1294
1295		/* Reached or beyond the limit */
1296		if (max_sectors && *sectors_defragged >= max_sectors) {
1297			ret = 1;
1298			break;
1299		}
1300
1301		if (max_sectors)
1302			range_len = min_t(u32, range_len,
1303				(max_sectors - *sectors_defragged) * sectorsize);
1304
1305		/*
1306		 * If defrag_one_range() has updated last_scanned_ret,
1307		 * our range may already be invalid (e.g. hole punched).
1308		 * Skip if our range is before last_scanned_ret, as there is
1309		 * no need to defrag the range anymore.
1310		 */
1311		if (entry->start + range_len <= *last_scanned_ret)
1312			continue;
1313
1314		page_cache_sync_readahead(inode->vfs_inode.i_mapping,
1315				ra, NULL, entry->start >> PAGE_SHIFT,
1316				((entry->start + range_len - 1) >> PAGE_SHIFT) -
1317				(entry->start >> PAGE_SHIFT) + 1);
1318		/*
1319		 * Here we may not defrag any range if holes are punched before
1320		 * we locked the pages.
1321		 * But that's fine, it only affects the @sectors_defragged
1322		 * accounting.
1323		 */
1324		ret = defrag_one_range(inode, entry->start, range_len,
1325				       extent_thresh, newer_than, do_compress,
1326				       last_scanned_ret);
1327		if (ret < 0)
1328			break;
1329		*sectors_defragged += range_len >>
1330				      inode->root->fs_info->sectorsize_bits;
1331	}
1332out:
1333	list_for_each_entry_safe(entry, tmp, &target_list, list) {
1334		list_del_init(&entry->list);
1335		kfree(entry);
1336	}
1337	if (ret >= 0)
1338		*last_scanned_ret = max(*last_scanned_ret, start + len);
1339	return ret;
1340}
1341
1342/*
1343 * Entry point to file defragmentation.
1344 *
1345 * @inode:	   inode to be defragged
1346 * @ra:		   readahead state
1347 * @range:	   defrag options including range and flags
1348 * @newer_than:	   minimum transid to defrag
1349 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
1350 *		   will be defragged.
1351 *
1352 * Return <0 for error.
1353 * Return >=0 for the number of sectors defragged, and range->start will be updated
1354 * to indicate the file offset where next defrag should be started at.
1355 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
1356 *  defragging all the range).
1357 */
1358int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
1359		      struct btrfs_ioctl_defrag_range_args *range,
1360		      u64 newer_than, unsigned long max_to_defrag)
1361{
1362	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1363	unsigned long sectors_defragged = 0;
1364	u64 isize = i_size_read(&inode->vfs_inode);
1365	u64 cur;
1366	u64 last_byte;
1367	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
1368	bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
1369	int compress_type = BTRFS_COMPRESS_ZLIB;
1370	int compress_level = 0;
1371	int ret = 0;
1372	u32 extent_thresh = range->extent_thresh;
1373	pgoff_t start_index;
1374
1375	ASSERT(ra);
1376
1377	if (isize == 0)
1378		return 0;
1379
1380	if (range->start >= isize)
1381		return -EINVAL;
1382
1383	if (do_compress) {
1384		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) {
1385			if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES)
1386				return -EINVAL;
1387			if (range->compress.type) {
1388				compress_type  = range->compress.type;
1389				compress_level = range->compress.level;
1390				if (!btrfs_compress_level_valid(compress_type, compress_level))
1391					return -EINVAL;
1392			}
1393		} else {
1394			if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
1395				return -EINVAL;
1396			if (range->compress_type)
1397				compress_type = range->compress_type;
1398		}
1399	} else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
1400		compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
1401		compress_level = 1;
1402	}
1403
1404	if (extent_thresh == 0)
1405		extent_thresh = SZ_256K;
1406
1407	if (range->start + range->len > range->start) {
1408		/* Got a specific range */
1409		last_byte = min(isize, range->start + range->len);
1410	} else {
1411		/* Defrag until file end */
1412		last_byte = isize;
1413	}
1414
1415	/* Align the range */
1416	cur = round_down(range->start, fs_info->sectorsize);
1417	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
1418
1419	/*
1420	 * Make writeback start from the beginning of the range, so that the
1421	 * defrag range can be written sequentially.
1422	 */
1423	start_index = cur >> PAGE_SHIFT;
1424	if (start_index < inode->vfs_inode.i_mapping->writeback_index)
1425		inode->vfs_inode.i_mapping->writeback_index = start_index;
1426
1427	while (cur < last_byte) {
1428		const unsigned long prev_sectors_defragged = sectors_defragged;
1429		u64 last_scanned = cur;
1430		u64 cluster_end;
1431
1432		if (btrfs_defrag_cancelled(fs_info)) {
1433			ret = -EAGAIN;
1434			break;
1435		}
1436
1437		/* We want the cluster end at page boundary when possible */
1438		cluster_end = (((cur >> PAGE_SHIFT) +
1439			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
1440		cluster_end = min(cluster_end, last_byte);
1441
1442		btrfs_inode_lock(inode, 0);
1443		if (IS_SWAPFILE(&inode->vfs_inode)) {
1444			ret = -ETXTBSY;
1445			btrfs_inode_unlock(inode, 0);
1446			break;
1447		}
1448		if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) {
1449			btrfs_inode_unlock(inode, 0);
1450			break;
1451		}
1452		if (do_compress || no_compress) {
1453			inode->defrag_compress = compress_type;
1454			inode->defrag_compress_level = compress_level;
1455		}
1456		ret = defrag_one_cluster(inode, ra, cur,
1457				cluster_end + 1 - cur, extent_thresh,
1458				newer_than, do_compress || no_compress,
1459				&sectors_defragged,
1460				max_to_defrag, &last_scanned);
1461
1462		if (sectors_defragged > prev_sectors_defragged)
1463			balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping);
1464
1465		btrfs_inode_unlock(inode, 0);
1466		if (ret < 0)
1467			break;
1468		cur = max(cluster_end + 1, last_scanned);
1469		if (ret > 0) {
1470			ret = 0;
1471			break;
1472		}
1473		cond_resched();
1474	}
1475
1476	/*
1477	 * Update range.start for autodefrag, this will indicate where to start
1478	 * in next run.
1479	 */
1480	range->start = cur;
1481	if (sectors_defragged) {
1482		/*
1483		 * We have defragged some sectors, for compression case they
1484		 * need to be written back immediately.
1485		 */
1486		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
1487			filemap_flush(inode->vfs_inode.i_mapping);
1488			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1489				     &inode->runtime_flags))
1490				filemap_flush(inode->vfs_inode.i_mapping);
1491		}
1492		if (range->compress_type == BTRFS_COMPRESS_LZO)
1493			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
1494		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
1495			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
1496		ret = sectors_defragged;
1497	}
1498	if (do_compress || no_compress) {
1499		btrfs_inode_lock(inode, 0);
1500		inode->defrag_compress = BTRFS_COMPRESS_NONE;
1501		btrfs_inode_unlock(inode, 0);
1502	}
1503	return ret;
1504}
1505
1506void __cold btrfs_auto_defrag_exit(void)
1507{
1508	kmem_cache_destroy(btrfs_inode_defrag_cachep);
1509}
1510
1511int __init btrfs_auto_defrag_init(void)
1512{
1513	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
1514					sizeof(struct inode_defrag), 0, 0, NULL);
1515	if (!btrfs_inode_defrag_cachep)
1516		return -ENOMEM;
1517
1518	return 0;
1519}