// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file. To support that, a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming. Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location. To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad. This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE       SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH  2
struct xfs_zone_scratch {
        struct folio            *folio;
        unsigned int            offset;
        unsigned int            freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
        struct xfs_zone_gc_data *data;

        /*
         * Entry into the reading/writing/resetting list. Only accessed from
         * the GC thread, so no locking needed.
         */
        struct list_head        entry;

        /*
         * State of this gc_bio. Done means the current I/O completed.
         * Set from the bio end I/O handler, read from the GC thread.
         */
        enum {
                XFS_GC_BIO_NEW,
                XFS_GC_BIO_DONE,
        } state;

        /*
         * Pointer to the inode and byte range in the inode that this
         * GC chunk is operating on.
         */
        struct xfs_inode        *ip;
        loff_t                  offset;
        unsigned int            len;

        /*
         * Existing startblock (in the zone to be freed) and newly assigned
         * daddr in the zone GCed into.
         */
        xfs_fsblock_t           old_startblock;
        xfs_daddr_t             new_daddr;
        struct xfs_zone_scratch *scratch;

        /* Are we writing to a sequential write required zone? */
        bool                    is_seq;

        /* Open Zone being written to */
        struct xfs_open_zone    *oz;

        struct xfs_rtgroup      *victim_rtg;

        /* Bio used for reads and writes, including the bvec used by it */
        struct bio              bio;    /* must be last */
};

#define XFS_ZONE_GC_RECS        1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
        struct xfs_rtgroup      *victim_rtg;
        unsigned int            rec_count;
        unsigned int            rec_idx;
        xfs_agblock_t           next_startblock;
        struct xfs_rmap_irec    *recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
        struct xfs_mount        *mp;

        /* bioset used to allocate the gc_bios */
        struct bio_set          bio_set;

        /*
         * Scratchpad used, and index to indicate which one is used.
         */
        struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
        unsigned int            scratch_idx;

        /*
         * List of bios currently being read, written and reset.
         * These lists are only accessed by the GC thread itself, and must only
         * be processed in order.
         */
        struct list_head        reading;
        struct list_head        writing;
        struct list_head        resetting;

        /*
         * Iterator for the victim zone.
         */
        struct xfs_zone_gc_iter iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes. Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
        struct xfs_mount        *mp)
{
        s64                     available, free, threshold;
        s32                     remainder;

        if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
                return false;

        available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

        if (available <
            xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
                return true;

        free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

        threshold = div_s64_rem(free, 100, &remainder);
        threshold = threshold * mp->m_zonegc_low_space +
                    remainder * div_s64(mp->m_zonegc_low_space, 100);

        if (available < threshold)
                return true;

        return false;
}
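
/*
 * Illustration of the threshold math above, with made-up numbers: for
 * free = 1,000,050 blocks and m_zonegc_low_space = 25 (percent),
 * div_s64_rem() yields 10,000 with a remainder of 50, so
 *
 *      threshold = 10,000 * 25 + 50 * (25 / 100) = 250,000
 *
 * i.e. GC also kicks in once less than about a quarter of the unused blocks
 * are still available.  The remainder term only contributes once the tunable
 * reaches 100, at which point the threshold is exactly "free".
 */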

static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
        struct xfs_mount        *mp)
{
        struct xfs_zone_gc_data *data;
        int                     i;

        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
                return NULL;
        data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
                        GFP_KERNEL);
        if (!data->iter.recs)
                goto out_free_data;

        /*
         * We actually only need a single bio_vec. It would be nice to have
         * a flag that only allocates the inline bvecs and not the separate
         * bvec pool.
         */
        if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
                        BIOSET_NEED_BVECS))
                goto out_free_recs;
        for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
                data->scratch[i].folio =
                        folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
                if (!data->scratch[i].folio)
                        goto out_free_scratch;
        }
        INIT_LIST_HEAD(&data->reading);
        INIT_LIST_HEAD(&data->writing);
        INIT_LIST_HEAD(&data->resetting);
        data->mp = mp;
        return data;

out_free_scratch:
        while (--i >= 0)
                folio_put(data->scratch[i].folio);
        bioset_exit(&data->bio_set);
out_free_recs:
        kfree(data->iter.recs);
out_free_data:
        kfree(data);
        return NULL;
}

static void
xfs_zone_gc_data_free(
        struct xfs_zone_gc_data *data)
{
        int                     i;

        for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
                folio_put(data->scratch[i].folio);
        bioset_exit(&data->bio_set);
        kfree(data->iter.recs);
        kfree(data);
}

static void
xfs_zone_gc_iter_init(
        struct xfs_zone_gc_iter *iter,
        struct xfs_rtgroup      *victim_rtg)
{
        iter->next_startblock = 0;
        iter->rec_count = 0;
        iter->rec_idx = 0;
        iter->victim_rtg = victim_rtg;
        atomic_inc(&victim_rtg->rtg_gccount);
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
        struct xfs_btree_cur    *cur,
        const struct xfs_rmap_irec *irec,
        void                    *private)
{
        struct xfs_zone_gc_iter *iter = private;

        ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
        ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
        ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

        iter->recs[iter->rec_count] = *irec;
        if (++iter->rec_count == XFS_ZONE_GC_RECS) {
                iter->next_startblock =
                        irec->rm_startblock + irec->rm_blockcount;
                return 1;
        }
        return 0;
}

static int
xfs_zone_gc_rmap_rec_cmp(
        const void              *a,
        const void              *b)
{
        const struct xfs_rmap_irec *reca = a;
        const struct xfs_rmap_irec *recb = b;
        int                     diff;

        diff = cmp_int(reca->rm_owner, recb->rm_owner);
        if (diff)
                return diff;
        return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
        struct xfs_mount        *mp,
        struct xfs_zone_gc_iter *iter)
{
        struct xfs_rtgroup      *rtg = iter->victim_rtg;
        struct xfs_rmap_irec    ri_low = { };
        struct xfs_rmap_irec    ri_high;
        struct xfs_btree_cur    *cur;
        struct xfs_trans        *tp;
        int                     error;

        ASSERT(iter->next_startblock <= rtg_blocks(rtg));
        if (iter->next_startblock == rtg_blocks(rtg))
                goto done;

        ASSERT(iter->next_startblock < rtg_blocks(rtg));
        ri_low.rm_startblock = iter->next_startblock;
        memset(&ri_high, 0xFF, sizeof(ri_high));

        iter->rec_idx = 0;
        iter->rec_count = 0;

        tp = xfs_trans_alloc_empty(mp);
        xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
        cur = xfs_rtrmapbt_init_cursor(tp, rtg);
        error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
                        xfs_zone_gc_query_cb, iter);
        xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
        xfs_btree_del_cursor(cur, error < 0 ? error : 0);
        xfs_trans_cancel(tp);

        if (error < 0)
                return error;

        /*
         * Sort the rmap records by inode number and increasing offset to
         * defragment the mappings.
         *
         * This could be further enhanced by an even bigger look ahead window,
         * but that's better left until we have better detection of changes to
         * inode mapping to avoid the potential of GCing already dead data.
         */
        sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
                        xfs_zone_gc_rmap_rec_cmp, NULL);

        if (error == 0) {
                /*
                 * We finished iterating through the zone.
                 */
                iter->next_startblock = rtg_blocks(rtg);
                if (iter->rec_count == 0)
                        goto done;
        }

        return 0;
done:
        atomic_dec(&iter->victim_rtg->rtg_gccount);
        xfs_rtgroup_rele(iter->victim_rtg);
        iter->victim_rtg = NULL;
        return 0;
}

static bool
xfs_zone_gc_iter_next(
        struct xfs_mount        *mp,
        struct xfs_zone_gc_iter *iter,
        struct xfs_rmap_irec    *chunk_rec,
        struct xfs_inode        **ipp)
{
        struct xfs_rmap_irec    *irec;
        int                     error;

        if (!iter->victim_rtg)
                return false;

retry:
        if (iter->rec_idx == iter->rec_count) {
                error = xfs_zone_gc_query(mp, iter);
                if (error)
                        goto fail;
                if (!iter->victim_rtg)
                        return false;
        }

        irec = &iter->recs[iter->rec_idx];
        error = xfs_iget(mp, NULL, irec->rm_owner,
                        XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
        if (error) {
                /*
                 * If the inode was already deleted, skip over it.
                 */
                if (error == -ENOENT) {
                        iter->rec_idx++;
                        goto retry;
                }
                goto fail;
        }

        if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
                iter->rec_idx++;
                xfs_irele(*ipp);
                goto retry;
        }

        *chunk_rec = *irec;
        return true;

fail:
        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        return false;
}

static void
xfs_zone_gc_iter_advance(
        struct xfs_zone_gc_iter *iter,
        xfs_extlen_t            count_fsb)
{
        struct xfs_rmap_irec    *irec = &iter->recs[iter->rec_idx];

        irec->rm_offset += count_fsb;
        irec->rm_startblock += count_fsb;
        irec->rm_blockcount -= count_fsb;
        if (!irec->rm_blockcount)
                iter->rec_idx++;
}

static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
        struct xfs_mount        *mp,
        uint32_t                bucket)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;
        uint32_t                victim_used = U32_MAX;
        struct xfs_rtgroup      *victim_rtg = NULL;
        uint32_t                bit;

        if (!zi->zi_used_bucket_entries[bucket])
                return NULL;

        for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
                        mp->m_sb.sb_rgcount) {
                struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

                if (!rtg)
                        continue;

                /*
                 * If the zone is already undergoing GC, don't pick it again.
                 *
                 * This prevents us from picking one of the zones for which we
                 * already submitted GC I/O, but for which the remapping hasn't
                 * concluded yet. This won't cause data corruption, but
                 * increases write amplification and slows down GC, so this is
                 * a bad thing.
                 */
                if (atomic_read(&rtg->rtg_gccount)) {
                        xfs_rtgroup_rele(rtg);
                        continue;
                }

                /* skip zones that are just waiting for a reset */
                if (rtg_rmap(rtg)->i_used_blocks == 0 ||
                    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
                        xfs_rtgroup_rele(rtg);
                        continue;
                }

                if (victim_rtg)
                        xfs_rtgroup_rele(victim_rtg);
                victim_rtg = rtg;
                victim_used = rtg_rmap(rtg)->i_used_blocks;

                /*
                 * Any zone that is less than 1 percent used is fair game for
                 * instant reclaim. All of these zones are in the last
                 * bucket, so avoid the expensive division for the zones
                 * in the other buckets.
                 */
                if (bucket == 0 &&
                    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
                        break;
        }

        return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
        struct xfs_zone_gc_data *data)
{
        struct xfs_zone_gc_iter *iter = &data->iter;
        struct xfs_mount        *mp = data->mp;
        struct xfs_zone_info    *zi = mp->m_zone_info;
        struct xfs_rtgroup      *victim_rtg = NULL;
        unsigned int            bucket;

        spin_lock(&zi->zi_used_buckets_lock);
        for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
                victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
                if (victim_rtg)
                        break;
        }
        spin_unlock(&zi->zi_used_buckets_lock);

        if (!victim_rtg)
                return false;

        trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
        xfs_zone_gc_iter_init(iter, victim_rtg);
        return true;
}

static struct xfs_open_zone *
xfs_zone_gc_steal_open(
        struct xfs_zone_info    *zi)
{
        struct xfs_open_zone    *oz, *found = NULL;

        spin_lock(&zi->zi_open_zones_lock);
        list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
                if (!found || oz->oz_allocated < found->oz_allocated)
                        found = oz;
        }

        if (found) {
                found->oz_is_gc = true;
                list_del_init(&found->oz_entry);
                zi->zi_nr_open_zones--;
        }

        spin_unlock(&zi->zi_open_zones_lock);
        return found;
}

static struct xfs_open_zone *
xfs_zone_gc_select_target(
        struct xfs_mount        *mp)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;
        struct xfs_open_zone    *oz = zi->zi_open_gc_zone;

        /*
         * We need to wait for pending writes to finish.
         */
        if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
                return NULL;

        ASSERT(zi->zi_nr_open_zones <=
                mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
        oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
        if (oz)
                trace_xfs_zone_gc_target_opened(oz->oz_rtg);
        spin_lock(&zi->zi_open_zones_lock);
        zi->zi_open_gc_zone = oz;
        spin_unlock(&zi->zi_open_zones_lock);
        return oz;
}
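
/*
 * Example of the open zone budget (illustrative numbers, assuming
 * XFS_OPEN_GC_ZONES is 1): with m_max_open_zones = 6, user data placement
 * may keep up to 5 zones open while the remaining slot is held back for the
 * GC target selected above, which is what the ASSERT in
 * xfs_zone_gc_select_target() checks via
 * m_max_open_zones - XFS_OPEN_GC_ZONES.
 */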

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space, keep writing to it; else first wait
 * for all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
        struct xfs_mount        *mp)
{
        struct xfs_open_zone    *oz = mp->m_zone_info->zi_open_gc_zone;

        if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
                return xfs_zone_gc_select_target(mp);
        return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
        struct xfs_zone_gc_data *data)
{
        return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
        struct xfs_zone_gc_data *data)
{
        struct xfs_open_zone    *oz;

        oz = xfs_zone_gc_ensure_target(data->mp);
        if (!oz)
                return false;
        return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
                xfs_zone_gc_scratch_available(data);
}

static void
xfs_zone_gc_end_io(
        struct bio              *bio)
{
        struct xfs_gc_bio       *chunk =
                container_of(bio, struct xfs_gc_bio, bio);
        struct xfs_zone_gc_data *data = chunk->data;

        WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
        wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
        struct xfs_zone_gc_data *data,
        xfs_extlen_t            *count_fsb,
        xfs_daddr_t             *daddr,
        bool                    *is_seq)
{
        struct xfs_mount        *mp = data->mp;
        struct xfs_open_zone    *oz;

        oz = xfs_zone_gc_ensure_target(mp);
        if (!oz)
                return NULL;

        *count_fsb = min(*count_fsb,
                XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

        /*
         * Directly allocate GC blocks from the reserved pool.
         *
         * If we'd take them from the normal pool we could be stealing blocks
         * from a regular writer, which would then have to wait for GC and
         * deadlock.
         */
        spin_lock(&mp->m_sb_lock);
        *count_fsb = min(*count_fsb,
                        rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
        *count_fsb = min3(*count_fsb,
                mp->m_free[XC_FREE_RTEXTENTS].res_avail,
                mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
        mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
        mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
        spin_unlock(&mp->m_sb_lock);

        if (!*count_fsb)
                return NULL;

        *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
        *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
        if (!*is_seq)
                *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
        oz->oz_allocated += *count_fsb;
        atomic_inc(&oz->oz_ref);
        return oz;
}

static bool
xfs_zone_gc_start_chunk(
        struct xfs_zone_gc_data *data)
{
        struct xfs_zone_gc_iter *iter = &data->iter;
        struct xfs_mount        *mp = data->mp;
        struct block_device     *bdev = mp->m_rtdev_targp->bt_bdev;
        struct xfs_open_zone    *oz;
        struct xfs_rmap_irec    irec;
        struct xfs_gc_bio       *chunk;
        struct xfs_inode        *ip;
        struct bio              *bio;
        xfs_daddr_t             daddr;
        bool                    is_seq;

        if (xfs_is_shutdown(mp))
                return false;

        if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
                return false;
        oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
                        &is_seq);
        if (!oz) {
                xfs_irele(ip);
                return false;
        }

        bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

        chunk = container_of(bio, struct xfs_gc_bio, bio);
        chunk->ip = ip;
        chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
        chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
        chunk->old_startblock =
                xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
        chunk->new_daddr = daddr;
        chunk->is_seq = is_seq;
        chunk->scratch = &data->scratch[data->scratch_idx];
        chunk->data = data;
        chunk->oz = oz;
        chunk->victim_rtg = iter->victim_rtg;
        atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
        atomic_inc(&chunk->victim_rtg->rtg_gccount);

        bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
        bio->bi_end_io = xfs_zone_gc_end_io;
        bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
                        chunk->scratch->offset);
        chunk->scratch->offset += chunk->len;
        if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
                data->scratch_idx =
                        (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
        }
        WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
        list_add_tail(&chunk->entry, &data->reading);
        xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

        submit_bio(bio);
        return true;
}

static void
xfs_zone_gc_free_chunk(
        struct xfs_gc_bio       *chunk)
{
        atomic_dec(&chunk->victim_rtg->rtg_gccount);
        xfs_rtgroup_rele(chunk->victim_rtg);
        list_del(&chunk->entry);
        xfs_open_zone_put(chunk->oz);
        xfs_irele(chunk->ip);
        bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
        struct xfs_zone_gc_data *data,
        struct xfs_gc_bio       *chunk)
{
        if (chunk->is_seq) {
                chunk->bio.bi_opf &= ~REQ_OP_WRITE;
                chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
        }
        chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
        chunk->bio.bi_end_io = xfs_zone_gc_end_io;
        submit_bio(&chunk->bio);
}

static struct xfs_gc_bio *
xfs_zone_gc_split_write(
        struct xfs_zone_gc_data *data,
        struct xfs_gc_bio       *chunk)
{
        struct queue_limits     *lim =
                &bdev_get_queue(chunk->bio.bi_bdev)->limits;
        struct xfs_gc_bio       *split_chunk;
        int                     split_sectors;
        unsigned int            split_len;
        struct bio              *split;
        unsigned int            nsegs;

        if (!chunk->is_seq)
                return NULL;

        split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
                        lim->max_zone_append_sectors << SECTOR_SHIFT);
        if (!split_sectors)
                return NULL;

        /* ensure the split chunk is still block size aligned */
        split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
                        data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
        split_len = split_sectors << SECTOR_SHIFT;

        split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
        split_chunk = container_of(split, struct xfs_gc_bio, bio);
        split_chunk->data = data;
        ihold(VFS_I(chunk->ip));
        split_chunk->ip = chunk->ip;
        split_chunk->is_seq = chunk->is_seq;
        split_chunk->scratch = chunk->scratch;
        split_chunk->offset = chunk->offset;
        split_chunk->len = split_len;
        split_chunk->old_startblock = chunk->old_startblock;
        split_chunk->new_daddr = chunk->new_daddr;
        split_chunk->oz = chunk->oz;
        atomic_inc(&chunk->oz->oz_ref);

        split_chunk->victim_rtg = chunk->victim_rtg;
        atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
        atomic_inc(&chunk->victim_rtg->rtg_gccount);

        chunk->offset += split_len;
        chunk->len -= split_len;
        chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

        /* add right before the original chunk */
        WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
        list_add_tail(&split_chunk->entry, &chunk->entry);
        return split_chunk;
}
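
/*
 * Worked example for the block size alignment above (hypothetical limits):
 * with a 4096 byte file system block size and a device limit that makes
 * bio_split_rw_at() return 1025 sectors (524,800 bytes),
 * ALIGN_DOWN(524800, 4096) rounds the split back to 524,288 bytes, i.e.
 * 1024 sectors or 128 blocks, so both the split bio and the remainder stay
 * block aligned and can be remapped on their own at I/O completion.
 */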

static void
xfs_zone_gc_write_chunk(
        struct xfs_gc_bio       *chunk)
{
        struct xfs_zone_gc_data *data = chunk->data;
        struct xfs_mount        *mp = chunk->ip->i_mount;
        phys_addr_t             bvec_paddr =
                bvec_phys(bio_first_bvec_all(&chunk->bio));
        struct xfs_gc_bio       *split_chunk;

        if (chunk->bio.bi_status)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        if (xfs_is_shutdown(mp)) {
                xfs_zone_gc_free_chunk(chunk);
                return;
        }

        WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
        list_move_tail(&chunk->entry, &data->writing);

        bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
        bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
                        offset_in_folio(chunk->scratch->folio, bvec_paddr));

        while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
                xfs_zone_gc_submit_write(data, split_chunk);
        xfs_zone_gc_submit_write(data, chunk);
}

static void
xfs_zone_gc_finish_chunk(
        struct xfs_gc_bio       *chunk)
{
        uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
        struct xfs_inode        *ip = chunk->ip;
        struct xfs_mount        *mp = ip->i_mount;
        int                     error;

        if (chunk->bio.bi_status)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        if (xfs_is_shutdown(mp)) {
                xfs_zone_gc_free_chunk(chunk);
                return;
        }

        chunk->scratch->freed += chunk->len;
        if (chunk->scratch->freed == chunk->scratch->offset) {
                chunk->scratch->offset = 0;
                chunk->scratch->freed = 0;
        }

        /*
         * Cycle through the iolock and wait for direct I/O and layouts to
         * ensure no one is reading from the old mapping before it goes away.
         *
         * Note that xfs_zoned_end_io() below checks that no other writer raced
         * with us to update the mapping by checking that the old startblock
         * didn't change.
         */
        xfs_ilock(ip, iolock);
        error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
        if (!error)
                inode_dio_wait(VFS_I(ip));
        xfs_iunlock(ip, iolock);
        if (error)
                goto free;

        if (chunk->is_seq)
                chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
        error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
                        chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
        if (error)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        xfs_zone_gc_free_chunk(chunk);
}

static void
xfs_zone_gc_finish_reset(
        struct xfs_gc_bio       *chunk)
{
        struct xfs_rtgroup      *rtg = chunk->bio.bi_private;
        struct xfs_mount        *mp = rtg_mount(rtg);
        struct xfs_zone_info    *zi = mp->m_zone_info;

        if (chunk->bio.bi_status) {
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                goto out;
        }

        xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
        atomic_inc(&zi->zi_nr_free_zones);

        xfs_zoned_add_available(mp, rtg_blocks(rtg));

        wake_up_all(&zi->zi_zone_wait);
out:
        list_del(&chunk->entry);
        bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
        struct bio              *bio,
        struct xfs_rtgroup      *rtg)
{
        trace_xfs_zone_reset(rtg);

        ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
        bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
        if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
                if (!bdev_max_discard_sectors(bio->bi_bdev))
                        return false;
                bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
                bio->bi_iter.bi_size =
                        XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
        }

        return true;
}
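
/*
 * Example for the conventional zone fallback above (illustrative geometry):
 * when the zone is not sequential write required, the reset is replaced by
 * a discard covering the whole group, so with 4096 byte blocks and
 * rtg_blocks() = 65,536 the bio discards 256 MiB starting at the group's
 * first daddr.
 */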

int
xfs_zone_gc_reset_sync(
        struct xfs_rtgroup      *rtg)
{
        int                     error = 0;
        struct bio              bio;

        bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
                        REQ_OP_ZONE_RESET);
        if (xfs_zone_gc_prepare_reset(&bio, rtg))
                error = submit_bio_wait(&bio);
        bio_uninit(&bio);

        return error;
}

static void
xfs_zone_gc_reset_zones(
        struct xfs_zone_gc_data *data,
        struct xfs_group        *reset_list)
{
        struct xfs_group        *next = reset_list;

        if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
                xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
                return;
        }

        do {
                struct xfs_rtgroup      *rtg = to_rtg(next);
                struct xfs_gc_bio       *chunk;
                struct bio              *bio;

                xfs_log_force_inode(rtg_rmap(rtg));

                next = rtg_group(rtg)->xg_next_reset;
                rtg_group(rtg)->xg_next_reset = NULL;

                bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
                                0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
                bio->bi_private = rtg;
                bio->bi_end_io = xfs_zone_gc_end_io;

                chunk = container_of(bio, struct xfs_gc_bio, bio);
                chunk->data = data;
                WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
                list_add_tail(&chunk->entry, &data->resetting);

                /*
                 * Also use the bio to drive the state machine when neither
                 * zone reset nor discard is supported to keep things simple.
                 */
                if (xfs_zone_gc_prepare_reset(bio, rtg))
                        submit_bio(bio);
                else
                        bio_endio(bio);
        } while (next);
}

static bool
xfs_zone_gc_should_start_new_work(
        struct xfs_zone_gc_data *data)
{
        if (xfs_is_shutdown(data->mp))
                return false;
        if (!xfs_zone_gc_space_available(data))
                return false;

        if (!data->iter.victim_rtg) {
                if (kthread_should_stop() || kthread_should_park())
                        return false;
                if (!xfs_zoned_need_gc(data->mp))
                        return false;
                if (!xfs_zone_gc_select_victim(data))
                        return false;
        }

        return true;
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static void
xfs_zone_gc_handle_work(
        struct xfs_zone_gc_data *data)
{
        struct xfs_zone_info    *zi = data->mp->m_zone_info;
        struct xfs_gc_bio       *chunk, *next;
        struct xfs_group        *reset_list;
        struct blk_plug         plug;

        spin_lock(&zi->zi_reset_list_lock);
        reset_list = zi->zi_reset_list;
        zi->zi_reset_list = NULL;
        spin_unlock(&zi->zi_reset_list_lock);

        if (reset_list) {
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_reset_zones(data, reset_list);
        }

        list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
                if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
                        break;
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_finish_reset(chunk);
        }

        list_for_each_entry_safe(chunk, next, &data->writing, entry) {
                if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
                        break;
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_finish_chunk(chunk);
        }

        blk_start_plug(&plug);
        list_for_each_entry_safe(chunk, next, &data->reading, entry) {
                if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
                        break;
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_write_chunk(chunk);
        }
        blk_finish_plug(&plug);

        if (xfs_zone_gc_should_start_new_work(data)) {
                set_current_state(TASK_RUNNING);
                blk_start_plug(&plug);
                while (xfs_zone_gc_start_chunk(data))
                        ;
                blk_finish_plug(&plug);
        }
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before. Because of that, reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
static int
xfs_zoned_gcd(
        void                    *private)
{
        struct xfs_zone_gc_data *data = private;
        struct xfs_mount        *mp = data->mp;
        struct xfs_zone_info    *zi = mp->m_zone_info;
        unsigned int            nofs_flag;

        nofs_flag = memalloc_nofs_save();
        set_freezable();

        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
                xfs_set_zonegc_running(mp);

                xfs_zone_gc_handle_work(data);

                /*
                 * Only sleep if nothing set the state to running. Otherwise
                 * check for work again as someone might have queued up more
                 * work and woken us in the meantime.
                 */
                if (get_current_state() == TASK_RUNNING) {
                        try_to_freeze();
                        continue;
                }

                if (list_empty(&data->reading) &&
                    list_empty(&data->writing) &&
                    list_empty(&data->resetting) &&
                    !zi->zi_reset_list) {
                        xfs_clear_zonegc_running(mp);
                        xfs_zoned_resv_wake_all(mp);

                        if (kthread_should_stop()) {
                                __set_current_state(TASK_RUNNING);
                                break;
                        }

                        if (kthread_should_park()) {
                                __set_current_state(TASK_RUNNING);
                                kthread_parkme();
                                continue;
                        }
                }

                schedule();
        }
        xfs_clear_zonegc_running(mp);

        if (data->iter.victim_rtg)
                xfs_rtgroup_rele(data->iter.victim_rtg);

        memalloc_nofs_restore(nofs_flag);
        xfs_zone_gc_data_free(data);
        return 0;
}

void
xfs_zone_gc_start(
        struct xfs_mount        *mp)
{
        if (xfs_has_zoned(mp))
                kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
        struct xfs_mount        *mp)
{
        if (xfs_has_zoned(mp))
                kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
        struct xfs_mount        *mp)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;
        struct xfs_zone_gc_data *data;
        struct xfs_open_zone    *oz;
        int                     error;

        /*
         * If there are no free zones available for GC, pick the open zone with
         * the least used space to GC into. This should only happen after an
         * unclean shutdown near ENOSPC while GC was ongoing.
         *
         * We also need to do this for the first gc zone allocation if we
         * unmounted while at the open limit.
         */
        if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
            zi->zi_nr_open_zones == mp->m_max_open_zones)
                oz = xfs_zone_gc_steal_open(zi);
        else
                oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
        if (!oz) {
                xfs_warn(mp, "unable to allocate a zone for gc");
                error = -EIO;
                goto out;
        }

        trace_xfs_zone_gc_target_opened(oz->oz_rtg);
        zi->zi_open_gc_zone = oz;

        data = xfs_zone_gc_data_alloc(mp);
        if (!data) {
                error = -ENOMEM;
                goto out_put_gc_zone;
        }

        zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
                        "xfs-zone-gc/%s", mp->m_super->s_id);
        if (IS_ERR(zi->zi_gc_thread)) {
                xfs_warn(mp, "unable to create zone gc thread");
                error = PTR_ERR(zi->zi_gc_thread);
                goto out_free_gc_data;
        }

        /* xfs_zone_gc_start will unpark for rw mounts */
        kthread_park(zi->zi_gc_thread);
        return 0;

out_free_gc_data:
        kfree(data);
out_put_gc_zone:
        xfs_open_zone_put(zi->zi_open_gc_zone);
out:
        return error;
}

void
xfs_zone_gc_unmount(
        struct xfs_mount        *mp)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;

        kthread_stop(zi->zi_gc_thread);
        if (zi->zi_open_gc_zone)
                xfs_open_zone_put(zi->zi_open_gc_zone);
}