// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file. To support that, a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming. Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location. To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad. This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE       SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH  2
struct xfs_zone_scratch {
        struct folio            *folio;
        unsigned int            offset;
        unsigned int            freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
        struct xfs_zone_gc_data *data;

        /*
         * Entry into the reading/writing/resetting list. Only accessed from
         * the GC thread, so no locking needed.
         */
        struct list_head        entry;

        /*
         * State of this gc_bio. Done means the current I/O completed.
         * Set from the bio end I/O handler, read from the GC thread.
         */
        enum {
                XFS_GC_BIO_NEW,
                XFS_GC_BIO_DONE,
        } state;

        /*
         * Pointer to the inode and byte range in the inode that this
         * GC chunk is operating on.
         */
        struct xfs_inode        *ip;
        loff_t                  offset;
        unsigned int            len;

        /*
         * Existing startblock (in the zone to be freed) and newly assigned
         * daddr in the zone GCed into.
         */
        xfs_fsblock_t           old_startblock;
        xfs_daddr_t             new_daddr;
        struct xfs_zone_scratch *scratch;

        /* Are we writing to a sequential write required zone? */
        bool                    is_seq;

        /* Open Zone being written to */
        struct xfs_open_zone    *oz;

        struct xfs_rtgroup      *victim_rtg;

        /* Bio used for reads and writes, including the bvec used by it */
        struct bio              bio;    /* must be last */
};

#define XFS_ZONE_GC_RECS        1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
        struct xfs_rtgroup      *victim_rtg;
        unsigned int            rec_count;
        unsigned int            rec_idx;
        xfs_agblock_t           next_startblock;
        struct xfs_rmap_irec    *recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
        struct xfs_mount        *mp;

        /* bioset used to allocate the gc_bios */
        struct bio_set          bio_set;

        /*
         * Scratchpad used, and index to indicate which one is used.
         */
        struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
        unsigned int            scratch_idx;

        /*
         * List of bios currently being read, written and reset.
         * These lists are only accessed by the GC thread itself, and must only
         * be processed in order.
         */
        struct list_head        reading;
        struct list_head        writing;
        struct list_head        resetting;

        /*
         * Iterator for the victim zone.
         */
        struct xfs_zone_gc_iter iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes. Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
        struct xfs_mount        *mp)
{
        s64                     available, free, threshold;
        s32                     remainder;

        if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
                return false;

        available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

        if (available <
            xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
                return true;

        free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

        threshold = div_s64_rem(free, 100, &remainder);
        threshold = threshold * mp->m_zonegc_low_space +
                    remainder * div_s64(mp->m_zonegc_low_space, 100);

        if (available < threshold)
                return true;

        return false;
}
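
/*
 * Illustration of the threshold math above, with made-up numbers: for
 * free = 1,000,050 blocks and m_zonegc_low_space = 25 (percent),
 * div_s64_rem() yields 10,000 with a remainder of 50, so
 *
 *      threshold = 10,000 * 25 + 50 * (25 / 100) = 250,000
 *
 * i.e. GC also kicks in once less than about a quarter of the unused blocks
 * are still available.  The remainder term only contributes once the tunable
 * reaches 100, at which point the threshold is exactly "free".
 */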

static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
        struct xfs_mount        *mp)
{
        struct xfs_zone_gc_data *data;
        int                     i;

        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
                return NULL;
        data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
                        GFP_KERNEL);
        if (!data->iter.recs)
                goto out_free_data;

        /*
         * We actually only need a single bio_vec. It would be nice to have
         * a flag that only allocates the inline bvecs and not the separate
         * bvec pool.
         */
        if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
                        BIOSET_NEED_BVECS))
                goto out_free_recs;
        for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
                data->scratch[i].folio =
                        folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
                if (!data->scratch[i].folio)
                        goto out_free_scratch;
        }
        INIT_LIST_HEAD(&data->reading);
        INIT_LIST_HEAD(&data->writing);
        INIT_LIST_HEAD(&data->resetting);
        data->mp = mp;
        return data;

out_free_scratch:
        while (--i >= 0)
                folio_put(data->scratch[i].folio);
        bioset_exit(&data->bio_set);
out_free_recs:
        kfree(data->iter.recs);
out_free_data:
        kfree(data);
        return NULL;
}

static void
xfs_zone_gc_data_free(
        struct xfs_zone_gc_data *data)
{
        int                     i;

        for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
                folio_put(data->scratch[i].folio);
        bioset_exit(&data->bio_set);
        kfree(data->iter.recs);
        kfree(data);
}

static void
xfs_zone_gc_iter_init(
        struct xfs_zone_gc_iter *iter,
        struct xfs_rtgroup      *victim_rtg)
{
        iter->next_startblock = 0;
        iter->rec_count = 0;
        iter->rec_idx = 0;
        iter->victim_rtg = victim_rtg;
        atomic_inc(&victim_rtg->rtg_gccount);
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
        struct xfs_btree_cur    *cur,
        const struct xfs_rmap_irec *irec,
        void                    *private)
{
        struct xfs_zone_gc_iter *iter = private;

        ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
        ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
        ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

        iter->recs[iter->rec_count] = *irec;
        if (++iter->rec_count == XFS_ZONE_GC_RECS) {
                iter->next_startblock =
                        irec->rm_startblock + irec->rm_blockcount;
                return 1;
        }
        return 0;
}

static int
xfs_zone_gc_rmap_rec_cmp(
        const void              *a,
        const void              *b)
{
        const struct xfs_rmap_irec *reca = a;
        const struct xfs_rmap_irec *recb = b;
        int                     diff;

        diff = cmp_int(reca->rm_owner, recb->rm_owner);
        if (diff)
                return diff;
        return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
        struct xfs_mount        *mp,
        struct xfs_zone_gc_iter *iter)
{
        struct xfs_rtgroup      *rtg = iter->victim_rtg;
        struct xfs_rmap_irec    ri_low = { };
        struct xfs_rmap_irec    ri_high;
        struct xfs_btree_cur    *cur;
        struct xfs_trans        *tp;
        int                     error;

        ASSERT(iter->next_startblock <= rtg_blocks(rtg));
        if (iter->next_startblock == rtg_blocks(rtg))
                goto done;

        ASSERT(iter->next_startblock < rtg_blocks(rtg));
        ri_low.rm_startblock = iter->next_startblock;
        memset(&ri_high, 0xFF, sizeof(ri_high));

        iter->rec_idx = 0;
        iter->rec_count = 0;

        tp = xfs_trans_alloc_empty(mp);
        xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
        cur = xfs_rtrmapbt_init_cursor(tp, rtg);
        error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
                        xfs_zone_gc_query_cb, iter);
        xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
        xfs_btree_del_cursor(cur, error < 0 ? error : 0);
        xfs_trans_cancel(tp);

        if (error < 0)
                return error;

        /*
         * Sort the rmap records by inode number and increasing offset to
         * defragment the mappings.
         *
         * This could be further enhanced by an even bigger look ahead window,
         * but that's better left until we have better detection of changes to
         * inode mapping to avoid the potential of GCing already dead data.
         */
        sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
                        xfs_zone_gc_rmap_rec_cmp, NULL);

        if (error == 0) {
                /*
                 * We finished iterating through the zone.
                 */
                iter->next_startblock = rtg_blocks(rtg);
                if (iter->rec_count == 0)
                        goto done;
        }

        return 0;
done:
        atomic_dec(&iter->victim_rtg->rtg_gccount);
        xfs_rtgroup_rele(iter->victim_rtg);
        iter->victim_rtg = NULL;
        return 0;
}

static bool
xfs_zone_gc_iter_next(
        struct xfs_mount        *mp,
        struct xfs_zone_gc_iter *iter,
        struct xfs_rmap_irec    *chunk_rec,
        struct xfs_inode        **ipp)
{
        struct xfs_rmap_irec    *irec;
        int                     error;

        if (!iter->victim_rtg)
                return false;

retry:
        if (iter->rec_idx == iter->rec_count) {
                error = xfs_zone_gc_query(mp, iter);
                if (error)
                        goto fail;
                if (!iter->victim_rtg)
                        return false;
        }

        irec = &iter->recs[iter->rec_idx];
        error = xfs_iget(mp, NULL, irec->rm_owner,
                        XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
        if (error) {
                /*
                 * If the inode was already deleted, skip over it.
                 */
                if (error == -ENOENT) {
                        iter->rec_idx++;
                        goto retry;
                }
                goto fail;
        }

        if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
                iter->rec_idx++;
                xfs_irele(*ipp);
                goto retry;
        }

        *chunk_rec = *irec;
        return true;

fail:
        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        return false;
}

static void
xfs_zone_gc_iter_advance(
        struct xfs_zone_gc_iter *iter,
        xfs_extlen_t            count_fsb)
{
        struct xfs_rmap_irec    *irec = &iter->recs[iter->rec_idx];

        irec->rm_offset += count_fsb;
        irec->rm_startblock += count_fsb;
        irec->rm_blockcount -= count_fsb;
        if (!irec->rm_blockcount)
                iter->rec_idx++;
}

static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
        struct xfs_mount        *mp,
        uint32_t                bucket)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;
        uint32_t                victim_used = U32_MAX;
        struct xfs_rtgroup      *victim_rtg = NULL;
        uint32_t                bit;

        if (!zi->zi_used_bucket_entries[bucket])
                return NULL;

        for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
                        mp->m_sb.sb_rgcount) {
                struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

                if (!rtg)
                        continue;

                /*
                 * If the zone is already undergoing GC, don't pick it again.
                 *
                 * This prevents us from picking one of the zones for which we
                 * already submitted GC I/O, but for which the remapping hasn't
                 * concluded yet. This won't cause data corruption, but
                 * increases write amplification and slows down GC, so this is
                 * a bad thing.
                 */
                if (atomic_read(&rtg->rtg_gccount)) {
                        xfs_rtgroup_rele(rtg);
                        continue;
                }

                /* skip zones that are just waiting for a reset */
                if (rtg_rmap(rtg)->i_used_blocks == 0 ||
                    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
                        xfs_rtgroup_rele(rtg);
                        continue;
                }

                if (victim_rtg)
                        xfs_rtgroup_rele(victim_rtg);
                victim_rtg = rtg;
                victim_used = rtg_rmap(rtg)->i_used_blocks;

                /*
                 * Any zone that is less than 1 percent used is fair game for
                 * instant reclaim. All of these zones are in the last
                 * bucket, so avoid the expensive division for the zones
                 * in the other buckets.
                 */
                if (bucket == 0 &&
                    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
                        break;
        }

        return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
        struct xfs_zone_gc_data *data)
{
        struct xfs_zone_gc_iter *iter = &data->iter;
        struct xfs_mount        *mp = data->mp;
        struct xfs_zone_info    *zi = mp->m_zone_info;
        struct xfs_rtgroup      *victim_rtg = NULL;
        unsigned int            bucket;

        spin_lock(&zi->zi_used_buckets_lock);
        for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
                victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
                if (victim_rtg)
                        break;
        }
        spin_unlock(&zi->zi_used_buckets_lock);

        if (!victim_rtg)
                return false;

        trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
        xfs_zone_gc_iter_init(iter, victim_rtg);
        return true;
}

static struct xfs_open_zone *
xfs_zone_gc_steal_open(
        struct xfs_zone_info    *zi)
{
        struct xfs_open_zone    *oz, *found = NULL;

        spin_lock(&zi->zi_open_zones_lock);
        list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
                if (!found || oz->oz_allocated < found->oz_allocated)
                        found = oz;
        }

        if (found) {
                found->oz_is_gc = true;
                list_del_init(&found->oz_entry);
                zi->zi_nr_open_zones--;
        }

        spin_unlock(&zi->zi_open_zones_lock);
        return found;
}

static struct xfs_open_zone *
xfs_zone_gc_select_target(
        struct xfs_mount        *mp)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;
        struct xfs_open_zone    *oz = zi->zi_open_gc_zone;

        /*
         * We need to wait for pending writes to finish.
         */
        if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
                return NULL;

        ASSERT(zi->zi_nr_open_zones <=
                mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
        oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
        if (oz)
                trace_xfs_zone_gc_target_opened(oz->oz_rtg);
        spin_lock(&zi->zi_open_zones_lock);
        zi->zi_open_gc_zone = oz;
        spin_unlock(&zi->zi_open_zones_lock);
        return oz;
}
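
/*
 * Example of the open zone budget (illustrative numbers, assuming
 * XFS_OPEN_GC_ZONES is 1): with m_max_open_zones = 6, user data placement
 * may keep up to 5 zones open while the remaining slot is held back for the
 * GC target selected above, which is what the ASSERT in
 * xfs_zone_gc_select_target() checks via
 * m_max_open_zones - XFS_OPEN_GC_ZONES.
 */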

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space, keep writing to it; else first wait
 * for all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
        struct xfs_mount        *mp)
{
        struct xfs_open_zone    *oz = mp->m_zone_info->zi_open_gc_zone;

        if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
                return xfs_zone_gc_select_target(mp);
        return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
        struct xfs_zone_gc_data *data)
{
        return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
        struct xfs_zone_gc_data *data)
{
        struct xfs_open_zone    *oz;

        oz = xfs_zone_gc_ensure_target(data->mp);
        if (!oz)
                return false;
        return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
                xfs_zone_gc_scratch_available(data);
}

static void
xfs_zone_gc_end_io(
        struct bio              *bio)
{
        struct xfs_gc_bio       *chunk =
                container_of(bio, struct xfs_gc_bio, bio);
        struct xfs_zone_gc_data *data = chunk->data;

        WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
        wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
        struct xfs_zone_gc_data *data,
        xfs_extlen_t            *count_fsb,
        xfs_daddr_t             *daddr,
        bool                    *is_seq)
{
        struct xfs_mount        *mp = data->mp;
        struct xfs_open_zone    *oz;

        oz = xfs_zone_gc_ensure_target(mp);
        if (!oz)
                return NULL;

        *count_fsb = min(*count_fsb,
                XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

        /*
         * Directly allocate GC blocks from the reserved pool.
         *
         * If we'd take them from the normal pool we could be stealing blocks
         * from a regular writer, which would then have to wait for GC and
         * deadlock.
         */
        spin_lock(&mp->m_sb_lock);
        *count_fsb = min(*count_fsb,
                        rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
        *count_fsb = min3(*count_fsb,
                mp->m_free[XC_FREE_RTEXTENTS].res_avail,
                mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
        mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
        mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
        spin_unlock(&mp->m_sb_lock);

        if (!*count_fsb)
                return NULL;

        *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
        *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
        if (!*is_seq)
                *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
        oz->oz_allocated += *count_fsb;
        atomic_inc(&oz->oz_ref);
        return oz;
}

static bool
xfs_zone_gc_start_chunk(
        struct xfs_zone_gc_data *data)
{
        struct xfs_zone_gc_iter *iter = &data->iter;
        struct xfs_mount        *mp = data->mp;
        struct block_device     *bdev = mp->m_rtdev_targp->bt_bdev;
        struct xfs_open_zone    *oz;
        struct xfs_rmap_irec    irec;
        struct xfs_gc_bio       *chunk;
        struct xfs_inode        *ip;
        struct bio              *bio;
        xfs_daddr_t             daddr;
        bool                    is_seq;

        if (xfs_is_shutdown(mp))
                return false;

        if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
                return false;
        oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
                        &is_seq);
        if (!oz) {
                xfs_irele(ip);
                return false;
        }

        bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

        chunk = container_of(bio, struct xfs_gc_bio, bio);
        chunk->ip = ip;
        chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
        chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
        chunk->old_startblock =
                xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
        chunk->new_daddr = daddr;
        chunk->is_seq = is_seq;
        chunk->scratch = &data->scratch[data->scratch_idx];
        chunk->data = data;
        chunk->oz = oz;
        chunk->victim_rtg = iter->victim_rtg;
        atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
        atomic_inc(&chunk->victim_rtg->rtg_gccount);

        bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
        bio->bi_end_io = xfs_zone_gc_end_io;
        bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
                        chunk->scratch->offset);
        chunk->scratch->offset += chunk->len;
        if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
                data->scratch_idx =
                        (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
        }
        WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
        list_add_tail(&chunk->entry, &data->reading);
        xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

        submit_bio(bio);
        return true;
}

static void
xfs_zone_gc_free_chunk(
        struct xfs_gc_bio       *chunk)
{
        atomic_dec(&chunk->victim_rtg->rtg_gccount);
        xfs_rtgroup_rele(chunk->victim_rtg);
        list_del(&chunk->entry);
        xfs_open_zone_put(chunk->oz);
        xfs_irele(chunk->ip);
        bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
        struct xfs_zone_gc_data *data,
        struct xfs_gc_bio       *chunk)
{
        if (chunk->is_seq) {
                chunk->bio.bi_opf &= ~REQ_OP_WRITE;
                chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
        }
        chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
        chunk->bio.bi_end_io = xfs_zone_gc_end_io;
        submit_bio(&chunk->bio);
}

static struct xfs_gc_bio *
xfs_zone_gc_split_write(
        struct xfs_zone_gc_data *data,
        struct xfs_gc_bio       *chunk)
{
        struct queue_limits     *lim =
                &bdev_get_queue(chunk->bio.bi_bdev)->limits;
        struct xfs_gc_bio       *split_chunk;
        int                     split_sectors;
        unsigned int            split_len;
        struct bio              *split;
        unsigned int            nsegs;

        if (!chunk->is_seq)
                return NULL;

        split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
                        lim->max_zone_append_sectors << SECTOR_SHIFT);
        if (!split_sectors)
                return NULL;

        /* ensure the split chunk is still block size aligned */
        split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
                        data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
        split_len = split_sectors << SECTOR_SHIFT;

        split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
        split_chunk = container_of(split, struct xfs_gc_bio, bio);
        split_chunk->data = data;
        ihold(VFS_I(chunk->ip));
        split_chunk->ip = chunk->ip;
        split_chunk->is_seq = chunk->is_seq;
        split_chunk->scratch = chunk->scratch;
        split_chunk->offset = chunk->offset;
        split_chunk->len = split_len;
        split_chunk->old_startblock = chunk->old_startblock;
        split_chunk->new_daddr = chunk->new_daddr;
        split_chunk->oz = chunk->oz;
        atomic_inc(&chunk->oz->oz_ref);

        split_chunk->victim_rtg = chunk->victim_rtg;
        atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
        atomic_inc(&chunk->victim_rtg->rtg_gccount);

        chunk->offset += split_len;
        chunk->len -= split_len;
        chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

        /* add right before the original chunk */
        WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
        list_add_tail(&split_chunk->entry, &chunk->entry);
        return split_chunk;
}
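
/*
 * Worked example for the block size alignment above (hypothetical limits):
 * with a 4096 byte file system block size and a device limit that makes
 * bio_split_rw_at() return 1025 sectors (524,800 bytes),
 * ALIGN_DOWN(524800, 4096) rounds the split back to 524,288 bytes, i.e.
 * 1024 sectors or 128 blocks, so both the split bio and the remainder stay
 * block aligned and can be remapped on their own at I/O completion.
 */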

static void
xfs_zone_gc_write_chunk(
        struct xfs_gc_bio       *chunk)
{
        struct xfs_zone_gc_data *data = chunk->data;
        struct xfs_mount        *mp = chunk->ip->i_mount;
        phys_addr_t             bvec_paddr =
                bvec_phys(bio_first_bvec_all(&chunk->bio));
        struct xfs_gc_bio       *split_chunk;

        if (chunk->bio.bi_status)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        if (xfs_is_shutdown(mp)) {
                xfs_zone_gc_free_chunk(chunk);
                return;
        }

        WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
        list_move_tail(&chunk->entry, &data->writing);

        bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
        bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
                        offset_in_folio(chunk->scratch->folio, bvec_paddr));

        while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
                xfs_zone_gc_submit_write(data, split_chunk);
        xfs_zone_gc_submit_write(data, chunk);
}

static void
xfs_zone_gc_finish_chunk(
        struct xfs_gc_bio       *chunk)
{
        uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
        struct xfs_inode        *ip = chunk->ip;
        struct xfs_mount        *mp = ip->i_mount;
        int                     error;

        if (chunk->bio.bi_status)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        if (xfs_is_shutdown(mp)) {
                xfs_zone_gc_free_chunk(chunk);
                return;
        }

        chunk->scratch->freed += chunk->len;
        if (chunk->scratch->freed == chunk->scratch->offset) {
                chunk->scratch->offset = 0;
                chunk->scratch->freed = 0;
        }

        /*
         * Cycle through the iolock and wait for direct I/O and layouts to
         * ensure no one is reading from the old mapping before it goes away.
         *
         * Note that xfs_zoned_end_io() below checks that no other writer raced
         * with us to update the mapping by checking that the old startblock
         * didn't change.
         */
        xfs_ilock(ip, iolock);
        error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
        if (!error)
                inode_dio_wait(VFS_I(ip));
        xfs_iunlock(ip, iolock);
        if (error)
                goto free;

        if (chunk->is_seq)
                chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
        error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
                        chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
        if (error)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        xfs_zone_gc_free_chunk(chunk);
}

static void
xfs_zone_gc_finish_reset(
        struct xfs_gc_bio       *chunk)
{
        struct xfs_rtgroup      *rtg = chunk->bio.bi_private;
        struct xfs_mount        *mp = rtg_mount(rtg);
        struct xfs_zone_info    *zi = mp->m_zone_info;

        if (chunk->bio.bi_status) {
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                goto out;
        }

        xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
        atomic_inc(&zi->zi_nr_free_zones);

        xfs_zoned_add_available(mp, rtg_blocks(rtg));

        wake_up_all(&zi->zi_zone_wait);
out:
        list_del(&chunk->entry);
        bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
        struct bio              *bio,
        struct xfs_rtgroup      *rtg)
{
        trace_xfs_zone_reset(rtg);

        ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
        bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
        if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
                if (!bdev_max_discard_sectors(bio->bi_bdev))
                        return false;
                bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
                bio->bi_iter.bi_size =
                        XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
        }

        return true;
}
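
/*
 * Example for the conventional zone fallback above (illustrative geometry):
 * when the zone is not sequential write required, the reset is replaced by
 * a discard covering the whole group, so with 4096 byte blocks and
 * rtg_blocks() = 65,536 the bio discards 256 MiB starting at the group's
 * first daddr.
 */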

int
xfs_zone_gc_reset_sync(
        struct xfs_rtgroup      *rtg)
{
        int                     error = 0;
        struct bio              bio;

        bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
                        REQ_OP_ZONE_RESET);
        if (xfs_zone_gc_prepare_reset(&bio, rtg))
                error = submit_bio_wait(&bio);
        bio_uninit(&bio);

        return error;
}

static void
xfs_zone_gc_reset_zones(
        struct xfs_zone_gc_data *data,
        struct xfs_group        *reset_list)
{
        struct xfs_group        *next = reset_list;

        if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
                xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
                return;
        }

        do {
                struct xfs_rtgroup      *rtg = to_rtg(next);
                struct xfs_gc_bio       *chunk;
                struct bio              *bio;

                xfs_log_force_inode(rtg_rmap(rtg));

                next = rtg_group(rtg)->xg_next_reset;
                rtg_group(rtg)->xg_next_reset = NULL;

                bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
                                0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
                bio->bi_private = rtg;
                bio->bi_end_io = xfs_zone_gc_end_io;

                chunk = container_of(bio, struct xfs_gc_bio, bio);
                chunk->data = data;
                WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
                list_add_tail(&chunk->entry, &data->resetting);

                /*
                 * Also use the bio to drive the state machine when neither
                 * zone reset nor discard is supported to keep things simple.
                 */
                if (xfs_zone_gc_prepare_reset(bio, rtg))
                        submit_bio(bio);
                else
                        bio_endio(bio);
        } while (next);
}

static bool
xfs_zone_gc_should_start_new_work(
        struct xfs_zone_gc_data *data)
{
        if (xfs_is_shutdown(data->mp))
                return false;
        if (!xfs_zone_gc_space_available(data))
                return false;

        if (!data->iter.victim_rtg) {
                if (kthread_should_stop() || kthread_should_park())
                        return false;
                if (!xfs_zoned_need_gc(data->mp))
                        return false;
                if (!xfs_zone_gc_select_victim(data))
                        return false;
        }

        return true;
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static void
xfs_zone_gc_handle_work(
        struct xfs_zone_gc_data *data)
{
        struct xfs_zone_info    *zi = data->mp->m_zone_info;
        struct xfs_gc_bio       *chunk, *next;
        struct xfs_group        *reset_list;
        struct blk_plug         plug;

        spin_lock(&zi->zi_reset_list_lock);
        reset_list = zi->zi_reset_list;
        zi->zi_reset_list = NULL;
        spin_unlock(&zi->zi_reset_list_lock);

        if (reset_list) {
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_reset_zones(data, reset_list);
        }

        list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
                if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
                        break;
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_finish_reset(chunk);
        }

        list_for_each_entry_safe(chunk, next, &data->writing, entry) {
                if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
                        break;
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_finish_chunk(chunk);
        }

        blk_start_plug(&plug);
        list_for_each_entry_safe(chunk, next, &data->reading, entry) {
                if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
                        break;
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_write_chunk(chunk);
        }
        blk_finish_plug(&plug);

        if (xfs_zone_gc_should_start_new_work(data)) {
                set_current_state(TASK_RUNNING);
                blk_start_plug(&plug);
                while (xfs_zone_gc_start_chunk(data))
                        ;
                blk_finish_plug(&plug);
        }
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before. Because of that, reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
static int
xfs_zoned_gcd(
        void                    *private)
{
        struct xfs_zone_gc_data *data = private;
        struct xfs_mount        *mp = data->mp;
        struct xfs_zone_info    *zi = mp->m_zone_info;
        unsigned int            nofs_flag;

        nofs_flag = memalloc_nofs_save();
        set_freezable();

        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
                xfs_set_zonegc_running(mp);

                xfs_zone_gc_handle_work(data);

                /*
                 * Only sleep if nothing set the state to running. Otherwise
                 * check for work again as someone might have queued up more
                 * work and woken us in the meantime.
                 */
                if (get_current_state() == TASK_RUNNING) {
                        try_to_freeze();
                        continue;
                }

                if (list_empty(&data->reading) &&
                    list_empty(&data->writing) &&
                    list_empty(&data->resetting) &&
                    !zi->zi_reset_list) {
                        xfs_clear_zonegc_running(mp);
                        xfs_zoned_resv_wake_all(mp);

                        if (kthread_should_stop()) {
                                __set_current_state(TASK_RUNNING);
                                break;
                        }

                        if (kthread_should_park()) {
                                __set_current_state(TASK_RUNNING);
                                kthread_parkme();
                                continue;
                        }
                }

                schedule();
        }
        xfs_clear_zonegc_running(mp);

        if (data->iter.victim_rtg)
                xfs_rtgroup_rele(data->iter.victim_rtg);

        memalloc_nofs_restore(nofs_flag);
        xfs_zone_gc_data_free(data);
        return 0;
}

void
xfs_zone_gc_start(
        struct xfs_mount        *mp)
{
        if (xfs_has_zoned(mp))
                kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
        struct xfs_mount        *mp)
{
        if (xfs_has_zoned(mp))
                kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
        struct xfs_mount        *mp)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;
        struct xfs_zone_gc_data *data;
        struct xfs_open_zone    *oz;
        int                     error;

        /*
         * If there are no free zones available for GC, pick the open zone with
         * the least used space to GC into. This should only happen after an
         * unclean shutdown near ENOSPC while GC was ongoing.
         *
         * We also need to do this for the first gc zone allocation if we
         * unmounted while at the open limit.
         */
        if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
            zi->zi_nr_open_zones == mp->m_max_open_zones)
                oz = xfs_zone_gc_steal_open(zi);
        else
                oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
        if (!oz) {
                xfs_warn(mp, "unable to allocate a zone for gc");
                error = -EIO;
                goto out;
        }

        trace_xfs_zone_gc_target_opened(oz->oz_rtg);
        zi->zi_open_gc_zone = oz;

        data = xfs_zone_gc_data_alloc(mp);
        if (!data) {
                error = -ENOMEM;
                goto out_put_gc_zone;
        }

        zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
                        "xfs-zone-gc/%s", mp->m_super->s_id);
        if (IS_ERR(zi->zi_gc_thread)) {
                xfs_warn(mp, "unable to create zone gc thread");
                error = PTR_ERR(zi->zi_gc_thread);
                goto out_free_gc_data;
        }

        /* xfs_zone_gc_start will unpark for rw mounts */
        kthread_park(zi->zi_gc_thread);
        return 0;

out_free_gc_data:
        kfree(data);
out_put_gc_zone:
        xfs_open_zone_put(zi->zi_open_gc_zone);
out:
        return error;
}

void
xfs_zone_gc_unmount(
        struct xfs_mount        *mp)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;

        kthread_stop(zi->zi_gc_thread);
        if (zi->zi_open_gc_zone)
                xfs_open_zone_put(zi->zi_open_gc_zone);
}