Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'xfs-6.15-merge' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs updates from Carlos Maiolino:

- XFS zoned allocator: Enables XFS to support zoned devices using its
real-time allocator

- Use folios/vmalloc for buffer cache backing memory

- Some code cleanups and bug fixes

* tag 'xfs-6.15-merge' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (70 commits)
xfs: remove the flags argument to xfs_buf_get_uncached
xfs: remove the flags argument to xfs_buf_read_uncached
xfs: remove xfs_buf_free_maps
xfs: remove xfs_buf_get_maps
xfs: call xfs_buf_alloc_backing_mem from _xfs_buf_alloc
xfs: remove unnecessary NULL check before kvfree()
xfs: don't wake zone space waiters without m_zone_info
xfs: don't increment m_generation for all errors in xfs_growfs_data
xfs: fix a missing unlock in xfs_growfs_data
xfs: Remove duplicate xfs_rtbitmap.h header
xfs: trigger zone GC when out of available rt blocks
xfs: trace what memory backs a buffer
xfs: cleanup mapping tmpfs folios into the buffer cache
xfs: use vmalloc instead of vm_map_area for buffer backing memory
xfs: buffer items don't straddle pages anymore
xfs: kill XBF_UNMAPPED
xfs: convert buffer cache to use high order folios
xfs: remove the kmalloc to page allocator fallback
xfs: refactor backing memory allocations for buffers
xfs: remove xfs_buf_is_vmapped
...

+5934 -1502
+6 -1
fs/xfs/Makefile
··· 64 64 xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ 65 65 xfs_rtbitmap.o \ 66 66 xfs_rtgroup.o \ 67 + xfs_zones.o \ 67 68 ) 68 69 69 70 # highlevel code ··· 137 136 xfs_quotaops.o 138 137 139 138 # xfs_rtbitmap is shared with libxfs 140 - xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o 139 + xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ 140 + xfs_zone_alloc.o \ 141 + xfs_zone_gc.o \ 142 + xfs_zone_info.o \ 143 + xfs_zone_space_resv.o 141 144 142 145 xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o 143 146 xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
+1 -1
fs/xfs/libxfs/xfs_ag.c
··· 301 301 struct xfs_buf *bp; 302 302 int error; 303 303 304 - error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); 304 + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp); 305 305 if (error) 306 306 return error; 307 307
+23 -293
fs/xfs/libxfs/xfs_bmap.c
··· 34 34 #include "xfs_ag.h" 35 35 #include "xfs_ag_resv.h" 36 36 #include "xfs_refcount.h" 37 - #include "xfs_icache.h" 38 37 #include "xfs_iomap.h" 39 38 #include "xfs_health.h" 40 39 #include "xfs_bmap_item.h" 41 40 #include "xfs_symlink_remote.h" 42 41 #include "xfs_inode_util.h" 43 42 #include "xfs_rtgroup.h" 43 + #include "xfs_zone_alloc.h" 44 44 45 45 struct kmem_cache *xfs_bmap_intent_cache; 46 46 ··· 171 171 * Compute the worst-case number of indirect blocks that will be used 172 172 * for ip's delayed extent of length "len". 173 173 */ 174 - STATIC xfs_filblks_t 174 + xfs_filblks_t 175 175 xfs_bmap_worst_indlen( 176 - xfs_inode_t *ip, /* incore inode pointer */ 177 - xfs_filblks_t len) /* delayed extent length */ 176 + struct xfs_inode *ip, /* incore inode pointer */ 177 + xfs_filblks_t len) /* delayed extent length */ 178 178 { 179 - int level; /* btree level number */ 180 - int maxrecs; /* maximum record count at this level */ 181 - xfs_mount_t *mp; /* mount structure */ 182 - xfs_filblks_t rval; /* return value */ 179 + struct xfs_mount *mp = ip->i_mount; 180 + int maxrecs = mp->m_bmap_dmxr[0]; 181 + int level; 182 + xfs_filblks_t rval; 183 183 184 - mp = ip->i_mount; 185 - maxrecs = mp->m_bmap_dmxr[0]; 186 184 for (level = 0, rval = 0; 187 185 level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); 188 186 level++) { ··· 2570 2572 } 2571 2573 2572 2574 /* 2573 - * Convert a hole to a delayed allocation. 
2574 - */ 2575 - STATIC void 2576 - xfs_bmap_add_extent_hole_delay( 2577 - xfs_inode_t *ip, /* incore inode pointer */ 2578 - int whichfork, 2579 - struct xfs_iext_cursor *icur, 2580 - xfs_bmbt_irec_t *new) /* new data to add to file extents */ 2581 - { 2582 - struct xfs_ifork *ifp; /* inode fork pointer */ 2583 - xfs_bmbt_irec_t left; /* left neighbor extent entry */ 2584 - xfs_filblks_t newlen=0; /* new indirect size */ 2585 - xfs_filblks_t oldlen=0; /* old indirect size */ 2586 - xfs_bmbt_irec_t right; /* right neighbor extent entry */ 2587 - uint32_t state = xfs_bmap_fork_to_state(whichfork); 2588 - xfs_filblks_t temp; /* temp for indirect calculations */ 2589 - 2590 - ifp = xfs_ifork_ptr(ip, whichfork); 2591 - ASSERT(isnullstartblock(new->br_startblock)); 2592 - 2593 - /* 2594 - * Check and set flags if this segment has a left neighbor 2595 - */ 2596 - if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { 2597 - state |= BMAP_LEFT_VALID; 2598 - if (isnullstartblock(left.br_startblock)) 2599 - state |= BMAP_LEFT_DELAY; 2600 - } 2601 - 2602 - /* 2603 - * Check and set flags if the current (right) segment exists. 2604 - * If it doesn't exist, we're converting the hole at end-of-file. 2605 - */ 2606 - if (xfs_iext_get_extent(ifp, icur, &right)) { 2607 - state |= BMAP_RIGHT_VALID; 2608 - if (isnullstartblock(right.br_startblock)) 2609 - state |= BMAP_RIGHT_DELAY; 2610 - } 2611 - 2612 - /* 2613 - * Set contiguity flags on the left and right neighbors. 2614 - * Don't let extents get too large, even if the pieces are contiguous. 
2615 - */ 2616 - if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && 2617 - left.br_startoff + left.br_blockcount == new->br_startoff && 2618 - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) 2619 - state |= BMAP_LEFT_CONTIG; 2620 - 2621 - if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && 2622 - new->br_startoff + new->br_blockcount == right.br_startoff && 2623 - new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && 2624 - (!(state & BMAP_LEFT_CONTIG) || 2625 - (left.br_blockcount + new->br_blockcount + 2626 - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) 2627 - state |= BMAP_RIGHT_CONTIG; 2628 - 2629 - /* 2630 - * Switch out based on the contiguity flags. 2631 - */ 2632 - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { 2633 - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: 2634 - /* 2635 - * New allocation is contiguous with delayed allocations 2636 - * on the left and on the right. 2637 - * Merge all three into a single extent record. 2638 - */ 2639 - temp = left.br_blockcount + new->br_blockcount + 2640 - right.br_blockcount; 2641 - 2642 - oldlen = startblockval(left.br_startblock) + 2643 - startblockval(new->br_startblock) + 2644 - startblockval(right.br_startblock); 2645 - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2646 - oldlen); 2647 - left.br_startblock = nullstartblock(newlen); 2648 - left.br_blockcount = temp; 2649 - 2650 - xfs_iext_remove(ip, icur, state); 2651 - xfs_iext_prev(ifp, icur); 2652 - xfs_iext_update_extent(ip, state, icur, &left); 2653 - break; 2654 - 2655 - case BMAP_LEFT_CONTIG: 2656 - /* 2657 - * New allocation is contiguous with a delayed allocation 2658 - * on the left. 2659 - * Merge the new allocation with the left neighbor. 
2660 - */ 2661 - temp = left.br_blockcount + new->br_blockcount; 2662 - 2663 - oldlen = startblockval(left.br_startblock) + 2664 - startblockval(new->br_startblock); 2665 - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2666 - oldlen); 2667 - left.br_blockcount = temp; 2668 - left.br_startblock = nullstartblock(newlen); 2669 - 2670 - xfs_iext_prev(ifp, icur); 2671 - xfs_iext_update_extent(ip, state, icur, &left); 2672 - break; 2673 - 2674 - case BMAP_RIGHT_CONTIG: 2675 - /* 2676 - * New allocation is contiguous with a delayed allocation 2677 - * on the right. 2678 - * Merge the new allocation with the right neighbor. 2679 - */ 2680 - temp = new->br_blockcount + right.br_blockcount; 2681 - oldlen = startblockval(new->br_startblock) + 2682 - startblockval(right.br_startblock); 2683 - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2684 - oldlen); 2685 - right.br_startoff = new->br_startoff; 2686 - right.br_startblock = nullstartblock(newlen); 2687 - right.br_blockcount = temp; 2688 - xfs_iext_update_extent(ip, state, icur, &right); 2689 - break; 2690 - 2691 - case 0: 2692 - /* 2693 - * New allocation is not contiguous with another 2694 - * delayed allocation. 2695 - * Insert a new entry. 2696 - */ 2697 - oldlen = newlen = 0; 2698 - xfs_iext_insert(ip, icur, new, state); 2699 - break; 2700 - } 2701 - if (oldlen != newlen) { 2702 - ASSERT(oldlen > newlen); 2703 - xfs_add_fdblocks(ip->i_mount, oldlen - newlen); 2704 - 2705 - /* 2706 - * Nothing to do for disk quota accounting here. 2707 - */ 2708 - xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); 2709 - } 2710 - } 2711 - 2712 - /* 2713 2575 * Convert a hole to a real allocation. 2714 2576 */ 2715 2577 STATIC int /* error */ ··· 3897 4039 return 0; 3898 4040 } 3899 4041 3900 - /* 3901 - * Add a delayed allocation extent to an inode. Blocks are reserved from the 3902 - * global pool and the extent inserted into the inode in-core extent tree. 
3903 - * 3904 - * On entry, got refers to the first extent beyond the offset of the extent to 3905 - * allocate or eof is specified if no such extent exists. On return, got refers 3906 - * to the extent record that was inserted to the inode fork. 3907 - * 3908 - * Note that the allocated extent may have been merged with contiguous extents 3909 - * during insertion into the inode fork. Thus, got does not reflect the current 3910 - * state of the inode fork on return. If necessary, the caller can use lastx to 3911 - * look up the updated record in the inode fork. 3912 - */ 3913 - int 3914 - xfs_bmapi_reserve_delalloc( 3915 - struct xfs_inode *ip, 3916 - int whichfork, 3917 - xfs_fileoff_t off, 3918 - xfs_filblks_t len, 3919 - xfs_filblks_t prealloc, 3920 - struct xfs_bmbt_irec *got, 3921 - struct xfs_iext_cursor *icur, 3922 - int eof) 3923 - { 3924 - struct xfs_mount *mp = ip->i_mount; 3925 - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 3926 - xfs_extlen_t alen; 3927 - xfs_extlen_t indlen; 3928 - uint64_t fdblocks; 3929 - int error; 3930 - xfs_fileoff_t aoff; 3931 - bool use_cowextszhint = 3932 - whichfork == XFS_COW_FORK && !prealloc; 3933 - 3934 - retry: 3935 - /* 3936 - * Cap the alloc length. Keep track of prealloc so we know whether to 3937 - * tag the inode before we return. 3938 - */ 3939 - aoff = off; 3940 - alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); 3941 - if (!eof) 3942 - alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); 3943 - if (prealloc && alen >= len) 3944 - prealloc = alen - len; 3945 - 3946 - /* 3947 - * If we're targetting the COW fork but aren't creating a speculative 3948 - * posteof preallocation, try to expand the reservation to align with 3949 - * the COW extent size hint if there's sufficient free space. 
3950 - * 3951 - * Unlike the data fork, the CoW cancellation functions will free all 3952 - * the reservations at inactivation, so we don't require that every 3953 - * delalloc reservation have a dirty pagecache. 3954 - */ 3955 - if (use_cowextszhint) { 3956 - struct xfs_bmbt_irec prev; 3957 - xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); 3958 - 3959 - if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) 3960 - prev.br_startoff = NULLFILEOFF; 3961 - 3962 - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, 3963 - 1, 0, &aoff, &alen); 3964 - ASSERT(!error); 3965 - } 3966 - 3967 - /* 3968 - * Make a transaction-less quota reservation for delayed allocation 3969 - * blocks. This number gets adjusted later. We return if we haven't 3970 - * allocated blocks already inside this loop. 3971 - */ 3972 - error = xfs_quota_reserve_blkres(ip, alen); 3973 - if (error) 3974 - goto out; 3975 - 3976 - /* 3977 - * Split changing sb for alen and indlen since they could be coming 3978 - * from different places. 3979 - */ 3980 - indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); 3981 - ASSERT(indlen > 0); 3982 - 3983 - fdblocks = indlen; 3984 - if (XFS_IS_REALTIME_INODE(ip)) { 3985 - error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); 3986 - if (error) 3987 - goto out_unreserve_quota; 3988 - } else { 3989 - fdblocks += alen; 3990 - } 3991 - 3992 - error = xfs_dec_fdblocks(mp, fdblocks, false); 3993 - if (error) 3994 - goto out_unreserve_frextents; 3995 - 3996 - ip->i_delayed_blks += alen; 3997 - xfs_mod_delalloc(ip, alen, indlen); 3998 - 3999 - got->br_startoff = aoff; 4000 - got->br_startblock = nullstartblock(indlen); 4001 - got->br_blockcount = alen; 4002 - got->br_state = XFS_EXT_NORM; 4003 - 4004 - xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); 4005 - 4006 - /* 4007 - * Tag the inode if blocks were preallocated. 
Note that COW fork 4008 - * preallocation can occur at the start or end of the extent, even when 4009 - * prealloc == 0, so we must also check the aligned offset and length. 4010 - */ 4011 - if (whichfork == XFS_DATA_FORK && prealloc) 4012 - xfs_inode_set_eofblocks_tag(ip); 4013 - if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) 4014 - xfs_inode_set_cowblocks_tag(ip); 4015 - 4016 - return 0; 4017 - 4018 - out_unreserve_frextents: 4019 - if (XFS_IS_REALTIME_INODE(ip)) 4020 - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); 4021 - out_unreserve_quota: 4022 - if (XFS_IS_QUOTA_ON(mp)) 4023 - xfs_quota_unreserve_blkres(ip, alen); 4024 - out: 4025 - if (error == -ENOSPC || error == -EDQUOT) { 4026 - trace_xfs_delalloc_enospc(ip, off, len); 4027 - 4028 - if (prealloc || use_cowextszhint) { 4029 - /* retry without any preallocation */ 4030 - use_cowextszhint = false; 4031 - prealloc = 0; 4032 - goto retry; 4033 - } 4034 - } 4035 - return error; 4036 - } 4037 - 4038 4042 static int 4039 4043 xfs_bmapi_allocate( 4040 4044 struct xfs_bmalloca *bma) ··· 4668 4948 int whichfork, 4669 4949 struct xfs_iext_cursor *icur, 4670 4950 struct xfs_bmbt_irec *got, 4671 - struct xfs_bmbt_irec *del) 4951 + struct xfs_bmbt_irec *del, 4952 + uint32_t bflags) /* bmapi flags */ 4672 4953 { 4673 4954 struct xfs_mount *mp = ip->i_mount; 4674 4955 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); ··· 4789 5068 da_diff = da_old - da_new; 4790 5069 fdblocks = da_diff; 4791 5070 4792 - if (isrt) 4793 - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount)); 4794 - else 5071 + if (bflags & XFS_BMAPI_REMAP) { 5072 + ; 5073 + } else if (isrt) { 5074 + xfs_rtbxlen_t rtxlen; 5075 + 5076 + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount); 5077 + if (xfs_is_zoned_inode(ip)) 5078 + xfs_zoned_add_available(mp, rtxlen); 5079 + xfs_add_frextents(mp, rtxlen); 5080 + } else { 4795 5081 fdblocks += del->br_blockcount; 5082 + } 4796 5083 4797 5084 
xfs_add_fdblocks(mp, fdblocks); 4798 5085 xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); ··· 5399 5670 5400 5671 delete: 5401 5672 if (wasdel) { 5402 - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); 5673 + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, 5674 + &del, flags); 5403 5675 } else { 5404 5676 error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, 5405 5677 &del, &tmp_logflags, whichfork,
+2 -5
fs/xfs/libxfs/xfs_bmap.h
··· 204 204 xfs_extnum_t nexts, int *done); 205 205 void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, 206 206 struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, 207 - struct xfs_bmbt_irec *del); 207 + struct xfs_bmbt_irec *del, uint32_t bflags); 208 208 void xfs_bmap_del_extent_cow(struct xfs_inode *ip, 209 209 struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, 210 210 struct xfs_bmbt_irec *del); ··· 219 219 bool *done, xfs_fileoff_t stop_fsb); 220 220 int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, 221 221 xfs_fileoff_t split_offset); 222 - int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, 223 - xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, 224 - struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, 225 - int eof); 226 222 int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, 227 223 xfs_off_t offset, struct iomap *iomap, unsigned int *seq); 228 224 int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, ··· 229 233 int fork); 230 234 int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, 231 235 struct xfs_alloc_arg *args); 236 + xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len); 232 237 233 238 enum xfs_bmap_intent_type { 234 239 XFS_BMAP_MAP = 1,
+16 -4
fs/xfs/libxfs/xfs_format.h
··· 178 178 179 179 xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ 180 180 xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ 181 - 182 181 uint8_t sb_rgblklog; /* rt group number shift */ 183 182 uint8_t sb_pad[7]; /* zeroes */ 183 + xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */ 184 + xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */ 184 185 185 186 /* must be padded to 64 bit alignment */ 186 187 } xfs_sb_t; ··· 271 270 __be64 sb_metadirino; /* metadata directory tree root */ 272 271 __be32 sb_rgcount; /* # of realtime groups */ 273 272 __be32 sb_rgextents; /* size of rtgroup in rtx */ 274 - 275 273 __u8 sb_rgblklog; /* rt group number shift */ 276 274 __u8 sb_pad[7]; /* zeroes */ 275 + __be64 sb_rtstart; /* start of internal RT section (FSB) */ 276 + __be64 sb_rtreserved; /* reserved (zoned) RT blocks */ 277 277 278 278 /* 279 279 * The size of this structure must be padded to 64 bit alignment. ··· 397 395 #define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ 398 396 #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ 399 397 #define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ 398 + #define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */ 399 + #define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */ 400 + 400 401 #define XFS_SB_FEAT_INCOMPAT_ALL \ 401 402 (XFS_SB_FEAT_INCOMPAT_FTYPE | \ 402 403 XFS_SB_FEAT_INCOMPAT_SPINODES | \ ··· 409 404 XFS_SB_FEAT_INCOMPAT_NREXT64 | \ 410 405 XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ 411 406 XFS_SB_FEAT_INCOMPAT_PARENT | \ 412 - XFS_SB_FEAT_INCOMPAT_METADIR) 407 + XFS_SB_FEAT_INCOMPAT_METADIR | \ 408 + XFS_SB_FEAT_INCOMPAT_ZONED | \ 409 + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS) 413 410 414 411 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL 415 412 static inline bool ··· 959 952 __be64 di_changecount; /* number of attribute changes */ 960 953 __be64 di_lsn; /* flush sequence */ 
961 954 __be64 di_flags2; /* more random flags */ 962 - __be32 di_cowextsize; /* basic cow extent size for file */ 955 + union { 956 + /* basic cow extent size for (regular) file */ 957 + __be32 di_cowextsize; 958 + /* used blocks in RTG for (zoned) rtrmap inode */ 959 + __be32 di_used_blocks; 960 + }; 963 961 __u8 di_pad2[12]; /* more padding for future expansion */ 964 962 965 963 /* fields only written to during inode creation */
+13 -1
fs/xfs/libxfs/xfs_fs.h
··· 189 189 uint32_t checked; /* o: checked fs & rt metadata */ 190 190 __u32 rgextents; /* rt extents in a realtime group */ 191 191 __u32 rgcount; /* number of realtime groups */ 192 - __u64 reserved[16]; /* reserved space */ 192 + __u64 rtstart; /* start of internal rt section */ 193 + __u64 rtreserved; /* RT (zoned) reserved blocks */ 194 + __u64 reserved[14]; /* reserved space */ 193 195 }; 194 196 195 197 #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ ··· 249 247 #define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ 250 248 #define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ 251 249 #define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */ 250 + #define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */ 252 251 253 252 /* 254 253 * Minimum and maximum sizes need for growth checks. ··· 1082 1079 #define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) 1083 1080 /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ 1084 1081 1082 + /* 1083 + * Devices supported by a single XFS file system. Reported in fsmaps fmr_device 1084 + * when using internal RT devices. 1085 + */ 1086 + enum xfs_device { 1087 + XFS_DEV_DATA = 1, 1088 + XFS_DEV_LOG = 2, 1089 + XFS_DEV_RT = 3, 1090 + }; 1085 1091 1086 1092 #ifndef HAVE_BBMACROS 1087 1093 /*
+25 -6
fs/xfs/libxfs/xfs_group.h
··· 19 19 #ifdef __KERNEL__ 20 20 /* -- kernel only structures below this line -- */ 21 21 22 - /* 23 - * Track freed but not yet committed extents. 24 - */ 25 - struct xfs_extent_busy_tree *xg_busy_extents; 22 + union { 23 + /* 24 + * For perags and non-zoned RT groups: 25 + * Track freed but not yet committed extents. 26 + */ 27 + struct xfs_extent_busy_tree *xg_busy_extents; 28 + 29 + /* 30 + * For zoned RT groups: 31 + * List of groups that need a zone reset. 32 + * 33 + * The zonegc code forces a log flush of the rtrmap inode before 34 + * resetting the write pointer, so there is no need for 35 + * individual busy extent tracking. 36 + */ 37 + struct xfs_group *xg_next_reset; 38 + }; 26 39 27 40 /* 28 41 * Bitsets of per-ag metadata that have been checked and/or are sick. ··· 120 107 xfs_agblock_t gbno) 121 108 { 122 109 struct xfs_mount *mp = xg->xg_mount; 123 - uint32_t blocks = mp->m_groups[xg->xg_type].blocks; 110 + struct xfs_groups *g = &mp->m_groups[xg->xg_type]; 111 + xfs_fsblock_t fsbno; 124 112 125 - return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno); 113 + if (g->has_daddr_gaps) 114 + fsbno = xfs_gbno_to_fsb(xg, gbno); 115 + else 116 + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno; 117 + 118 + return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno); 126 119 } 127 120 128 121 static inline uint32_t
+2 -2
fs/xfs/libxfs/xfs_ialloc.c
··· 364 364 (j * M_IGEO(mp)->blocks_per_cluster)); 365 365 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 366 366 mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, 367 - XBF_UNMAPPED, &fbuf); 367 + 0, &fbuf); 368 368 if (error) 369 369 return error; 370 370 ··· 1927 1927 * that we can immediately allocate, but then we allow allocation on the 1928 1928 * second pass if we fail to find an AG with free inodes in it. 1929 1929 */ 1930 - if (percpu_counter_read_positive(&mp->m_fdblocks) < 1930 + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < 1931 1931 mp->m_low_space[XFS_LOWSP_1_PCNT]) { 1932 1932 ok_alloc = false; 1933 1933 low_space = true;
+17 -6
fs/xfs/libxfs/xfs_inode_buf.c
··· 137 137 int error; 138 138 139 139 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 140 - imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); 140 + imap->im_len, 0, bpp, &xfs_inode_buf_ops); 141 141 if (xfs_metadata_is_sick(error)) 142 142 xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), 143 143 XFS_SICK_AG_INODES); ··· 252 252 be64_to_cpu(from->di_changecount)); 253 253 ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); 254 254 ip->i_diflags2 = be64_to_cpu(from->di_flags2); 255 + /* also covers the di_used_blocks union arm: */ 255 256 ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); 257 + BUILD_BUG_ON(sizeof(from->di_cowextsize) != 258 + sizeof(from->di_used_blocks)); 256 259 } 257 260 258 261 error = xfs_iformat_data_fork(ip, from); ··· 352 349 to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); 353 350 to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); 354 351 to->di_flags2 = cpu_to_be64(ip->i_diflags2); 352 + /* also covers the di_used_blocks union arm: */ 355 353 to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); 356 354 to->di_ino = cpu_to_be64(ip->i_ino); 357 355 to->di_lsn = cpu_to_be64(lsn); ··· 756 752 !xfs_has_rtreflink(mp)) 757 753 return __this_address; 758 754 759 - /* COW extent size hint validation */ 760 - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), 761 - mode, flags, flags2); 762 - if (fa) 763 - return fa; 755 + if (xfs_has_zoned(mp) && 756 + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { 757 + if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) 758 + return __this_address; 759 + } else { 760 + /* COW extent size hint validation */ 761 + fa = xfs_inode_validate_cowextsize(mp, 762 + be32_to_cpu(dip->di_cowextsize), 763 + mode, flags, flags2); 764 + if (fa) 765 + return fa; 766 + } 764 767 765 768 /* bigtime iflag can only happen on bigtime filesystems */ 766 769 if (xfs_dinode_has_bigtime(dip) &&
+1
fs/xfs/libxfs/xfs_inode_util.c
··· 322 322 323 323 if (xfs_has_v3inodes(mp)) { 324 324 inode_set_iversion(inode, 1); 325 + /* also covers the di_used_blocks union arm: */ 325 326 ip->i_cowextsize = 0; 326 327 times |= XFS_ICHGTIME_CREATE; 327 328 }
+6 -1
fs/xfs/libxfs/xfs_log_format.h
··· 475 475 xfs_lsn_t di_lsn; 476 476 477 477 uint64_t di_flags2; /* more random flags */ 478 - uint32_t di_cowextsize; /* basic cow extent size for file */ 478 + union { 479 + /* basic cow extent size for (regular) file */ 480 + uint32_t di_cowextsize; 481 + /* used blocks in RTG for (zoned) rtrmap inode */ 482 + uint32_t di_used_blocks; 483 + }; 479 484 uint8_t di_pad2[12]; /* more padding for future expansion */ 480 485 481 486 /* fields only written to during inode creation */
+111 -64
fs/xfs/libxfs/xfs_metafile.c
··· 21 21 #include "xfs_errortag.h" 22 22 #include "xfs_error.h" 23 23 #include "xfs_alloc.h" 24 + #include "xfs_rtgroup.h" 25 + #include "xfs_rtrmap_btree.h" 26 + #include "xfs_rtrefcount_btree.h" 24 27 25 28 static const struct { 26 29 enum xfs_metafile_type mtype; ··· 77 74 } 78 75 79 76 /* 80 - * Is the amount of space that could be allocated towards a given metadata 81 - * file at or beneath a certain threshold? 77 + * Is the metafile reservations at or beneath a certain threshold? 82 78 */ 83 79 static inline bool 84 80 xfs_metafile_resv_can_cover( 85 - struct xfs_inode *ip, 81 + struct xfs_mount *mp, 86 82 int64_t rhs) 87 83 { 88 84 /* ··· 90 88 * global free block count. Take care of the first case to avoid 91 89 * touching the per-cpu counter. 92 90 */ 93 - if (ip->i_delayed_blks >= rhs) 91 + if (mp->m_metafile_resv_avail >= rhs) 94 92 return true; 95 93 96 94 /* 97 95 * There aren't enough blocks left in the inode's reservation, but it 98 96 * isn't critical unless there also isn't enough free space. 99 97 */ 100 - return __percpu_counter_compare(&ip->i_mount->m_fdblocks, 101 - rhs - ip->i_delayed_blks, 2048) >= 0; 98 + return xfs_compare_freecounter(mp, XC_FREE_BLOCKS, 99 + rhs - mp->m_metafile_resv_avail, 2048) >= 0; 102 100 } 103 101 104 102 /* 105 - * Is this metadata file critically low on blocks? For now we'll define that 106 - * as the number of blocks we can get our hands on being less than 10% of what 107 - * we reserved or less than some arbitrary number (maximum btree height). 103 + * Is the metafile reservation critically low on blocks? For now we'll define 104 + * that as the number of blocks we can get our hands on being less than 10% of 105 + * what we reserved or less than some arbitrary number (maximum btree height). 
108 106 */ 109 107 bool 110 108 xfs_metafile_resv_critical( 111 - struct xfs_inode *ip) 109 + struct xfs_mount *mp) 112 110 { 113 - uint64_t asked_low_water; 111 + ASSERT(xfs_has_metadir(mp)); 114 112 115 - if (!ip) 116 - return false; 113 + trace_xfs_metafile_resv_critical(mp, 0); 117 114 118 - ASSERT(xfs_is_metadir_inode(ip)); 119 - trace_xfs_metafile_resv_critical(ip, 0); 120 - 121 - if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels)) 115 + if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels)) 122 116 return true; 123 117 124 - asked_low_water = div_u64(ip->i_meta_resv_asked, 10); 125 - if (!xfs_metafile_resv_can_cover(ip, asked_low_water)) 118 + if (!xfs_metafile_resv_can_cover(mp, 119 + div_u64(mp->m_metafile_resv_target, 10))) 126 120 return true; 127 121 128 - return XFS_TEST_ERROR(false, ip->i_mount, 129 - XFS_ERRTAG_METAFILE_RESV_CRITICAL); 122 + return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); 130 123 } 131 124 132 125 /* Allocate a block from the metadata file's reservation. */ ··· 130 133 struct xfs_inode *ip, 131 134 struct xfs_alloc_arg *args) 132 135 { 136 + struct xfs_mount *mp = ip->i_mount; 133 137 int64_t len = args->len; 134 138 135 139 ASSERT(xfs_is_metadir_inode(ip)); 136 140 ASSERT(args->resv == XFS_AG_RESV_METAFILE); 137 141 138 - trace_xfs_metafile_resv_alloc_space(ip, args->len); 142 + trace_xfs_metafile_resv_alloc_space(mp, args->len); 139 143 140 144 /* 141 145 * Allocate the blocks from the metadata inode's block reservation 142 146 * and update the ondisk sb counter. 
143 147 */ 144 - if (ip->i_delayed_blks > 0) { 148 + mutex_lock(&mp->m_metafile_resv_lock); 149 + if (mp->m_metafile_resv_avail > 0) { 145 150 int64_t from_resv; 146 151 147 - from_resv = min_t(int64_t, len, ip->i_delayed_blks); 148 - ip->i_delayed_blks -= from_resv; 152 + from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail); 153 + mp->m_metafile_resv_avail -= from_resv; 149 154 xfs_mod_delalloc(ip, 0, -from_resv); 150 155 xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, 151 156 -from_resv); ··· 174 175 xfs_trans_mod_sb(args->tp, field, -len); 175 176 } 176 177 178 + mp->m_metafile_resv_used += args->len; 179 + mutex_unlock(&mp->m_metafile_resv_lock); 180 + 177 181 ip->i_nblocks += args->len; 178 182 xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); 179 183 } ··· 188 186 struct xfs_trans *tp, 189 187 xfs_filblks_t len) 190 188 { 189 + struct xfs_mount *mp = ip->i_mount; 191 190 int64_t to_resv; 192 191 193 192 ASSERT(xfs_is_metadir_inode(ip)); 194 - trace_xfs_metafile_resv_free_space(ip, len); 193 + 194 + trace_xfs_metafile_resv_free_space(mp, len); 195 195 196 196 ip->i_nblocks -= len; 197 197 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 198 + 199 + mutex_lock(&mp->m_metafile_resv_lock); 200 + mp->m_metafile_resv_used -= len; 198 201 199 202 /* 200 203 * Add the freed blocks back into the inode's delalloc reservation 201 204 * until it reaches the maximum size. Update the ondisk fdblocks only. 
202 205 */ 203 - to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); 206 + to_resv = mp->m_metafile_resv_target - 207 + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); 204 208 if (to_resv > 0) { 205 209 to_resv = min_t(int64_t, to_resv, len); 206 - ip->i_delayed_blks += to_resv; 210 + mp->m_metafile_resv_avail += to_resv; 207 211 xfs_mod_delalloc(ip, 0, to_resv); 208 212 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); 209 213 len -= to_resv; 210 214 } 215 + mutex_unlock(&mp->m_metafile_resv_lock); 211 216 212 217 /* 213 218 * Everything else goes back to the filesystem, so update the in-core ··· 224 215 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); 225 216 } 226 217 227 - /* Release a metadata file's space reservation. */ 228 - void 229 - xfs_metafile_resv_free( 230 - struct xfs_inode *ip) 218 + static void 219 + __xfs_metafile_resv_free( 220 + struct xfs_mount *mp) 231 221 { 232 - /* Non-btree metadata inodes don't need space reservations. */ 233 - if (!ip || !ip->i_meta_resv_asked) 234 - return; 235 - 236 - ASSERT(xfs_is_metadir_inode(ip)); 237 - trace_xfs_metafile_resv_free(ip, 0); 238 - 239 - if (ip->i_delayed_blks) { 240 - xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks); 241 - xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks); 242 - ip->i_delayed_blks = 0; 222 + if (mp->m_metafile_resv_avail) { 223 + xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail); 224 + xfs_add_fdblocks(mp, mp->m_metafile_resv_avail); 243 225 } 244 - ip->i_meta_resv_asked = 0; 226 + mp->m_metafile_resv_avail = 0; 227 + mp->m_metafile_resv_used = 0; 228 + mp->m_metafile_resv_target = 0; 245 229 } 246 230 247 - /* Set up a metadata file's space reservation. */ 231 + /* Release unused metafile space reservation. 
*/ 232 + void 233 + xfs_metafile_resv_free( 234 + struct xfs_mount *mp) 235 + { 236 + if (!xfs_has_metadir(mp)) 237 + return; 238 + 239 + trace_xfs_metafile_resv_free(mp, 0); 240 + 241 + mutex_lock(&mp->m_metafile_resv_lock); 242 + __xfs_metafile_resv_free(mp); 243 + mutex_unlock(&mp->m_metafile_resv_lock); 244 + } 245 + 246 + /* Set up a metafile space reservation. */ 248 247 int 249 248 xfs_metafile_resv_init( 250 - struct xfs_inode *ip, 251 - xfs_filblks_t ask) 249 + struct xfs_mount *mp) 252 250 { 251 + struct xfs_rtgroup *rtg = NULL; 252 + xfs_filblks_t used = 0, target = 0; 253 253 xfs_filblks_t hidden_space; 254 - xfs_filblks_t used; 255 - int error; 254 + xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4; 255 + int error = 0; 256 256 257 - if (!ip || ip->i_meta_resv_asked > 0) 257 + if (!xfs_has_metadir(mp)) 258 258 return 0; 259 259 260 - ASSERT(xfs_is_metadir_inode(ip)); 260 + /* 261 + * Free any previous reservation to have a clean slate. 262 + */ 263 + mutex_lock(&mp->m_metafile_resv_lock); 264 + __xfs_metafile_resv_free(mp); 261 265 262 266 /* 263 - * Space taken by all other metadata btrees are accounted on-disk as 267 + * Currently the only btree metafiles that require reservations are the 268 + * rtrmap and the rtrefcount. Anything new will have to be added here 269 + * as well. 270 + */ 271 + while ((rtg = xfs_rtgroup_next(mp, rtg))) { 272 + if (xfs_has_rtrmapbt(mp)) { 273 + used += rtg_rmap(rtg)->i_nblocks; 274 + target += xfs_rtrmapbt_calc_reserves(mp); 275 + } 276 + if (xfs_has_rtreflink(mp)) { 277 + used += rtg_refcount(rtg)->i_nblocks; 278 + target += xfs_rtrefcountbt_calc_reserves(mp); 279 + } 280 + } 281 + 282 + if (!target) 283 + goto out_unlock; 284 + 285 + /* 286 + * Space taken by the per-AG metadata btrees are accounted on-disk as 264 287 * used space. We therefore only hide the space that is reserved but 265 288 * not used by the trees. 
266 289 */ 267 - used = ip->i_nblocks; 268 - if (used > ask) 269 - ask = used; 270 - hidden_space = ask - used; 290 + if (used > target) 291 + target = used; 292 + else if (target > dblocks_avail) 293 + target = dblocks_avail; 294 + hidden_space = target - used; 271 295 272 - error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true); 296 + error = xfs_dec_fdblocks(mp, hidden_space, true); 273 297 if (error) { 274 - trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_); 275 - return error; 298 + trace_xfs_metafile_resv_init_error(mp, 0); 299 + goto out_unlock; 276 300 } 277 301 278 - xfs_mod_delalloc(ip, 0, hidden_space); 279 - ip->i_delayed_blks = hidden_space; 280 - ip->i_meta_resv_asked = ask; 302 + xfs_mod_sb_delalloc(mp, hidden_space); 281 303 282 - trace_xfs_metafile_resv_init(ip, ask); 283 - return 0; 304 + mp->m_metafile_resv_target = target; 305 + mp->m_metafile_resv_used = used; 306 + mp->m_metafile_resv_avail = hidden_space; 307 + 308 + trace_xfs_metafile_resv_init(mp, target); 309 + 310 + out_unlock: 311 + mutex_unlock(&mp->m_metafile_resv_lock); 312 + return error; 284 313 }
+3 -3
fs/xfs/libxfs/xfs_metafile.h
··· 26 26 /* Space reservations for metadata inodes. */ 27 27 struct xfs_alloc_arg; 28 28 29 - bool xfs_metafile_resv_critical(struct xfs_inode *ip); 29 + bool xfs_metafile_resv_critical(struct xfs_mount *mp); 30 30 void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, 31 31 struct xfs_alloc_arg *args); 32 32 void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, 33 33 xfs_filblks_t len); 34 - void xfs_metafile_resv_free(struct xfs_inode *ip); 35 - int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask); 34 + void xfs_metafile_resv_free(struct xfs_mount *mp); 35 + int xfs_metafile_resv_init(struct xfs_mount *mp); 36 36 37 37 /* Code specific to kernel/userspace; must be provided externally. */ 38 38
+4 -2
fs/xfs/libxfs/xfs_ondisk.h
··· 233 233 16299260424LL); 234 234 235 235 /* superblock field checks we got from xfs/122 */ 236 - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288); 237 - XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288); 236 + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304); 237 + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304); 238 238 XFS_CHECK_SB_OFFSET(sb_magicnum, 0); 239 239 XFS_CHECK_SB_OFFSET(sb_blocksize, 4); 240 240 XFS_CHECK_SB_OFFSET(sb_dblocks, 8); ··· 295 295 XFS_CHECK_SB_OFFSET(sb_rgextents, 276); 296 296 XFS_CHECK_SB_OFFSET(sb_rgblklog, 280); 297 297 XFS_CHECK_SB_OFFSET(sb_pad, 281); 298 + XFS_CHECK_SB_OFFSET(sb_rtstart, 288); 299 + XFS_CHECK_SB_OFFSET(sb_rtreserved, 296); 298 300 } 299 301 300 302 #endif /* __XFS_ONDISK_H */
+11
fs/xfs/libxfs/xfs_rtbitmap.c
··· 1123 1123 xfs_extlen_t mod; 1124 1124 int error; 1125 1125 1126 + ASSERT(!xfs_has_zoned(mp)); 1126 1127 ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); 1127 1128 1128 1129 mod = xfs_blen_to_rtxoff(mp, rtlen); ··· 1174 1173 return 0; 1175 1174 1176 1175 end = min(end, rtg->rtg_extents - 1); 1176 + 1177 + if (xfs_has_zoned(mp)) 1178 + return -EINVAL; 1177 1179 1178 1180 /* Iterate the bitmap, looking for discrepancies. */ 1179 1181 while (start <= end) { ··· 1272 1268 struct xfs_mount *mp, 1273 1269 xfs_rtbxlen_t rtextents) 1274 1270 { 1271 + if (xfs_has_zoned(mp)) 1272 + return 0; 1275 1273 return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp)); 1276 1274 } 1277 1275 ··· 1313 1307 { 1314 1308 xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp); 1315 1309 unsigned long long rsumwords; 1310 + 1311 + if (xfs_has_zoned(mp)) { 1312 + *rsumlevels = 0; 1313 + return 0; 1314 + } 1316 1315 1317 1316 *rsumlevels = xfs_compute_rextslog(rextents) + 1; 1318 1317 rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
+23 -16
fs/xfs/libxfs/xfs_rtgroup.c
··· 194 194 ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || 195 195 !(rtglock_flags & XFS_RTGLOCK_BITMAP)); 196 196 197 - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { 198 - /* 199 - * Lock both realtime free space metadata inodes for a freespace 200 - * update. 201 - */ 202 - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); 203 - xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); 204 - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { 205 - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); 197 + if (!xfs_has_zoned(rtg_mount(rtg))) { 198 + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { 199 + /* 200 + * Lock both realtime free space metadata inodes for a 201 + * freespace update. 202 + */ 203 + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); 204 + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); 205 + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { 206 + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); 207 + } 206 208 } 207 209 208 210 if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) ··· 230 228 if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) 231 229 xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); 232 230 233 - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { 234 - xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); 235 - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); 236 - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { 237 - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); 231 + if (!xfs_has_zoned(rtg_mount(rtg))) { 232 + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { 233 + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); 234 + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); 235 + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { 236 + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); 237 + } 238 238 } 239 239 } 240 240 ··· 253 249 ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); 254 250 ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); 255 251 256 - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { 252 + if (!xfs_has_zoned(rtg_mount(rtg)) && 253 + (rtglock_flags & 
XFS_RTGLOCK_BITMAP)) { 257 254 xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); 258 255 xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); 259 256 } ··· 275 270 /* Fill out form. */ 276 271 memset(rgeo, 0, sizeof(*rgeo)); 277 272 rgeo->rg_number = rtg_rgno(rtg); 278 - rgeo->rg_length = rtg_group(rtg)->xg_block_count; 273 + rgeo->rg_length = rtg_blocks(rtg); 279 274 xfs_rtgroup_geom_health(rtg, rgeo); 280 275 return 0; 281 276 } ··· 359 354 .sick = XFS_SICK_RG_BITMAP, 360 355 .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | 361 356 (1U << XFS_DINODE_FMT_BTREE), 357 + .enabled = xfs_has_nonzoned, 362 358 .create = xfs_rtbitmap_create, 363 359 }, 364 360 [XFS_RTGI_SUMMARY] = { ··· 368 362 .sick = XFS_SICK_RG_SUMMARY, 369 363 .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | 370 364 (1U << XFS_DINODE_FMT_BTREE), 365 + .enabled = xfs_has_nonzoned, 371 366 .create = xfs_rtsummary_create, 372 367 }, 373 368 [XFS_RTGI_RMAP] = {
+39 -11
fs/xfs/libxfs/xfs_rtgroup.h
··· 37 37 xfs_rtxnum_t rtg_extents; 38 38 39 39 /* 40 - * Cache of rt summary level per bitmap block with the invariant that 41 - * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0, 42 - * or 0 if rsum[i][bbno] == 0 for all i. 43 - * 40 + * For bitmap based RT devices this points to a cache of rt summary 41 + * level per bitmap block with the invariant that rtg_rsum_cache[bbno] 42 + * > the maximum i for which rsum[i][bbno] != 0, or 0 if 43 + * rsum[i][bbno] == 0 for all i. 44 44 * Reads and writes are serialized by the rsumip inode lock. 45 + * 46 + * For zoned RT devices this points to the open zone structure for 47 + * a group that is open for writers, or is NULL. 45 48 */ 46 - uint8_t *rtg_rsum_cache; 49 + union { 50 + uint8_t *rtg_rsum_cache; 51 + struct xfs_open_zone *rtg_open_zone; 52 + }; 47 53 }; 54 + 55 + /* 56 + * For zoned RT devices this is set on groups that have no written blocks 57 + * and can be picked by the allocator for opening. 58 + */ 59 + #define XFS_RTG_FREE XA_MARK_0 60 + 61 + /* 62 + * For zoned RT devices this is set on groups that are fully written and that 63 + * have unused blocks. Used by the garbage collection to pick targets. 
64 + */ 65 + #define XFS_RTG_RECLAIMABLE XA_MARK_1 48 66 49 67 static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) 50 68 { ··· 82 64 static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) 83 65 { 84 66 return rtg->rtg_group.xg_gno; 67 + } 68 + 69 + static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg) 70 + { 71 + return rtg->rtg_group.xg_block_count; 85 72 } 86 73 87 74 static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) ··· 245 222 xfs_rtblock_t rtbno) 246 223 { 247 224 struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; 248 - xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); 249 - uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks; 250 225 251 - return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask)); 226 + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { 227 + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); 228 + 229 + rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask); 230 + } 231 + 232 + return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno); 252 233 } 253 234 254 235 static inline xfs_rtblock_t ··· 260 233 struct xfs_mount *mp, 261 234 xfs_daddr_t daddr) 262 235 { 263 - xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr); 236 + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; 237 + xfs_rfsblock_t bno; 264 238 265 - if (xfs_has_rtgroups(mp)) { 266 - struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; 239 + bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb; 240 + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { 267 241 xfs_rgnumber_t rgno; 268 242 uint32_t rgbno; 269 243
+19
fs/xfs/libxfs/xfs_rtrmap_btree.c
··· 1033 1033 xfs_btree_del_cursor(cur, error); 1034 1034 return error; 1035 1035 } 1036 + 1037 + /* 1038 + * Return the highest rgbno currently tracked by the rmap for this rtg. 1039 + */ 1040 + xfs_rgblock_t 1041 + xfs_rtrmap_highest_rgbno( 1042 + struct xfs_rtgroup *rtg) 1043 + { 1044 + struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot; 1045 + union xfs_btree_key key = {}; 1046 + struct xfs_btree_cur *cur; 1047 + 1048 + if (block->bb_numrecs == 0) 1049 + return NULLRGBLOCK; 1050 + cur = xfs_rtrmapbt_init_cursor(NULL, rtg); 1051 + xfs_btree_get_keys(cur, block, &key); 1052 + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1053 + return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock); 1054 + }
+2
fs/xfs/libxfs/xfs_rtrmap_btree.h
··· 207 207 int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, 208 208 struct xfs_buftarg *btp, xfs_rgnumber_t rgno); 209 209 210 + xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg); 211 + 210 212 #endif /* __XFS_RTRMAP_BTREE_H__ */
+74 -7
fs/xfs/libxfs/xfs_sb.c
··· 185 185 features |= XFS_FEAT_PARENT; 186 186 if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) 187 187 features |= XFS_FEAT_METADIR; 188 + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) 189 + features |= XFS_FEAT_ZONED; 188 190 189 191 return features; 190 192 } ··· 268 266 xfs_expected_rbmblocks( 269 267 struct xfs_sb *sbp) 270 268 { 269 + if (xfs_sb_is_v5(sbp) && 270 + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) 271 + return 0; 271 272 return howmany_64(xfs_extents_per_rbm(sbp), 272 273 NBBY * xfs_rtbmblock_size(sbp)); 273 274 } ··· 280 275 xfs_validate_rt_geometry( 281 276 struct xfs_sb *sbp) 282 277 { 283 - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || 284 - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) 285 - return false; 278 + if (xfs_sb_is_v5(sbp) && 279 + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { 280 + if (sbp->sb_rextsize != 1) 281 + return false; 282 + } else { 283 + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || 284 + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) 285 + return false; 286 + } 286 287 287 288 if (sbp->sb_rblocks == 0) { 288 289 if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || ··· 446 435 return 0; 447 436 } 448 437 438 + static int 439 + xfs_validate_sb_zoned( 440 + struct xfs_mount *mp, 441 + struct xfs_sb *sbp) 442 + { 443 + if (sbp->sb_frextents != 0) { 444 + xfs_warn(mp, 445 + "sb_frextents must be zero for zoned file systems."); 446 + return -EINVAL; 447 + } 448 + 449 + if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { 450 + xfs_warn(mp, 451 + "sb_rtstart (%lld) overlaps sb_dblocks (%lld).", 452 + sbp->sb_rtstart, sbp->sb_dblocks); 453 + return -EINVAL; 454 + } 455 + 456 + if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { 457 + xfs_warn(mp, 458 + "sb_rtreserved (%lld) larger than sb_rblocks (%lld).", 459 + sbp->sb_rtreserved, sbp->sb_rblocks); 460 + return -EINVAL; 461 + } 462 + 463 + 
return 0; 464 + } 465 + 449 466 /* Check the validity of the SB. */ 450 467 STATIC int 451 468 xfs_validate_sb_common( ··· 559 520 } 560 521 561 522 error = xfs_validate_sb_rtgroups(mp, sbp); 523 + if (error) 524 + return error; 525 + } 526 + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { 527 + error = xfs_validate_sb_zoned(mp, sbp); 562 528 if (error) 563 529 return error; 564 530 } ··· 879 835 to->sb_rgcount = 1; 880 836 to->sb_rgextents = 0; 881 837 } 838 + 839 + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { 840 + to->sb_rtstart = be64_to_cpu(from->sb_rtstart); 841 + to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); 842 + } else { 843 + to->sb_rtstart = 0; 844 + to->sb_rtreserved = 0; 845 + } 882 846 } 883 847 884 848 void ··· 1053 1001 to->sb_rbmino = cpu_to_be64(0); 1054 1002 to->sb_rsumino = cpu_to_be64(0); 1055 1003 } 1004 + 1005 + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { 1006 + to->sb_rtstart = cpu_to_be64(from->sb_rtstart); 1007 + to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); 1008 + } 1056 1009 } 1057 1010 1058 1011 /* ··· 1203 1146 rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; 1204 1147 rgs->blklog = mp->m_sb.sb_rgblklog; 1205 1148 rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); 1149 + rgs->start_fsb = mp->m_sb.sb_rtstart; 1150 + if (xfs_sb_has_incompat_feature(sbp, 1151 + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) 1152 + rgs->has_daddr_gaps = true; 1206 1153 } else { 1207 1154 rgs->blocks = 0; 1208 1155 rgs->blklog = 0; ··· 1326 1265 mp->m_sb.sb_ifree = min_t(uint64_t, 1327 1266 percpu_counter_sum_positive(&mp->m_ifree), 1328 1267 mp->m_sb.sb_icount); 1329 - mp->m_sb.sb_fdblocks = 1330 - percpu_counter_sum_positive(&mp->m_fdblocks); 1268 + mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); 1331 1269 } 1332 1270 1333 1271 /* ··· 1335 1275 * we handle nearly-lockless reservations, so we must use the _positive 1336 1276 * variant here to avoid writing out nonsense frextents. 
1337 1277 */ 1338 - if (xfs_has_rtgroups(mp)) 1278 + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { 1339 1279 mp->m_sb.sb_frextents = 1340 - percpu_counter_sum_positive(&mp->m_frextents); 1280 + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); 1281 + } 1341 1282 1342 1283 xfs_sb_to_disk(bp->b_addr, &mp->m_sb); 1343 1284 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); ··· 1571 1510 geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; 1572 1511 if (xfs_has_metadir(mp)) 1573 1512 geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; 1513 + if (xfs_has_zoned(mp)) 1514 + geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; 1574 1515 geo->rtsectsize = sbp->sb_blocksize; 1575 1516 geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); 1576 1517 ··· 1592 1529 if (xfs_has_rtgroups(mp)) { 1593 1530 geo->rgcount = sbp->sb_rgcount; 1594 1531 geo->rgextents = sbp->sb_rgextents; 1532 + } 1533 + if (xfs_has_zoned(mp)) { 1534 + geo->rtstart = sbp->sb_rtstart; 1535 + geo->rtreserved = sbp->sb_rtreserved; 1595 1536 } 1596 1537 } 1597 1538
+28
fs/xfs/libxfs/xfs_types.h
··· 233 233 { XG_TYPE_AG, "ag" }, \ 234 234 { XG_TYPE_RTG, "rtg" } 235 235 236 + enum xfs_free_counter { 237 + /* 238 + * Number of free blocks on the data device. 239 + */ 240 + XC_FREE_BLOCKS, 241 + 242 + /* 243 + * Number of free RT extents on the RT device. 244 + */ 245 + XC_FREE_RTEXTENTS, 246 + 247 + /* 248 + * Number of available for use RT extents. 249 + * 250 + * This counter only exists for zoned RT device and indicates the number 251 + * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS 252 + * also includes blocks that have been written previously and freed, but 253 + * sit in a rtgroup that still needs a zone reset. 254 + */ 255 + XC_FREE_RTAVAILABLE, 256 + XC_FREE_NR, 257 + }; 258 + 259 + #define XFS_FREECOUNTER_STR \ 260 + { XC_FREE_BLOCKS, "blocks" }, \ 261 + { XC_FREE_RTEXTENTS, "rtextents" }, \ 262 + { XC_FREE_RTAVAILABLE, "rtavailable" } 263 + 236 264 /* 237 265 * Type verifier functions 238 266 */
+186
fs/xfs/libxfs/xfs_zones.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2023-2025 Christoph Hellwig. 4 + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_fs.h" 8 + #include "xfs_shared.h" 9 + #include "xfs_format.h" 10 + #include "xfs_log_format.h" 11 + #include "xfs_trans_resv.h" 12 + #include "xfs_mount.h" 13 + #include "xfs_inode.h" 14 + #include "xfs_rtgroup.h" 15 + #include "xfs_zones.h" 16 + 17 + static bool 18 + xfs_zone_validate_empty( 19 + struct blk_zone *zone, 20 + struct xfs_rtgroup *rtg, 21 + xfs_rgblock_t *write_pointer) 22 + { 23 + struct xfs_mount *mp = rtg_mount(rtg); 24 + 25 + if (rtg_rmap(rtg)->i_used_blocks > 0) { 26 + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).", 27 + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); 28 + return false; 29 + } 30 + 31 + *write_pointer = 0; 32 + return true; 33 + } 34 + 35 + static bool 36 + xfs_zone_validate_wp( 37 + struct blk_zone *zone, 38 + struct xfs_rtgroup *rtg, 39 + xfs_rgblock_t *write_pointer) 40 + { 41 + struct xfs_mount *mp = rtg_mount(rtg); 42 + xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp); 43 + 44 + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { 45 + xfs_warn(mp, "zone %u has too large used counter (0x%x).", 46 + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); 47 + return false; 48 + } 49 + 50 + if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) { 51 + xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.", 52 + rtg_rgno(rtg), wp_fsb); 53 + return false; 54 + } 55 + 56 + *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb); 57 + if (*write_pointer >= rtg->rtg_extents) { 58 + xfs_warn(mp, "zone %u has invalid write pointer (0x%x).", 59 + rtg_rgno(rtg), *write_pointer); 60 + return false; 61 + } 62 + 63 + return true; 64 + } 65 + 66 + static bool 67 + xfs_zone_validate_full( 68 + struct blk_zone *zone, 69 + struct xfs_rtgroup *rtg, 70 + xfs_rgblock_t *write_pointer) 71 + { 72 + struct xfs_mount *mp = 
rtg_mount(rtg); 73 + 74 + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { 75 + xfs_warn(mp, "zone %u has too large used counter (0x%x).", 76 + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); 77 + return false; 78 + } 79 + 80 + *write_pointer = rtg->rtg_extents; 81 + return true; 82 + } 83 + 84 + static bool 85 + xfs_zone_validate_seq( 86 + struct blk_zone *zone, 87 + struct xfs_rtgroup *rtg, 88 + xfs_rgblock_t *write_pointer) 89 + { 90 + struct xfs_mount *mp = rtg_mount(rtg); 91 + 92 + switch (zone->cond) { 93 + case BLK_ZONE_COND_EMPTY: 94 + return xfs_zone_validate_empty(zone, rtg, write_pointer); 95 + case BLK_ZONE_COND_IMP_OPEN: 96 + case BLK_ZONE_COND_EXP_OPEN: 97 + case BLK_ZONE_COND_CLOSED: 98 + return xfs_zone_validate_wp(zone, rtg, write_pointer); 99 + case BLK_ZONE_COND_FULL: 100 + return xfs_zone_validate_full(zone, rtg, write_pointer); 101 + case BLK_ZONE_COND_NOT_WP: 102 + case BLK_ZONE_COND_OFFLINE: 103 + case BLK_ZONE_COND_READONLY: 104 + xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.", 105 + rtg_rgno(rtg), zone->cond); 106 + return false; 107 + default: 108 + xfs_warn(mp, "zone %u has unknown zone condition 0x%x.", 109 + rtg_rgno(rtg), zone->cond); 110 + return false; 111 + } 112 + } 113 + 114 + static bool 115 + xfs_zone_validate_conv( 116 + struct blk_zone *zone, 117 + struct xfs_rtgroup *rtg) 118 + { 119 + struct xfs_mount *mp = rtg_mount(rtg); 120 + 121 + switch (zone->cond) { 122 + case BLK_ZONE_COND_NOT_WP: 123 + return true; 124 + default: 125 + xfs_warn(mp, 126 + "conventional zone %u has unsupported zone condition 0x%x.", 127 + rtg_rgno(rtg), zone->cond); 128 + return false; 129 + } 130 + } 131 + 132 + bool 133 + xfs_zone_validate( 134 + struct blk_zone *zone, 135 + struct xfs_rtgroup *rtg, 136 + xfs_rgblock_t *write_pointer) 137 + { 138 + struct xfs_mount *mp = rtg_mount(rtg); 139 + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; 140 + uint32_t expected_size; 141 + 142 + /* 143 + * Check that the zone capacity matches 
the rtgroup size stored in the 144 + * superblock. Note that all zones including the last one must have a 145 + * uniform capacity. 146 + */ 147 + if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) { 148 + xfs_warn(mp, 149 + "zone %u capacity (0x%llx) does not match RT group size (0x%x).", 150 + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity), 151 + g->blocks); 152 + return false; 153 + } 154 + 155 + if (g->has_daddr_gaps) { 156 + expected_size = 1 << g->blklog; 157 + } else { 158 + if (zone->len != zone->capacity) { 159 + xfs_warn(mp, 160 + "zone %u has capacity != size ((0x%llx vs 0x%llx)", 161 + rtg_rgno(rtg), 162 + XFS_BB_TO_FSB(mp, zone->len), 163 + XFS_BB_TO_FSB(mp, zone->capacity)); 164 + return false; 165 + } 166 + expected_size = g->blocks; 167 + } 168 + 169 + if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) { 170 + xfs_warn(mp, 171 + "zone %u length (0x%llx) does match geometry (0x%x).", 172 + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len), 173 + expected_size); 174 + } 175 + 176 + switch (zone->type) { 177 + case BLK_ZONE_TYPE_CONVENTIONAL: 178 + return xfs_zone_validate_conv(zone, rtg); 179 + case BLK_ZONE_TYPE_SEQWRITE_REQ: 180 + return xfs_zone_validate_seq(zone, rtg, write_pointer); 181 + default: 182 + xfs_warn(mp, "zoned %u has unsupported type 0x%x.", 183 + rtg_rgno(rtg), zone->type); 184 + return false; 185 + } 186 + }
+35
fs/xfs/libxfs/xfs_zones.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LIBXFS_ZONES_H 3 + #define _LIBXFS_ZONES_H 4 + 5 + struct xfs_rtgroup; 6 + 7 + /* 8 + * In order to guarantee forward progress for GC we need to reserve at least 9 + * two zones: one that will be used for moving data into and one spare zone 10 + * making sure that we have enough space to relocate a nearly-full zone. 11 + * To allow for slightly sloppy accounting for when we need to reserve the 12 + * second zone, we actually reserve three as that is easier than doing fully 13 + * accurate bookkeeping. 14 + */ 15 + #define XFS_GC_ZONES 3U 16 + 17 + /* 18 + * In addition we need two zones for user writes, one open zone for writing 19 + * and one to still have available blocks without resetting the open zone 20 + * when data in the open zone has been freed. 21 + */ 22 + #define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1) 23 + #define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1) 24 + 25 + /* 26 + * Always keep one zone out of the general open zone pool to allow for GC to 27 + * happen while other writers are waiting for free space. 28 + */ 29 + #define XFS_OPEN_GC_ZONES 1U 30 + #define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) 31 + 32 + bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, 33 + xfs_rgblock_t *write_pointer); 34 + 35 + #endif /* _LIBXFS_ZONES_H */
+2
fs/xfs/scrub/agheader.c
··· 69 69 xchk_superblock_ondisk_size( 70 70 struct xfs_mount *mp) 71 71 { 72 + if (xfs_has_zoned(mp)) 73 + return offsetofend(struct xfs_dsb, sb_rtreserved); 72 74 if (xfs_has_metadir(mp)) 73 75 return offsetofend(struct xfs_dsb, sb_pad); 74 76 if (xfs_has_metauuid(mp))
+2 -2
fs/xfs/scrub/bmap.c
··· 1038 1038 1039 1039 switch (whichfork) { 1040 1040 case XFS_COW_FORK: 1041 - /* No CoW forks on non-reflink filesystems. */ 1042 - if (!xfs_has_reflink(mp)) { 1041 + /* No CoW forks filesystem doesn't support out of place writes */ 1042 + if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) { 1043 1043 xchk_ino_set_corrupt(sc, sc->ip->i_ino); 1044 1044 return 0; 1045 1045 }
+15 -7
fs/xfs/scrub/fscounters.c
··· 350 350 * The global incore space reservation is taken from the incore 351 351 * counters, so leave that out of the computation. 352 352 */ 353 - fsc->fdblocks -= mp->m_resblks_avail; 353 + fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail; 354 354 355 355 /* 356 356 * Delayed allocation reservations are taken out of the incore counters ··· 413 413 414 414 fsc->frextents = 0; 415 415 fsc->frextents_delayed = 0; 416 - if (!xfs_has_realtime(mp)) 416 + 417 + /* 418 + * Don't bother verifying and repairing the fs counters for zoned file 419 + * systems as they don't track an on-disk frextents count, and the 420 + * in-memory percpu counter also includes reservations. 421 + */ 422 + if (!xfs_has_realtime(mp) || xfs_has_zoned(mp)) 417 423 return 0; 418 424 419 425 while ((rtg = xfs_rtgroup_next(mp, rtg))) { ··· 519 513 /* Snapshot the percpu counters. */ 520 514 icount = percpu_counter_sum(&mp->m_icount); 521 515 ifree = percpu_counter_sum(&mp->m_ifree); 522 - fdblocks = percpu_counter_sum(&mp->m_fdblocks); 523 - frextents = percpu_counter_sum(&mp->m_frextents); 516 + fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); 517 + frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS); 524 518 525 519 /* No negative values, please! */ 526 520 if (icount < 0 || ifree < 0) ··· 595 589 try_again = true; 596 590 } 597 591 598 - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, 599 - fsc->fdblocks)) { 592 + if (!xchk_fscount_within_range(sc, fdblocks, 593 + &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) { 600 594 if (fsc->frozen) 601 595 xchk_set_corrupt(sc); 602 596 else 603 597 try_again = true; 604 598 } 605 599 606 - if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, 600 + if (!xfs_has_zoned(mp) && 601 + !xchk_fscount_within_range(sc, frextents, 602 + &mp->m_free[XC_FREE_RTEXTENTS].count, 607 603 fsc->frextents - fsc->frextents_delayed)) { 608 604 if (fsc->frozen) 609 605 xchk_set_corrupt(sc);
+7 -5
fs/xfs/scrub/fscounters_repair.c
··· 64 64 65 65 percpu_counter_set(&mp->m_icount, fsc->icount); 66 66 percpu_counter_set(&mp->m_ifree, fsc->ifree); 67 - percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); 67 + xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks); 68 68 69 69 /* 70 70 * Online repair is only supported on v5 file systems, which require ··· 74 74 * track of the delalloc reservations separately, as they are are 75 75 * subtracted from m_frextents, but not included in sb_frextents. 76 76 */ 77 - percpu_counter_set(&mp->m_frextents, 78 - fsc->frextents - fsc->frextents_delayed); 79 - if (!xfs_has_rtgroups(mp)) 80 - mp->m_sb.sb_frextents = fsc->frextents; 77 + if (!xfs_has_zoned(mp)) { 78 + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, 79 + fsc->frextents - fsc->frextents_delayed); 80 + if (!xfs_has_rtgroups(mp)) 81 + mp->m_sb.sb_frextents = fsc->frextents; 82 + } 81 83 82 84 return 0; 83 85 }
+7
fs/xfs/scrub/inode.c
··· 273 273 xfs_failaddr_t fa; 274 274 uint32_t value = be32_to_cpu(dip->di_cowextsize); 275 275 276 + /* 277 + * The used block counter for rtrmap is checked and repaired elsewhere. 278 + */ 279 + if (xfs_has_zoned(sc->mp) && 280 + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) 281 + return; 282 + 276 283 fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); 277 284 if (fa) 278 285 xchk_ino_set_corrupt(sc, ino);
+4 -3
fs/xfs/scrub/inode_repair.c
··· 710 710 XFS_DIFLAG_EXTSZINHERIT); 711 711 } 712 712 713 - if (dip->di_version < 3) 713 + if (dip->di_version < 3 || 714 + (xfs_has_zoned(sc->mp) && 715 + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))) 714 716 return; 715 717 716 718 fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), ··· 1560 1558 1561 1559 /* Read the inode cluster buffer. */ 1562 1560 error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, 1563 - ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, 1564 - NULL); 1561 + ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL); 1565 1562 if (error) 1566 1563 return error; 1567 1564
+1 -1
fs/xfs/scrub/newbt.c
··· 62 62 free = sc->sa.pag->pagf_freeblks; 63 63 sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); 64 64 } else { 65 - free = percpu_counter_sum(&sc->mp->m_fdblocks); 65 + free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS); 66 66 sz = sc->mp->m_sb.sb_dblocks; 67 67 } 68 68
+6 -3
fs/xfs/scrub/reap.c
··· 935 935 if (error) 936 936 return error; 937 937 938 - if (xreap_dirty(&rs)) 939 - return xrep_defer_finish(sc); 938 + if (xreap_dirty(&rs)) { 939 + error = xrep_defer_finish(sc); 940 + if (error) 941 + return error; 942 + } 940 943 941 - return 0; 944 + return xrep_reset_metafile_resv(sc); 942 945 } 943 946 944 947 /*
+22 -15
fs/xfs/scrub/repair.c
··· 43 43 #include "xfs_rtalloc.h" 44 44 #include "xfs_metafile.h" 45 45 #include "xfs_rtrefcount_btree.h" 46 + #include "xfs_zone_alloc.h" 46 47 #include "scrub/scrub.h" 47 48 #include "scrub/common.h" 48 49 #include "scrub/trace.h" ··· 1051 1050 xfs_rtxnum_t startrtx; 1052 1051 xfs_rtxnum_t endrtx; 1053 1052 bool is_free = false; 1054 - int error; 1053 + int error = 0; 1054 + 1055 + if (xfs_has_zoned(mp)) { 1056 + if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1)) 1057 + return -EFSCORRUPTED; 1058 + return 0; 1059 + } 1055 1060 1056 1061 startrtx = xfs_rgbno_to_rtx(mp, rgbno); 1057 1062 endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1); ··· 1393 1386 xrep_reset_metafile_resv( 1394 1387 struct xfs_scrub *sc) 1395 1388 { 1396 - struct xfs_inode *ip = sc->ip; 1389 + struct xfs_mount *mp = sc->mp; 1397 1390 int64_t delta; 1398 1391 int error; 1399 1392 1400 - delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked; 1393 + delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail - 1394 + mp->m_metafile_resv_target; 1401 1395 if (delta == 0) 1402 1396 return 0; 1403 1397 ··· 1409 1401 if (delta > 0) { 1410 1402 int64_t give_back; 1411 1403 1412 - give_back = min_t(uint64_t, delta, ip->i_delayed_blks); 1404 + give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail); 1413 1405 if (give_back > 0) { 1414 - xfs_mod_delalloc(ip, 0, -give_back); 1415 - xfs_add_fdblocks(ip->i_mount, give_back); 1416 - ip->i_delayed_blks -= give_back; 1406 + xfs_mod_sb_delalloc(mp, -give_back); 1407 + xfs_add_fdblocks(mp, give_back); 1408 + mp->m_metafile_resv_avail -= give_back; 1417 1409 } 1418 1410 1419 1411 return 0; ··· 1421 1413 1422 1414 /* 1423 1415 * Not enough reservation; try to take some blocks from the filesystem 1424 - * to the metadata inode. @delta is negative here, so invert the sign. 1416 + * to the metabtree reservation. 
1425 1417 */ 1426 - delta = -delta; 1427 - error = xfs_dec_fdblocks(sc->mp, delta, true); 1418 + delta = -delta; /* delta is negative here, so invert the sign. */ 1419 + error = xfs_dec_fdblocks(mp, delta, true); 1428 1420 while (error == -ENOSPC) { 1429 1421 delta--; 1430 1422 if (delta == 0) { 1431 1423 xfs_warn(sc->mp, 1432 - "Insufficient free space to reset space reservation for inode 0x%llx after repair.", 1433 - ip->i_ino); 1424 + "Insufficient free space to reset metabtree reservation after repair."); 1434 1425 return 0; 1435 1426 } 1436 - error = xfs_dec_fdblocks(sc->mp, delta, true); 1427 + error = xfs_dec_fdblocks(mp, delta, true); 1437 1428 } 1438 1429 if (error) 1439 1430 return error; 1440 1431 1441 - xfs_mod_delalloc(ip, 0, delta); 1442 - ip->i_delayed_blks += delta; 1432 + xfs_mod_sb_delalloc(mp, delta); 1433 + mp->m_metafile_resv_avail += delta; 1443 1434 return 0; 1444 1435 }
+9 -2
fs/xfs/scrub/rtbitmap.c
··· 21 21 #include "xfs_rmap.h" 22 22 #include "xfs_rtrmap_btree.h" 23 23 #include "xfs_exchmaps.h" 24 + #include "xfs_zone_alloc.h" 24 25 #include "scrub/scrub.h" 25 26 #include "scrub/common.h" 26 27 #include "scrub/repair.h" ··· 273 272 xfs_extlen_t len) 274 273 { 275 274 struct xfs_rtgroup *rtg = sc->sr.rtg; 276 - struct xfs_inode *rbmip = rtg_bitmap(rtg); 277 275 xfs_rtxnum_t startext; 278 276 xfs_rtxnum_t endext; 279 277 bool is_free; ··· 281 281 if (xchk_skip_xref(sc->sm)) 282 282 return; 283 283 284 + if (xfs_has_zoned(sc->mp)) { 285 + if (!xfs_zone_rgbno_is_valid(rtg, 286 + xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1)) 287 + xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino); 288 + return; 289 + } 290 + 284 291 startext = xfs_rtb_to_rtx(sc->mp, rtbno); 285 292 endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); 286 293 error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext, ··· 295 288 if (!xchk_should_check_xref(sc, &error, NULL)) 296 289 return; 297 290 if (is_free) 298 - xchk_ino_xref_set_corrupt(sc, rbmip->i_ino); 291 + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino); 299 292 }
+6 -28
fs/xfs/scrub/rtrefcount_repair.c
··· 697 697 return error; 698 698 } 699 699 700 - /* 701 - * Now that we've logged the roots of the new btrees, invalidate all of the 702 - * old blocks and free them. 703 - */ 704 - STATIC int 705 - xrep_rtrefc_remove_old_tree( 706 - struct xrep_rtrefc *rr) 707 - { 708 - int error; 709 - 710 - /* 711 - * Free all the extents that were allocated to the former rtrefcountbt 712 - * and aren't cross-linked with something else. 713 - */ 714 - error = xrep_reap_metadir_fsblocks(rr->sc, 715 - &rr->old_rtrefcountbt_blocks); 716 - if (error) 717 - return error; 718 - 719 - /* 720 - * Ensure the proper reservation for the rtrefcount inode so that we 721 - * don't fail to expand the btree. 722 - */ 723 - return xrep_reset_metafile_resv(rr->sc); 724 - } 725 - 726 700 /* Rebuild the rt refcount btree. */ 727 701 int 728 702 xrep_rtrefcountbt( ··· 743 769 if (error) 744 770 goto out_bitmap; 745 771 746 - /* Kill the old tree. */ 747 - error = xrep_rtrefc_remove_old_tree(rr); 772 + /* 773 + * Free all the extents that were allocated to the former rtrefcountbt 774 + * and aren't cross-linked with something else. 775 + */ 776 + error = xrep_reap_metadir_fsblocks(rr->sc, 777 + &rr->old_rtrefcountbt_blocks); 748 778 if (error) 749 779 goto out_bitmap; 750 780
+5 -24
fs/xfs/scrub/rtrmap_repair.c
··· 810 810 811 811 /* Reaping the old btree. */ 812 812 813 - /* Reap the old rtrmapbt blocks. */ 814 - STATIC int 815 - xrep_rtrmap_remove_old_tree( 816 - struct xrep_rtrmap *rr) 817 - { 818 - int error; 819 - 820 - /* 821 - * Free all the extents that were allocated to the former rtrmapbt and 822 - * aren't cross-linked with something else. 823 - */ 824 - error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); 825 - if (error) 826 - return error; 827 - 828 - /* 829 - * Ensure the proper reservation for the rtrmap inode so that we don't 830 - * fail to expand the new btree. 831 - */ 832 - return xrep_reset_metafile_resv(rr->sc); 833 - } 834 - 835 813 static inline bool 836 814 xrep_rtrmapbt_want_live_update( 837 815 struct xchk_iscan *iscan, ··· 973 995 if (error) 974 996 goto out_records; 975 997 976 - /* Kill the old tree. */ 977 - error = xrep_rtrmap_remove_old_tree(rr); 998 + /* 999 + * Free all the extents that were allocated to the former rtrmapbt and 1000 + * aren't cross-linked with something else. 1001 + */ 1002 + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); 978 1003 if (error) 979 1004 goto out_records; 980 1005
+2
fs/xfs/scrub/scrub.c
··· 399 399 }, 400 400 [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ 401 401 .type = ST_RTGROUP, 402 + .has = xfs_has_nonzoned, 402 403 .setup = xchk_setup_rtbitmap, 403 404 .scrub = xchk_rtbitmap, 404 405 .repair = xrep_rtbitmap, 405 406 }, 406 407 [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ 407 408 .type = ST_RTGROUP, 409 + .has = xfs_has_nonzoned, 408 410 .setup = xchk_setup_rtsummary, 409 411 .scrub = xchk_rtsummary, 410 412 .repair = xrep_rtsummary,
+161 -10
fs/xfs/xfs_aops.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* 3 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 - * Copyright (c) 2016-2018 Christoph Hellwig. 4 + * Copyright (c) 2016-2025 Christoph Hellwig. 5 5 * All Rights Reserved. 6 6 */ 7 7 #include "xfs.h" ··· 20 20 #include "xfs_errortag.h" 21 21 #include "xfs_error.h" 22 22 #include "xfs_icache.h" 23 + #include "xfs_zone_alloc.h" 24 + #include "xfs_rtgroup.h" 23 25 24 26 struct xfs_writepage_ctx { 25 27 struct iomap_writepage_ctx ctx; ··· 79 77 return xfs_trans_commit(tp); 80 78 } 81 79 80 + static void 81 + xfs_ioend_put_open_zones( 82 + struct iomap_ioend *ioend) 83 + { 84 + struct iomap_ioend *tmp; 85 + 86 + /* 87 + * Put the open zone for all ioends merged into this one (if any). 88 + */ 89 + list_for_each_entry(tmp, &ioend->io_list, io_list) 90 + xfs_open_zone_put(tmp->io_private); 91 + 92 + /* 93 + * The main ioend might not have an open zone if the submission failed 94 + * before xfs_zone_alloc_and_submit got called. 95 + */ 96 + if (ioend->io_private) 97 + xfs_open_zone_put(ioend->io_private); 98 + } 99 + 82 100 /* 83 101 * IO write completion. 84 102 */ ··· 108 86 { 109 87 struct xfs_inode *ip = XFS_I(ioend->io_inode); 110 88 struct xfs_mount *mp = ip->i_mount; 89 + bool is_zoned = xfs_is_zoned_inode(ip); 111 90 xfs_off_t offset = ioend->io_offset; 112 91 size_t size = ioend->io_size; 113 92 unsigned int nofs_flag; ··· 139 116 error = blk_status_to_errno(ioend->io_bio.bi_status); 140 117 if (unlikely(error)) { 141 118 if (ioend->io_flags & IOMAP_IOEND_SHARED) { 119 + ASSERT(!is_zoned); 142 120 xfs_reflink_cancel_cow_range(ip, offset, size, true); 143 121 xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, 144 - offset + size); 122 + offset + size, NULL); 145 123 } 146 124 goto done; 147 125 } ··· 150 126 /* 151 127 * Success: commit the COW or unwritten blocks if needed. 
152 128 */ 153 - if (ioend->io_flags & IOMAP_IOEND_SHARED) 129 + if (is_zoned) 130 + error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector, 131 + ioend->io_private, NULLFSBLOCK); 132 + else if (ioend->io_flags & IOMAP_IOEND_SHARED) 154 133 error = xfs_reflink_end_cow(ip, offset, size); 155 134 else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) 156 135 error = xfs_iomap_write_unwritten(ip, offset, size, false); 157 136 158 - if (!error && xfs_ioend_is_append(ioend)) 137 + if (!error && 138 + !(ioend->io_flags & IOMAP_IOEND_DIRECT) && 139 + xfs_ioend_is_append(ioend)) 159 140 error = xfs_setfilesize(ip, offset, size); 160 141 done: 142 + if (is_zoned) 143 + xfs_ioend_put_open_zones(ioend); 161 144 iomap_finish_ioends(ioend, error); 162 145 memalloc_nofs_restore(nofs_flag); 163 146 } ··· 207 176 } 208 177 } 209 178 210 - STATIC void 179 + void 211 180 xfs_end_bio( 212 181 struct bio *bio) 213 182 { 214 183 struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); 215 184 struct xfs_inode *ip = XFS_I(ioend->io_inode); 185 + struct xfs_mount *mp = ip->i_mount; 216 186 unsigned long flags; 187 + 188 + /* 189 + * For Appends record the actually written block number and set the 190 + * boundary flag if needed. 191 + */ 192 + if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) { 193 + ioend->io_sector = bio->bi_iter.bi_sector; 194 + xfs_mark_rtg_boundary(ioend); 195 + } 217 196 218 197 spin_lock_irqsave(&ip->i_ioend_lock, flags); 219 198 if (list_empty(&ip->i_ioend_list)) 220 - WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, 199 + WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, 221 200 &ip->i_ioend_work)); 222 201 list_add_tail(&ioend->io_list, &ip->i_ioend_list); 223 202 spin_unlock_irqrestore(&ip->i_ioend_lock, flags); ··· 504 463 * folio itself and not the start offset that is passed in. 
505 464 */ 506 465 xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, 507 - folio_pos(folio) + folio_size(folio)); 466 + folio_pos(folio) + folio_size(folio), NULL); 508 467 } 509 468 510 469 static const struct iomap_writeback_ops xfs_writeback_ops = { ··· 513 472 .discard_folio = xfs_discard_folio, 514 473 }; 515 474 475 + struct xfs_zoned_writepage_ctx { 476 + struct iomap_writepage_ctx ctx; 477 + struct xfs_open_zone *open_zone; 478 + }; 479 + 480 + static inline struct xfs_zoned_writepage_ctx * 481 + XFS_ZWPC(struct iomap_writepage_ctx *ctx) 482 + { 483 + return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx); 484 + } 485 + 486 + static int 487 + xfs_zoned_map_blocks( 488 + struct iomap_writepage_ctx *wpc, 489 + struct inode *inode, 490 + loff_t offset, 491 + unsigned int len) 492 + { 493 + struct xfs_inode *ip = XFS_I(inode); 494 + struct xfs_mount *mp = ip->i_mount; 495 + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 496 + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len); 497 + xfs_filblks_t count_fsb; 498 + struct xfs_bmbt_irec imap, del; 499 + struct xfs_iext_cursor icur; 500 + 501 + if (xfs_is_shutdown(mp)) 502 + return -EIO; 503 + 504 + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); 505 + 506 + /* 507 + * All dirty data must be covered by delalloc extents. But truncate can 508 + * remove delalloc extents underneath us or reduce their size. 509 + * Returning a hole tells iomap to not write back any data from this 510 + * range, which is the right thing to do in that case. 511 + * 512 + * Otherwise just tell iomap to treat ranges previously covered by a 513 + * delalloc extent as mapped. The actual block allocation will be done 514 + * just before submitting the bio. 515 + * 516 + * This implies we never map outside folios that are locked or marked 517 + * as under writeback, and thus there is no need check the fork sequence 518 + * count here. 
519 + */ 520 + xfs_ilock(ip, XFS_ILOCK_EXCL); 521 + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) 522 + imap.br_startoff = end_fsb; /* fake a hole past EOF */ 523 + if (imap.br_startoff > offset_fsb) { 524 + imap.br_blockcount = imap.br_startoff - offset_fsb; 525 + imap.br_startoff = offset_fsb; 526 + imap.br_startblock = HOLESTARTBLOCK; 527 + imap.br_state = XFS_EXT_NORM; 528 + xfs_iunlock(ip, XFS_ILOCK_EXCL); 529 + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0); 530 + return 0; 531 + } 532 + end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount); 533 + count_fsb = end_fsb - offset_fsb; 534 + 535 + del = imap; 536 + xfs_trim_extent(&del, offset_fsb, count_fsb); 537 + xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del, 538 + XFS_BMAPI_REMAP); 539 + xfs_iunlock(ip, XFS_ILOCK_EXCL); 540 + 541 + wpc->iomap.type = IOMAP_MAPPED; 542 + wpc->iomap.flags = IOMAP_F_DIRTY; 543 + wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev; 544 + wpc->iomap.offset = offset; 545 + wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb); 546 + wpc->iomap.flags = IOMAP_F_ANON_WRITE; 547 + 548 + trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length); 549 + return 0; 550 + } 551 + 552 + static int 553 + xfs_zoned_submit_ioend( 554 + struct iomap_writepage_ctx *wpc, 555 + int status) 556 + { 557 + wpc->ioend->io_bio.bi_end_io = xfs_end_bio; 558 + if (status) 559 + return status; 560 + xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone); 561 + return 0; 562 + } 563 + 564 + static const struct iomap_writeback_ops xfs_zoned_writeback_ops = { 565 + .map_blocks = xfs_zoned_map_blocks, 566 + .submit_ioend = xfs_zoned_submit_ioend, 567 + .discard_folio = xfs_discard_folio, 568 + }; 569 + 516 570 STATIC int 517 571 xfs_vm_writepages( 518 572 struct address_space *mapping, 519 573 struct writeback_control *wbc) 520 574 { 521 - struct xfs_writepage_ctx wpc = { }; 575 + struct xfs_inode *ip = XFS_I(mapping->host); 522 576 523 - 
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); 524 - return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); 577 + xfs_iflags_clear(ip, XFS_ITRUNCATED); 578 + 579 + if (xfs_is_zoned_inode(ip)) { 580 + struct xfs_zoned_writepage_ctx xc = { }; 581 + int error; 582 + 583 + error = iomap_writepages(mapping, wbc, &xc.ctx, 584 + &xfs_zoned_writeback_ops); 585 + if (xc.open_zone) 586 + xfs_open_zone_put(xc.open_zone); 587 + return error; 588 + } else { 589 + struct xfs_writepage_ctx wpc = { }; 590 + 591 + return iomap_writepages(mapping, wbc, &wpc.ctx, 592 + &xfs_writeback_ops); 593 + } 525 594 } 526 595 527 596 STATIC int
+2 -1
fs/xfs/xfs_aops.h
··· 9 9 extern const struct address_space_operations xfs_address_space_operations; 10 10 extern const struct address_space_operations xfs_dax_aops; 11 11 12 - int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); 12 + int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); 13 + void xfs_end_bio(struct bio *bio); 13 14 14 15 #endif /* __XFS_AOPS_H__ */
+25 -7
fs/xfs/xfs_bmap_util.c
··· 30 30 #include "xfs_reflink.h" 31 31 #include "xfs_rtbitmap.h" 32 32 #include "xfs_rtgroup.h" 33 + #include "xfs_zone_alloc.h" 33 34 34 35 /* Kernel only BMAP related definitions and functions */ 35 36 ··· 437 436 struct xfs_inode *ip, 438 437 int whichfork, 439 438 xfs_off_t start_byte, 440 - xfs_off_t end_byte) 439 + xfs_off_t end_byte, 440 + struct xfs_zone_alloc_ctx *ac) 441 441 { 442 442 struct xfs_mount *mp = ip->i_mount; 443 443 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); ··· 469 467 continue; 470 468 } 471 469 472 - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); 470 + if (xfs_is_zoned_inode(ip) && ac) { 471 + /* 472 + * In a zoned buffered write context we need to return 473 + * the punched delalloc allocations to the allocation 474 + * context. This allows reusing them in the following 475 + * iomap iterations. 476 + */ 477 + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, 478 + &del, XFS_BMAPI_REMAP); 479 + ac->reserved_blocks += del.br_blockcount; 480 + } else { 481 + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, 482 + &del, 0); 483 + } 484 + 473 485 if (!xfs_iext_get_extent(ifp, &icur, &got)) 474 486 break; 475 487 } ··· 598 582 if (ip->i_delayed_blks) { 599 583 xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, 600 584 round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), 601 - LLONG_MAX); 585 + LLONG_MAX, NULL); 602 586 } 603 587 xfs_inode_clear_eofblocks_tag(ip); 604 588 return 0; ··· 841 825 xfs_free_file_space( 842 826 struct xfs_inode *ip, 843 827 xfs_off_t offset, 844 - xfs_off_t len) 828 + xfs_off_t len, 829 + struct xfs_zone_alloc_ctx *ac) 845 830 { 846 831 struct xfs_mount *mp = ip->i_mount; 847 832 xfs_fileoff_t startoffset_fsb; ··· 897 880 return 0; 898 881 if (offset + len > XFS_ISIZE(ip)) 899 882 len = XFS_ISIZE(ip) - offset; 900 - error = xfs_zero_range(ip, offset, len, NULL); 883 + error = xfs_zero_range(ip, offset, len, ac, NULL); 901 884 if (error) 902 885 return error; 903 886 ··· 985 968 
xfs_collapse_file_space( 986 969 struct xfs_inode *ip, 987 970 xfs_off_t offset, 988 - xfs_off_t len) 971 + xfs_off_t len, 972 + struct xfs_zone_alloc_ctx *ac) 989 973 { 990 974 struct xfs_mount *mp = ip->i_mount; 991 975 struct xfs_trans *tp; ··· 999 981 1000 982 trace_xfs_collapse_file_space(ip); 1001 983 1002 - error = xfs_free_file_space(ip, offset, len); 984 + error = xfs_free_file_space(ip, offset, len, ac); 1003 985 if (error) 1004 986 return error; 1005 987
+7 -5
fs/xfs/xfs_bmap_util.h
··· 15 15 struct xfs_mount; 16 16 struct xfs_trans; 17 17 struct xfs_bmalloca; 18 + struct xfs_zone_alloc_ctx; 18 19 19 20 #ifdef CONFIG_XFS_RT 20 21 int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); ··· 32 31 #endif /* CONFIG_XFS_RT */ 33 32 34 33 void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, 35 - xfs_off_t start_byte, xfs_off_t end_byte); 34 + xfs_off_t start_byte, xfs_off_t end_byte, 35 + struct xfs_zone_alloc_ctx *ac); 36 36 37 37 struct kgetbmap { 38 38 __s64 bmv_offset; /* file offset of segment in blocks */ ··· 56 54 57 55 /* preallocation and hole punch interface */ 58 56 int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, 59 - xfs_off_t len); 57 + xfs_off_t len); 60 58 int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, 61 - xfs_off_t len); 59 + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); 62 60 int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, 63 - xfs_off_t len); 61 + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); 64 62 int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, 65 - xfs_off_t len); 63 + xfs_off_t len); 66 64 67 65 /* EOF block manipulation functions */ 68 66 bool xfs_can_free_eofblocks(struct xfs_inode *ip);
+210 -362
fs/xfs/xfs_buf.c
··· 55 55 return bp->b_rhash_key == XFS_BUF_DADDR_NULL; 56 56 } 57 57 58 - static inline int 59 - xfs_buf_is_vmapped( 60 - struct xfs_buf *bp) 61 - { 62 - /* 63 - * Return true if the buffer is vmapped. 64 - * 65 - * b_addr is null if the buffer is not mapped, but the code is clever 66 - * enough to know it doesn't have to map a single page, so the check has 67 - * to be both for b_addr and bp->b_page_count > 1. 68 - */ 69 - return bp->b_addr && bp->b_page_count > 1; 70 - } 71 - 72 - static inline int 73 - xfs_buf_vmap_len( 74 - struct xfs_buf *bp) 75 - { 76 - return (bp->b_page_count * PAGE_SIZE); 77 - } 78 - 79 58 /* 80 59 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 81 60 * b_lru_ref count so that the buffer is freed immediately when the buffer ··· 88 109 spin_unlock(&bp->b_lock); 89 110 } 90 111 91 - static int 92 - xfs_buf_get_maps( 93 - struct xfs_buf *bp, 94 - int map_count) 112 + static void 113 + xfs_buf_free_callback( 114 + struct callback_head *cb) 95 115 { 96 - ASSERT(bp->b_maps == NULL); 97 - bp->b_map_count = map_count; 116 + struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); 98 117 99 - if (map_count == 1) { 100 - bp->b_maps = &bp->__b_map; 101 - return 0; 102 - } 103 - 104 - bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map), 105 - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 106 - if (!bp->b_maps) 107 - return -ENOMEM; 108 - return 0; 118 + if (bp->b_maps != &bp->__b_map) 119 + kfree(bp->b_maps); 120 + kmem_cache_free(xfs_buf_cache, bp); 109 121 } 110 122 111 123 static void 112 - xfs_buf_free_maps( 113 - struct xfs_buf *bp) 124 + xfs_buf_free( 125 + struct xfs_buf *bp) 114 126 { 115 - if (bp->b_maps != &bp->__b_map) { 116 - kfree(bp->b_maps); 117 - bp->b_maps = NULL; 118 - } 127 + unsigned int size = BBTOB(bp->b_length); 128 + 129 + trace_xfs_buf_free(bp, _RET_IP_); 130 + 131 + ASSERT(list_empty(&bp->b_lru)); 132 + 133 + if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) 134 + 
mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); 135 + 136 + if (is_vmalloc_addr(bp->b_addr)) 137 + vfree(bp->b_addr); 138 + else if (bp->b_flags & _XBF_KMEM) 139 + kfree(bp->b_addr); 140 + else 141 + folio_put(virt_to_folio(bp->b_addr)); 142 + 143 + call_rcu(&bp->b_rcu, xfs_buf_free_callback); 119 144 } 120 145 121 146 static int 122 - _xfs_buf_alloc( 147 + xfs_buf_alloc_kmem( 148 + struct xfs_buf *bp, 149 + size_t size, 150 + gfp_t gfp_mask) 151 + { 152 + ASSERT(is_power_of_2(size)); 153 + ASSERT(size < PAGE_SIZE); 154 + 155 + bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); 156 + if (!bp->b_addr) 157 + return -ENOMEM; 158 + 159 + /* 160 + * Slab guarantees that we get back naturally aligned allocations for 161 + * power of two sizes. Keep this check as the canary in the coal mine 162 + * if anything changes in slab. 163 + */ 164 + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { 165 + kfree(bp->b_addr); 166 + bp->b_addr = NULL; 167 + return -ENOMEM; 168 + } 169 + bp->b_flags |= _XBF_KMEM; 170 + trace_xfs_buf_backing_kmem(bp, _RET_IP_); 171 + return 0; 172 + } 173 + 174 + /* 175 + * Allocate backing memory for a buffer. 176 + * 177 + * For tmpfs-backed buffers used by in-memory btrees this directly maps the 178 + * tmpfs page cache folios. 179 + * 180 + * For real file system buffers there are three different kinds backing memory: 181 + * 182 + * The first type backs the buffer by a kmalloc allocation. This is done for 183 + * less than PAGE_SIZE allocations to avoid wasting memory. 184 + * 185 + * The second type is a single folio buffer - this may be a high order folio or 186 + * just a single page sized folio, but either way they get treated the same way 187 + * by the rest of the code - the buffer memory spans a single contiguous memory 188 + * region that we don't have to map and unmap to access the data directly. 189 + * 190 + * The third type of buffer is the vmalloc()d buffer. 
This provides the buffer 191 + * with the required contiguous memory region but backed by discontiguous 192 + * physical pages. 193 + */ 194 + static int 195 + xfs_buf_alloc_backing_mem( 196 + struct xfs_buf *bp, 197 + xfs_buf_flags_t flags) 198 + { 199 + size_t size = BBTOB(bp->b_length); 200 + gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; 201 + struct folio *folio; 202 + 203 + if (xfs_buftarg_is_mem(bp->b_target)) 204 + return xmbuf_map_backing_mem(bp); 205 + 206 + /* Assure zeroed buffer for non-read cases. */ 207 + if (!(flags & XBF_READ)) 208 + gfp_mask |= __GFP_ZERO; 209 + 210 + if (flags & XBF_READ_AHEAD) 211 + gfp_mask |= __GFP_NORETRY; 212 + 213 + /* 214 + * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that 215 + * is properly aligned. The slab allocator now guarantees an aligned 216 + * allocation for all power of two sizes, which matches most of the 217 + * smaller than PAGE_SIZE buffers used by XFS. 218 + */ 219 + if (size < PAGE_SIZE && is_power_of_2(size)) 220 + return xfs_buf_alloc_kmem(bp, size, gfp_mask); 221 + 222 + /* 223 + * Don't bother with the retry loop for single PAGE allocations: vmalloc 224 + * won't do any better. 225 + */ 226 + if (size <= PAGE_SIZE) 227 + gfp_mask |= __GFP_NOFAIL; 228 + 229 + /* 230 + * Optimistically attempt a single high order folio allocation for 231 + * larger than PAGE_SIZE buffers. 232 + * 233 + * Allocating a high order folio makes the assumption that buffers are a 234 + * power-of-2 size, matching the power-of-2 folios sizes available. 235 + * 236 + * The exception here are user xattr data buffers, which can be arbitrarily 237 + * sized up to 64kB plus structure metadata, skip straight to the vmalloc 238 + * path for them instead of wasting memory here. 
239 + */ 240 + if (size > PAGE_SIZE) { 241 + if (!is_power_of_2(size)) 242 + goto fallback; 243 + gfp_mask &= ~__GFP_DIRECT_RECLAIM; 244 + gfp_mask |= __GFP_NORETRY; 245 + } 246 + folio = folio_alloc(gfp_mask, get_order(size)); 247 + if (!folio) { 248 + if (size <= PAGE_SIZE) 249 + return -ENOMEM; 250 + trace_xfs_buf_backing_fallback(bp, _RET_IP_); 251 + goto fallback; 252 + } 253 + bp->b_addr = folio_address(folio); 254 + trace_xfs_buf_backing_folio(bp, _RET_IP_); 255 + return 0; 256 + 257 + fallback: 258 + for (;;) { 259 + bp->b_addr = __vmalloc(size, gfp_mask); 260 + if (bp->b_addr) 261 + break; 262 + if (flags & XBF_READ_AHEAD) 263 + return -ENOMEM; 264 + XFS_STATS_INC(bp->b_mount, xb_page_retries); 265 + memalloc_retry_wait(gfp_mask); 266 + } 267 + 268 + trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); 269 + return 0; 270 + } 271 + 272 + static int 273 + xfs_buf_alloc( 123 274 struct xfs_buftarg *target, 124 275 struct xfs_buf_map *map, 125 276 int nmaps, ··· 268 159 * We don't want certain flags to appear in b_flags unless they are 269 160 * specifically set by later operations on the buffer. 270 161 */ 271 - flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 162 + flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 272 163 273 164 /* 274 165 * A new buffer is held and locked by the owner. 
This ensures that the ··· 288 179 bp->b_target = target; 289 180 bp->b_mount = target->bt_mount; 290 181 bp->b_flags = flags; 291 - 292 - error = xfs_buf_get_maps(bp, nmaps); 293 - if (error) { 294 - kmem_cache_free(xfs_buf_cache, bp); 295 - return error; 296 - } 297 - 298 182 bp->b_rhash_key = map[0].bm_bn; 299 183 bp->b_length = 0; 184 + bp->b_map_count = nmaps; 185 + if (nmaps == 1) 186 + bp->b_maps = &bp->__b_map; 187 + else 188 + bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map), 189 + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 300 190 for (i = 0; i < nmaps; i++) { 301 191 bp->b_maps[i].bm_bn = map[i].bm_bn; 302 192 bp->b_maps[i].bm_len = map[i].bm_len; ··· 308 200 XFS_STATS_INC(bp->b_mount, xb_create); 309 201 trace_xfs_buf_init(bp, _RET_IP_); 310 202 203 + error = xfs_buf_alloc_backing_mem(bp, flags); 204 + if (error) { 205 + xfs_buf_free(bp); 206 + return error; 207 + } 208 + 311 209 *bpp = bp; 312 - return 0; 313 - } 314 - 315 - static void 316 - xfs_buf_free_pages( 317 - struct xfs_buf *bp) 318 - { 319 - uint i; 320 - 321 - ASSERT(bp->b_flags & _XBF_PAGES); 322 - 323 - if (xfs_buf_is_vmapped(bp)) 324 - vm_unmap_ram(bp->b_addr, bp->b_page_count); 325 - 326 - for (i = 0; i < bp->b_page_count; i++) { 327 - if (bp->b_pages[i]) 328 - __free_page(bp->b_pages[i]); 329 - } 330 - mm_account_reclaimed_pages(bp->b_page_count); 331 - 332 - if (bp->b_pages != bp->b_page_array) 333 - kfree(bp->b_pages); 334 - bp->b_pages = NULL; 335 - bp->b_flags &= ~_XBF_PAGES; 336 - } 337 - 338 - static void 339 - xfs_buf_free_callback( 340 - struct callback_head *cb) 341 - { 342 - struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); 343 - 344 - xfs_buf_free_maps(bp); 345 - kmem_cache_free(xfs_buf_cache, bp); 346 - } 347 - 348 - static void 349 - xfs_buf_free( 350 - struct xfs_buf *bp) 351 - { 352 - trace_xfs_buf_free(bp, _RET_IP_); 353 - 354 - ASSERT(list_empty(&bp->b_lru)); 355 - 356 - if (xfs_buftarg_is_mem(bp->b_target)) 357 - xmbuf_unmap_page(bp); 358 - else if 
(bp->b_flags & _XBF_PAGES) 359 - xfs_buf_free_pages(bp); 360 - else if (bp->b_flags & _XBF_KMEM) 361 - kfree(bp->b_addr); 362 - 363 - call_rcu(&bp->b_rcu, xfs_buf_free_callback); 364 - } 365 - 366 - static int 367 - xfs_buf_alloc_kmem( 368 - struct xfs_buf *bp, 369 - xfs_buf_flags_t flags) 370 - { 371 - gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL; 372 - size_t size = BBTOB(bp->b_length); 373 - 374 - /* Assure zeroed buffer for non-read cases. */ 375 - if (!(flags & XBF_READ)) 376 - gfp_mask |= __GFP_ZERO; 377 - 378 - bp->b_addr = kmalloc(size, gfp_mask); 379 - if (!bp->b_addr) 380 - return -ENOMEM; 381 - 382 - if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != 383 - ((unsigned long)bp->b_addr & PAGE_MASK)) { 384 - /* b_addr spans two pages - use alloc_page instead */ 385 - kfree(bp->b_addr); 386 - bp->b_addr = NULL; 387 - return -ENOMEM; 388 - } 389 - bp->b_offset = offset_in_page(bp->b_addr); 390 - bp->b_pages = bp->b_page_array; 391 - bp->b_pages[0] = kmem_to_page(bp->b_addr); 392 - bp->b_page_count = 1; 393 - bp->b_flags |= _XBF_KMEM; 394 - return 0; 395 - } 396 - 397 - static int 398 - xfs_buf_alloc_pages( 399 - struct xfs_buf *bp, 400 - xfs_buf_flags_t flags) 401 - { 402 - gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; 403 - long filled = 0; 404 - 405 - if (flags & XBF_READ_AHEAD) 406 - gfp_mask |= __GFP_NORETRY; 407 - 408 - /* Make sure that we have a page list */ 409 - bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); 410 - if (bp->b_page_count <= XB_PAGES) { 411 - bp->b_pages = bp->b_page_array; 412 - } else { 413 - bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, 414 - gfp_mask); 415 - if (!bp->b_pages) 416 - return -ENOMEM; 417 - } 418 - bp->b_flags |= _XBF_PAGES; 419 - 420 - /* Assure zeroed buffer for non-read cases. */ 421 - if (!(flags & XBF_READ)) 422 - gfp_mask |= __GFP_ZERO; 423 - 424 - /* 425 - * Bulk filling of pages can take multiple calls. 
Not filling the entire 426 - * array is not an allocation failure, so don't back off if we get at 427 - * least one extra page. 428 - */ 429 - for (;;) { 430 - long last = filled; 431 - 432 - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, 433 - bp->b_pages); 434 - if (filled == bp->b_page_count) { 435 - XFS_STATS_INC(bp->b_mount, xb_page_found); 436 - break; 437 - } 438 - 439 - if (filled != last) 440 - continue; 441 - 442 - if (flags & XBF_READ_AHEAD) { 443 - xfs_buf_free_pages(bp); 444 - return -ENOMEM; 445 - } 446 - 447 - XFS_STATS_INC(bp->b_mount, xb_page_retries); 448 - memalloc_retry_wait(gfp_mask); 449 - } 450 - return 0; 451 - } 452 - 453 - /* 454 - * Map buffer into kernel address-space if necessary. 455 - */ 456 - STATIC int 457 - _xfs_buf_map_pages( 458 - struct xfs_buf *bp, 459 - xfs_buf_flags_t flags) 460 - { 461 - ASSERT(bp->b_flags & _XBF_PAGES); 462 - if (bp->b_page_count == 1) { 463 - /* A single page buffer is always mappable */ 464 - bp->b_addr = page_address(bp->b_pages[0]); 465 - } else if (flags & XBF_UNMAPPED) { 466 - bp->b_addr = NULL; 467 - } else { 468 - int retried = 0; 469 - unsigned nofs_flag; 470 - 471 - /* 472 - * vm_map_ram() will allocate auxiliary structures (e.g. 473 - * pagetables) with GFP_KERNEL, yet we often under a scoped nofs 474 - * context here. Mixing GFP_KERNEL with GFP_NOFS allocations 475 - * from the same call site that can be run from both above and 476 - * below memory reclaim causes lockdep false positives. Hence we 477 - * always need to force this allocation to nofs context because 478 - * we can't pass __GFP_NOLOCKDEP down to auxillary structures to 479 - * prevent false positive lockdep reports. 480 - * 481 - * XXX(dgc): I think dquot reclaim is the only place we can get 482 - * to this function from memory reclaim context now. If we fix 483 - * that like we've fixed inode reclaim to avoid writeback from 484 - * reclaim, this nofs wrapping can go away. 
485 - */ 486 - nofs_flag = memalloc_nofs_save(); 487 - do { 488 - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 489 - -1); 490 - if (bp->b_addr) 491 - break; 492 - vm_unmap_aliases(); 493 - } while (retried++ <= 1); 494 - memalloc_nofs_restore(nofs_flag); 495 - 496 - if (!bp->b_addr) 497 - return -ENOMEM; 498 - } 499 - 500 210 return 0; 501 211 } 502 212 ··· 433 507 return -ENOENT; 434 508 } 435 509 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 436 - bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 510 + bp->b_flags &= _XBF_KMEM; 437 511 bp->b_ops = NULL; 438 512 } 439 513 return 0; ··· 501 575 struct xfs_buf *bp; 502 576 int error; 503 577 504 - error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); 578 + error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); 505 579 if (error) 506 580 goto out_drop_pag; 507 - 508 - if (xfs_buftarg_is_mem(new_bp->b_target)) { 509 - error = xmbuf_map_page(new_bp); 510 - } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE || 511 - xfs_buf_alloc_kmem(new_bp, flags) < 0) { 512 - /* 513 - * For buffers that fit entirely within a single page, first 514 - * attempt to allocate the memory from the heap to minimise 515 - * memory usage. If we can't get heap memory for these small 516 - * buffers, we fall back to using the page allocator. 517 - */ 518 - error = xfs_buf_alloc_pages(new_bp, flags); 519 - } 520 - if (error) 521 - goto out_free_buf; 522 581 523 582 /* The new buffer keeps the perag reference until it is freed. */ 524 583 new_bp->b_pag = pag; ··· 613 702 XFS_STATS_INC(btp->bt_mount, xb_get_locked); 614 703 if (pag) 615 704 xfs_perag_put(pag); 616 - } 617 - 618 - /* We do not hold a perag reference anymore. 
*/ 619 - if (!bp->b_addr) { 620 - error = _xfs_buf_map_pages(bp, flags); 621 - if (unlikely(error)) { 622 - xfs_warn_ratelimited(btp->bt_mount, 623 - "%s: failed to map %u pages", __func__, 624 - bp->b_page_count); 625 - xfs_buf_relse(bp); 626 - return error; 627 - } 628 705 } 629 706 630 707 /* ··· 802 903 struct xfs_buftarg *target, 803 904 xfs_daddr_t daddr, 804 905 size_t numblks, 805 - xfs_buf_flags_t flags, 806 906 struct xfs_buf **bpp, 807 907 const struct xfs_buf_ops *ops) 808 908 { ··· 810 912 811 913 *bpp = NULL; 812 914 813 - error = xfs_buf_get_uncached(target, numblks, flags, &bp); 915 + error = xfs_buf_get_uncached(target, numblks, &bp); 814 916 if (error) 815 917 return error; 816 918 ··· 836 938 xfs_buf_get_uncached( 837 939 struct xfs_buftarg *target, 838 940 size_t numblks, 839 - xfs_buf_flags_t flags, 840 941 struct xfs_buf **bpp) 841 942 { 842 943 int error; 843 - struct xfs_buf *bp; 844 944 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 845 945 846 - /* there are currently no valid flags for xfs_buf_get_uncached */ 847 - ASSERT(flags == 0); 848 - 849 - *bpp = NULL; 850 - 851 - error = _xfs_buf_alloc(target, &map, 1, flags, &bp); 852 - if (error) 853 - return error; 854 - 855 - if (xfs_buftarg_is_mem(bp->b_target)) 856 - error = xmbuf_map_page(bp); 857 - else 858 - error = xfs_buf_alloc_pages(bp, flags); 859 - if (error) 860 - goto fail_free_buf; 861 - 862 - error = _xfs_buf_map_pages(bp, 0); 863 - if (unlikely(error)) { 864 - xfs_warn(target->bt_mount, 865 - "%s: failed to map pages", __func__); 866 - goto fail_free_buf; 867 - } 868 - 869 - trace_xfs_buf_get_uncached(bp, _RET_IP_); 870 - *bpp = bp; 871 - return 0; 872 - 873 - fail_free_buf: 874 - xfs_buf_free(bp); 946 + error = xfs_buf_alloc(target, &map, 1, 0, bpp); 947 + if (!error) 948 + trace_xfs_buf_get_uncached(*bpp, _RET_IP_); 875 949 return error; 876 950 } 877 951 ··· 1169 1299 trace_xfs_buf_iodone(bp, _RET_IP_); 1170 1300 1171 1301 if (bp->b_flags & XBF_READ) { 1172 - if 
(!bp->b_error && xfs_buf_is_vmapped(bp)) 1302 + if (!bp->b_error && is_vmalloc_addr(bp->b_addr)) 1173 1303 invalidate_kernel_vmap_range(bp->b_addr, 1174 - xfs_buf_vmap_len(bp)); 1304 + roundup(BBTOB(bp->b_length), PAGE_SIZE)); 1175 1305 if (!bp->b_error && bp->b_ops) 1176 1306 bp->b_ops->verify_read(bp); 1177 1307 if (!bp->b_error) ··· 1332 1462 xfs_buf_submit_bio( 1333 1463 struct xfs_buf *bp) 1334 1464 { 1335 - unsigned int size = BBTOB(bp->b_length); 1336 - unsigned int map = 0, p; 1465 + unsigned int map = 0; 1337 1466 struct blk_plug plug; 1338 1467 struct bio *bio; 1339 1468 1340 - bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count, 1341 - xfs_buf_bio_op(bp), GFP_NOIO); 1469 + if (is_vmalloc_addr(bp->b_addr)) { 1470 + unsigned int size = BBTOB(bp->b_length); 1471 + unsigned int alloc_size = roundup(size, PAGE_SIZE); 1472 + void *data = bp->b_addr; 1473 + 1474 + bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT, 1475 + xfs_buf_bio_op(bp), GFP_NOIO); 1476 + 1477 + do { 1478 + unsigned int len = min(size, PAGE_SIZE); 1479 + 1480 + ASSERT(offset_in_page(data) == 0); 1481 + __bio_add_page(bio, vmalloc_to_page(data), len, 0); 1482 + data += len; 1483 + size -= len; 1484 + } while (size); 1485 + 1486 + flush_kernel_vmap_range(bp->b_addr, alloc_size); 1487 + } else { 1488 + /* 1489 + * Single folio or slab allocation. Must be contiguous and thus 1490 + * only a single bvec is needed. 1491 + * 1492 + * This uses the page based bio add helper for now as that is 1493 + * the lowest common denominator between folios and slab 1494 + * allocations. To be replaced with a better block layer 1495 + * helper soon (hopefully). 
1496 + */ 1497 + bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp), 1498 + GFP_NOIO); 1499 + __bio_add_page(bio, virt_to_page(bp->b_addr), 1500 + BBTOB(bp->b_length), 1501 + offset_in_page(bp->b_addr)); 1502 + } 1503 + 1342 1504 bio->bi_private = bp; 1343 1505 bio->bi_end_io = xfs_buf_bio_end_io; 1344 - 1345 - if (bp->b_flags & _XBF_KMEM) { 1346 - __bio_add_page(bio, virt_to_page(bp->b_addr), size, 1347 - bp->b_offset); 1348 - } else { 1349 - for (p = 0; p < bp->b_page_count; p++) 1350 - __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0); 1351 - bio->bi_iter.bi_size = size; /* limit to the actual size used */ 1352 - 1353 - if (xfs_buf_is_vmapped(bp)) 1354 - flush_kernel_vmap_range(bp->b_addr, 1355 - xfs_buf_vmap_len(bp)); 1356 - } 1357 1506 1358 1507 /* 1359 1508 * If there is more than one map segment, split out a new bio for each ··· 1498 1609 } 1499 1610 1500 1611 xfs_buf_submit_bio(bp); 1501 - } 1502 - 1503 - void * 1504 - xfs_buf_offset( 1505 - struct xfs_buf *bp, 1506 - size_t offset) 1507 - { 1508 - struct page *page; 1509 - 1510 - if (bp->b_addr) 1511 - return bp->b_addr + offset; 1512 - 1513 - page = bp->b_pages[offset >> PAGE_SHIFT]; 1514 - return page_address(page) + (offset & (PAGE_SIZE-1)); 1515 - } 1516 - 1517 - void 1518 - xfs_buf_zero( 1519 - struct xfs_buf *bp, 1520 - size_t boff, 1521 - size_t bsize) 1522 - { 1523 - size_t bend; 1524 - 1525 - bend = boff + bsize; 1526 - while (boff < bend) { 1527 - struct page *page; 1528 - int page_index, page_offset, csize; 1529 - 1530 - page_index = (boff + bp->b_offset) >> PAGE_SHIFT; 1531 - page_offset = (boff + bp->b_offset) & ~PAGE_MASK; 1532 - page = bp->b_pages[page_index]; 1533 - csize = min_t(size_t, PAGE_SIZE - page_offset, 1534 - BBTOB(bp->b_length) - boff); 1535 - 1536 - ASSERT((csize + page_offset) <= PAGE_SIZE); 1537 - 1538 - memset(page_address(page) + page_offset, 0, csize); 1539 - 1540 - boff += csize; 1541 - } 1542 1612 } 1543 1613 1544 1614 /*
+13 -16
fs/xfs/xfs_buf.h
··· 36 36 #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ 37 37 38 38 /* flags used only internally */ 39 - #define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ 40 39 #define _XBF_KMEM (1u << 21)/* backed by heap memory */ 41 40 #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ 42 41 ··· 47 48 #define XBF_LIVESCAN (1u << 28) 48 49 #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ 49 50 #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ 50 - #define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ 51 51 52 52 53 53 typedef unsigned int xfs_buf_flags_t; ··· 60 62 { XBF_STALE, "STALE" }, \ 61 63 { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ 62 64 { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ 63 - { _XBF_PAGES, "PAGES" }, \ 64 65 { _XBF_KMEM, "KMEM" }, \ 65 66 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 66 67 /* The following interface flags should never be set */ \ 67 68 { XBF_LIVESCAN, "LIVESCAN" }, \ 68 69 { XBF_INCORE, "INCORE" }, \ 69 - { XBF_TRYLOCK, "TRYLOCK" }, \ 70 - { XBF_UNMAPPED, "UNMAPPED" } 70 + { XBF_TRYLOCK, "TRYLOCK" } 71 71 72 72 /* 73 73 * Internal state flags. 
··· 119 123 /* built-in cache, if we're not using the perag one */ 120 124 struct xfs_buf_cache bt_cache[]; 121 125 }; 122 - 123 - #define XB_PAGES 2 124 126 125 127 struct xfs_buf_map { 126 128 xfs_daddr_t bm_bn; /* block number for I/O */ ··· 181 187 struct xfs_buf_log_item *b_log_item; 182 188 struct list_head b_li_list; /* Log items list head */ 183 189 struct xfs_trans *b_transp; 184 - struct page **b_pages; /* array of page pointers */ 185 - struct page *b_page_array[XB_PAGES]; /* inline pages */ 186 190 struct xfs_buf_map *b_maps; /* compound buffer map */ 187 191 struct xfs_buf_map __b_map; /* inline compound buffer map */ 188 192 int b_map_count; 189 193 atomic_t b_pin_count; /* pin count */ 190 - unsigned int b_page_count; /* size of page array */ 191 - unsigned int b_offset; /* page offset of b_addr, 192 - only for _XBF_KMEM buffers */ 193 194 int b_error; /* error code on I/O */ 194 195 void (*b_iodone)(struct xfs_buf *bp); 195 196 ··· 273 284 } 274 285 275 286 int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, 276 - xfs_buf_flags_t flags, struct xfs_buf **bpp); 287 + struct xfs_buf **bpp); 277 288 int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, 278 - size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, 289 + size_t numblks, struct xfs_buf **bpp, 279 290 const struct xfs_buf_ops *ops); 280 291 int _xfs_buf_read(struct xfs_buf *bp); 281 292 void xfs_buf_hold(struct xfs_buf *bp); ··· 304 315 #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) 305 316 extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); 306 317 void xfs_buf_ioend_fail(struct xfs_buf *); 307 - void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); 308 318 void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); 309 319 #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) 310 320 311 321 /* Buffer Utility Routines */ 312 - extern void 
*xfs_buf_offset(struct xfs_buf *, size_t); 322 + static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset) 323 + { 324 + return bp->b_addr + offset; 325 + } 326 + 327 + static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize) 328 + { 329 + memset(bp->b_addr + boff, 0, bsize); 330 + } 331 + 313 332 extern void xfs_buf_stale(struct xfs_buf *bp); 314 333 315 334 /* Delayed Write Buffer Routines */
-114
fs/xfs/xfs_buf_item.c
··· 57 57 (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); 58 58 } 59 59 60 - static inline bool 61 - xfs_buf_item_straddle( 62 - struct xfs_buf *bp, 63 - uint offset, 64 - int first_bit, 65 - int nbits) 66 - { 67 - void *first, *last; 68 - 69 - first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT)); 70 - last = xfs_buf_offset(bp, 71 - offset + ((first_bit + nbits) << XFS_BLF_SHIFT)); 72 - 73 - if (last - first != nbits * XFS_BLF_CHUNK) 74 - return true; 75 - return false; 76 - } 77 - 78 60 /* 79 61 * Return the number of log iovecs and space needed to log the given buf log 80 62 * item segment. ··· 73 91 int *nvecs, 74 92 int *nbytes) 75 93 { 76 - struct xfs_buf *bp = bip->bli_buf; 77 94 int first_bit; 78 95 int nbits; 79 - int next_bit; 80 - int last_bit; 81 96 82 97 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); 83 98 if (first_bit == -1) ··· 87 108 nbits = xfs_contig_bits(blfp->blf_data_map, 88 109 blfp->blf_map_size, first_bit); 89 110 ASSERT(nbits > 0); 90 - 91 - /* 92 - * Straddling a page is rare because we don't log contiguous 93 - * chunks of unmapped buffers anywhere. 94 - */ 95 - if (nbits > 1 && 96 - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) 97 - goto slow_scan; 98 - 99 111 (*nvecs)++; 100 112 *nbytes += nbits * XFS_BLF_CHUNK; 101 113 ··· 101 131 } while (first_bit != -1); 102 132 103 133 return; 104 - 105 - slow_scan: 106 - /* Count the first bit we jumped out of the above loop from */ 107 - (*nvecs)++; 108 - *nbytes += XFS_BLF_CHUNK; 109 - last_bit = first_bit; 110 - while (last_bit != -1) { 111 - /* 112 - * This takes the bit number to start looking from and 113 - * returns the next set bit from there. It returns -1 114 - * if there are no more bits set or the start bit is 115 - * beyond the end of the bitmap. 
116 - */ 117 - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 118 - last_bit + 1); 119 - /* 120 - * If we run out of bits, leave the loop, 121 - * else if we find a new set of bits bump the number of vecs, 122 - * else keep scanning the current set of bits. 123 - */ 124 - if (next_bit == -1) { 125 - break; 126 - } else if (next_bit != last_bit + 1 || 127 - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { 128 - last_bit = next_bit; 129 - first_bit = next_bit; 130 - (*nvecs)++; 131 - nbits = 1; 132 - } else { 133 - last_bit++; 134 - nbits++; 135 - } 136 - *nbytes += XFS_BLF_CHUNK; 137 - } 138 134 } 139 135 140 136 /* ··· 213 277 struct xfs_buf *bp = bip->bli_buf; 214 278 uint base_size; 215 279 int first_bit; 216 - int last_bit; 217 - int next_bit; 218 280 uint nbits; 219 281 220 282 /* copy the flags across from the base format item */ ··· 257 323 nbits = xfs_contig_bits(blfp->blf_data_map, 258 324 blfp->blf_map_size, first_bit); 259 325 ASSERT(nbits > 0); 260 - 261 - /* 262 - * Straddling a page is rare because we don't log contiguous 263 - * chunks of unmapped buffers anywhere. 264 - */ 265 - if (nbits > 1 && 266 - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) 267 - goto slow_scan; 268 - 269 326 xfs_buf_item_copy_iovec(lv, vecp, bp, offset, 270 327 first_bit, nbits); 271 328 blfp->blf_size++; ··· 272 347 } while (first_bit != -1); 273 348 274 349 return; 275 - 276 - slow_scan: 277 - ASSERT(bp->b_addr == NULL); 278 - last_bit = first_bit; 279 - nbits = 1; 280 - for (;;) { 281 - /* 282 - * This takes the bit number to start looking from and 283 - * returns the next set bit from there. It returns -1 284 - * if there are no more bits set or the start bit is 285 - * beyond the end of the bitmap. 286 - */ 287 - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 288 - (uint)last_bit + 1); 289 - /* 290 - * If we run out of bits fill in the last iovec and get out of 291 - * the loop. 
Else if we start a new set of bits then fill in 292 - * the iovec for the series we were looking at and start 293 - * counting the bits in the new one. Else we're still in the 294 - * same set of bits so just keep counting and scanning. 295 - */ 296 - if (next_bit == -1) { 297 - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, 298 - first_bit, nbits); 299 - blfp->blf_size++; 300 - break; 301 - } else if (next_bit != last_bit + 1 || 302 - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { 303 - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, 304 - first_bit, nbits); 305 - blfp->blf_size++; 306 - first_bit = next_bit; 307 - last_bit = next_bit; 308 - nbits = 1; 309 - } else { 310 - last_bit++; 311 - nbits++; 312 - } 313 - } 314 350 } 315 351 316 352 /*
+1 -7
fs/xfs/xfs_buf_item_recover.c
··· 1006 1006 struct xfs_mount *mp = log->l_mp; 1007 1007 struct xfs_buf *bp; 1008 1008 int error; 1009 - uint buf_flags; 1010 1009 xfs_lsn_t lsn; 1011 1010 1012 1011 /* ··· 1024 1025 } 1025 1026 1026 1027 trace_xfs_log_recover_buf_recover(log, buf_f); 1027 - 1028 - buf_flags = 0; 1029 - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 1030 - buf_flags |= XBF_UNMAPPED; 1031 - 1032 1028 error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 1033 - buf_flags, &bp, NULL); 1029 + 0, &bp, NULL); 1034 1030 if (error) 1035 1031 return error; 1036 1032
+10 -33
fs/xfs/xfs_buf_mem.c
··· 74 74 75 75 /* 76 76 * We don't want to bother with kmapping data during repair, so don't 77 - * allow highmem pages to back this mapping. 77 + * allow highmem folios to back this mapping. 78 78 */ 79 79 mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); 80 80 ··· 127 127 kfree(btp); 128 128 } 129 129 130 - /* Directly map a shmem page into the buffer cache. */ 130 + /* Directly map a shmem folio into the buffer cache. */ 131 131 int 132 - xmbuf_map_page( 132 + xmbuf_map_backing_mem( 133 133 struct xfs_buf *bp) 134 134 { 135 135 struct inode *inode = file_inode(bp->b_target->bt_file); 136 136 struct folio *folio = NULL; 137 - struct page *page; 138 137 loff_t pos = BBTOB(xfs_buf_daddr(bp)); 139 138 int error; 140 139 ··· 158 159 return -EIO; 159 160 } 160 161 161 - page = folio_file_page(folio, pos >> PAGE_SHIFT); 162 - 163 162 /* 164 - * Mark the page dirty so that it won't be reclaimed once we drop the 165 - * (potentially last) reference in xmbuf_unmap_page. 163 + * Mark the folio dirty so that it won't be reclaimed once we drop the 164 + * (potentially last) reference in xfs_buf_free. 166 165 */ 167 - set_page_dirty(page); 168 - unlock_page(page); 166 + folio_set_dirty(folio); 167 + folio_unlock(folio); 169 168 170 - bp->b_addr = page_address(page); 171 - bp->b_pages = bp->b_page_array; 172 - bp->b_pages[0] = page; 173 - bp->b_page_count = 1; 169 + bp->b_addr = folio_address(folio); 174 170 return 0; 175 - } 176 - 177 - /* Unmap a shmem page that was mapped into the buffer cache. */ 178 - void 179 - xmbuf_unmap_page( 180 - struct xfs_buf *bp) 181 - { 182 - struct page *page = bp->b_pages[0]; 183 - 184 - ASSERT(xfs_buftarg_is_mem(bp->b_target)); 185 - 186 - put_page(page); 187 - 188 - bp->b_addr = NULL; 189 - bp->b_pages[0] = NULL; 190 - bp->b_pages = NULL; 191 - bp->b_page_count = 0; 192 171 } 193 172 194 173 /* Is this a valid daddr within the buftarg? 
*/ ··· 182 205 return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); 183 206 } 184 207 185 - /* Discard the page backing this buffer. */ 208 + /* Discard the folio backing this buffer. */ 186 209 static void 187 210 xmbuf_stale( 188 211 struct xfs_buf *bp) ··· 197 220 } 198 221 199 222 /* 200 - * Finalize a buffer -- discard the backing page if it's stale, or run the 223 + * Finalize a buffer -- discard the backing folio if it's stale, or run the 201 224 * write verifier to detect problems. 202 225 */ 203 226 int
+2 -4
fs/xfs/xfs_buf_mem.h
··· 19 19 struct xfs_buftarg **btpp); 20 20 void xmbuf_free(struct xfs_buftarg *btp); 21 21 22 - int xmbuf_map_page(struct xfs_buf *bp); 23 - void xmbuf_unmap_page(struct xfs_buf *bp); 24 22 bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr); 25 23 void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); 26 24 int xmbuf_finalize(struct xfs_buf *bp); 27 25 #else 28 26 # define xfs_buftarg_is_mem(...) (false) 29 - # define xmbuf_map_page(...) (-ENOMEM) 30 - # define xmbuf_unmap_page(...) ((void)0) 31 27 # define xmbuf_verify_daddr(...) (false) 32 28 #endif /* CONFIG_XFS_MEMORY_BUFS */ 29 + 30 + int xmbuf_map_backing_mem(struct xfs_buf *bp); 33 31 34 32 #endif /* __XFS_BUF_MEM_H__ */
+2 -1
fs/xfs/xfs_discard.c
··· 844 844 845 845 if (!capable(CAP_SYS_ADMIN)) 846 846 return -EPERM; 847 - if (mp->m_rtdev_targp && 847 + 848 + if (mp->m_rtdev_targp && !xfs_has_zoned(mp) && 848 849 bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev)) 849 850 rt_bdev = mp->m_rtdev_targp->bt_bdev; 850 851 if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
+1 -1
fs/xfs/xfs_extent_busy.c
··· 671 671 while ((pag = xfs_perag_next(mp, pag))) 672 672 xfs_extent_busy_wait_group(pag_group(pag)); 673 673 674 - if (xfs_has_rtgroups(mp)) 674 + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) 675 675 while ((rtg = xfs_rtgroup_next(mp, rtg))) 676 676 xfs_extent_busy_wait_group(rtg_group(rtg)); 677 677 }
+25 -10
fs/xfs/xfs_extfree_item.c
··· 29 29 #include "xfs_inode.h" 30 30 #include "xfs_rtbitmap.h" 31 31 #include "xfs_rtgroup.h" 32 + #include "xfs_zone_alloc.h" 32 33 33 34 struct kmem_cache *xfs_efi_cache; 34 35 struct kmem_cache *xfs_efd_cache; ··· 768 767 769 768 trace_xfs_extent_free_deferred(mp, xefi); 770 769 771 - if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) { 772 - if (*rtgp != to_rtg(xefi->xefi_group)) { 773 - *rtgp = to_rtg(xefi->xefi_group); 774 - xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP); 775 - xfs_rtgroup_trans_join(tp, *rtgp, 776 - XFS_RTGLOCK_BITMAP); 777 - } 778 - error = xfs_rtfree_blocks(tp, *rtgp, 779 - xefi->xefi_startblock, xefi->xefi_blockcount); 770 + if (xefi->xefi_flags & XFS_EFI_CANCELLED) 771 + goto done; 772 + 773 + if (*rtgp != to_rtg(xefi->xefi_group)) { 774 + unsigned int lock_flags; 775 + 776 + if (xfs_has_zoned(mp)) 777 + lock_flags = XFS_RTGLOCK_RMAP; 778 + else 779 + lock_flags = XFS_RTGLOCK_BITMAP; 780 + 781 + *rtgp = to_rtg(xefi->xefi_group); 782 + xfs_rtgroup_lock(*rtgp, lock_flags); 783 + xfs_rtgroup_trans_join(tp, *rtgp, lock_flags); 780 784 } 785 + 786 + if (xfs_has_zoned(mp)) { 787 + error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock, 788 + xefi->xefi_blockcount); 789 + } else { 790 + error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock, 791 + xefi->xefi_blockcount); 792 + } 793 + 781 794 if (error == -EAGAIN) { 782 795 xfs_efd_from_efi(efdp); 783 796 return error; 784 797 } 785 - 798 + done: 786 799 xfs_efd_add_extent(efdp, xefi); 787 800 xfs_extent_free_cancel_item(item); 788 801 return error;
+309 -38
fs/xfs/xfs_file.c
··· 25 25 #include "xfs_iomap.h" 26 26 #include "xfs_reflink.h" 27 27 #include "xfs_file.h" 28 + #include "xfs_aops.h" 29 + #include "xfs_zone_alloc.h" 28 30 29 31 #include <linux/dax.h> 30 32 #include <linux/falloc.h> ··· 152 150 * ensure newly written file data make it to disk before logging the new 153 151 * inode size in case of an extending write. 154 152 */ 155 - if (XFS_IS_REALTIME_INODE(ip)) 153 + if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) 156 154 error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); 157 155 else if (mp->m_logdev_targp != mp->m_ddev_targp) 158 156 error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); ··· 362 360 struct iov_iter *from, 363 361 unsigned int *iolock, 364 362 size_t count, 365 - bool *drained_dio) 363 + bool *drained_dio, 364 + struct xfs_zone_alloc_ctx *ac) 366 365 { 367 366 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); 368 367 loff_t isize; ··· 417 414 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); 418 415 419 416 xfs_ilock(ip, XFS_MMAPLOCK_EXCL); 420 - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); 417 + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); 421 418 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); 422 419 423 420 return error; ··· 434 431 xfs_file_write_checks( 435 432 struct kiocb *iocb, 436 433 struct iov_iter *from, 437 - unsigned int *iolock) 434 + unsigned int *iolock, 435 + struct xfs_zone_alloc_ctx *ac) 438 436 { 439 437 struct inode *inode = iocb->ki_filp->f_mapping->host; 440 438 size_t count = iov_iter_count(from); ··· 485 481 */ 486 482 if (iocb->ki_pos > i_size_read(inode)) { 487 483 error = xfs_file_write_zero_eof(iocb, from, iolock, count, 488 - &drained_dio); 484 + &drained_dio, ac); 489 485 if (error == 1) 490 486 goto restart; 491 487 if (error) ··· 493 489 } 494 490 495 491 return kiocb_modified(iocb); 492 + } 493 + 494 + static ssize_t 495 + xfs_zoned_write_space_reserve( 496 + struct xfs_inode *ip, 497 + struct kiocb 
*iocb, 498 + struct iov_iter *from, 499 + unsigned int flags, 500 + struct xfs_zone_alloc_ctx *ac) 501 + { 502 + loff_t count = iov_iter_count(from); 503 + int error; 504 + 505 + if (iocb->ki_flags & IOCB_NOWAIT) 506 + flags |= XFS_ZR_NOWAIT; 507 + 508 + /* 509 + * Check the rlimit and LFS boundary first so that we don't over-reserve 510 + * by possibly a lot. 511 + * 512 + * The generic write path will redo this check later, and it might have 513 + * changed by then. If it got expanded we'll stick to our earlier 514 + * smaller limit, and if it is decreased the new smaller limit will be 515 + * used and our extra space reservation will be returned after finishing 516 + * the write. 517 + */ 518 + error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); 519 + if (error) 520 + return error; 521 + 522 + /* 523 + * Sloppily round up count to file system blocks. 524 + * 525 + * This will often reserve an extra block, but that avoids having to look 526 + * at the start offset, which isn't stable for O_APPEND until taking the 527 + * iolock. Also we need to reserve a block each for zeroing the old 528 + * EOF block and the new start block if they are unaligned. 529 + * 530 + * Any remaining block will be returned after the write. 
531 + */ 532 + return xfs_zoned_space_reserve(ip, 533 + XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac); 496 534 } 497 535 498 536 static int ··· 548 502 struct xfs_inode *ip = XFS_I(inode); 549 503 loff_t offset = iocb->ki_pos; 550 504 unsigned int nofs_flag; 505 + 506 + ASSERT(!xfs_is_zoned_inode(ip) || 507 + !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); 551 508 552 509 trace_xfs_end_io_direct_write(ip, offset, size); 553 510 ··· 631 582 .end_io = xfs_dio_write_end_io, 632 583 }; 633 584 585 + static void 586 + xfs_dio_zoned_submit_io( 587 + const struct iomap_iter *iter, 588 + struct bio *bio, 589 + loff_t file_offset) 590 + { 591 + struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; 592 + struct xfs_zone_alloc_ctx *ac = iter->private; 593 + xfs_filblks_t count_fsb; 594 + struct iomap_ioend *ioend; 595 + 596 + count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); 597 + if (count_fsb > ac->reserved_blocks) { 598 + xfs_err(mp, 599 + "allocation (%lld) larger than reservation (%lld).", 600 + count_fsb, ac->reserved_blocks); 601 + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 602 + bio_io_error(bio); 603 + return; 604 + } 605 + ac->reserved_blocks -= count_fsb; 606 + 607 + bio->bi_end_io = xfs_end_bio; 608 + ioend = iomap_init_ioend(iter->inode, bio, file_offset, 609 + IOMAP_IOEND_DIRECT); 610 + xfs_zone_alloc_and_submit(ioend, &ac->open_zone); 611 + } 612 + 613 + static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { 614 + .bio_set = &iomap_ioend_bioset, 615 + .submit_io = xfs_dio_zoned_submit_io, 616 + .end_io = xfs_dio_write_end_io, 617 + }; 618 + 634 619 /* 635 - * Handle block aligned direct I/O writes 620 + * Handle block aligned direct I/O writes. 
636 621 */ 637 622 static noinline ssize_t 638 623 xfs_file_dio_write_aligned( 639 624 struct xfs_inode *ip, 640 625 struct kiocb *iocb, 641 - struct iov_iter *from) 626 + struct iov_iter *from, 627 + const struct iomap_ops *ops, 628 + const struct iomap_dio_ops *dops, 629 + struct xfs_zone_alloc_ctx *ac) 642 630 { 643 631 unsigned int iolock = XFS_IOLOCK_SHARED; 644 632 ssize_t ret; ··· 683 597 ret = xfs_ilock_iocb_for_write(iocb, &iolock); 684 598 if (ret) 685 599 return ret; 686 - ret = xfs_file_write_checks(iocb, from, &iolock); 600 + ret = xfs_file_write_checks(iocb, from, &iolock, ac); 687 601 if (ret) 688 602 goto out_unlock; 689 603 ··· 697 611 iolock = XFS_IOLOCK_SHARED; 698 612 } 699 613 trace_xfs_file_direct_write(iocb, from); 700 - ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, 701 - &xfs_dio_write_ops, 0, NULL, 0); 614 + ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); 702 615 out_unlock: 703 - if (iolock) 704 - xfs_iunlock(ip, iolock); 616 + xfs_iunlock(ip, iolock); 617 + return ret; 618 + } 619 + 620 + /* 621 + * Handle block aligned direct I/O writes to zoned devices. 
622 + */ 623 + static noinline ssize_t 624 + xfs_file_dio_write_zoned( 625 + struct xfs_inode *ip, 626 + struct kiocb *iocb, 627 + struct iov_iter *from) 628 + { 629 + struct xfs_zone_alloc_ctx ac = { }; 630 + ssize_t ret; 631 + 632 + ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac); 633 + if (ret < 0) 634 + return ret; 635 + ret = xfs_file_dio_write_aligned(ip, iocb, from, 636 + &xfs_zoned_direct_write_iomap_ops, 637 + &xfs_dio_zoned_write_ops, &ac); 638 + xfs_zoned_space_unreserve(ip, &ac); 705 639 return ret; 706 640 } 707 641 ··· 781 675 goto out_unlock; 782 676 } 783 677 784 - ret = xfs_file_write_checks(iocb, from, &iolock); 678 + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 785 679 if (ret) 786 680 goto out_unlock; 787 681 ··· 827 721 /* direct I/O must be aligned to device logical sector size */ 828 722 if ((iocb->ki_pos | count) & target->bt_logical_sectormask) 829 723 return -EINVAL; 830 - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) 724 + 725 + /* 726 + * For always COW inodes we also must check the alignment of each 727 + * individual iovec segment, as they could end up with different 728 + * I/Os due to the way bio_iov_iter_get_pages works, and we'd 729 + * then overwrite an already written block. 
730 + */ 731 + if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || 732 + (xfs_is_always_cow_inode(ip) && 733 + (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) 831 734 return xfs_file_dio_write_unaligned(ip, iocb, from); 832 - return xfs_file_dio_write_aligned(ip, iocb, from); 735 + if (xfs_is_zoned_inode(ip)) 736 + return xfs_file_dio_write_zoned(ip, iocb, from); 737 + return xfs_file_dio_write_aligned(ip, iocb, from, 738 + &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); 833 739 } 834 740 835 741 static noinline ssize_t ··· 858 740 ret = xfs_ilock_iocb(iocb, iolock); 859 741 if (ret) 860 742 return ret; 861 - ret = xfs_file_write_checks(iocb, from, &iolock); 743 + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 862 744 if (ret) 863 745 goto out; 864 746 ··· 902 784 if (ret) 903 785 return ret; 904 786 905 - ret = xfs_file_write_checks(iocb, from, &iolock); 787 + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); 906 788 if (ret) 907 789 goto out; 908 790 ··· 944 826 if (ret > 0) { 945 827 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); 946 828 /* Handle various SYNC-type writes */ 829 + ret = generic_write_sync(iocb, ret); 830 + } 831 + return ret; 832 + } 833 + 834 + STATIC ssize_t 835 + xfs_file_buffered_write_zoned( 836 + struct kiocb *iocb, 837 + struct iov_iter *from) 838 + { 839 + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); 840 + struct xfs_mount *mp = ip->i_mount; 841 + unsigned int iolock = XFS_IOLOCK_EXCL; 842 + bool cleared_space = false; 843 + struct xfs_zone_alloc_ctx ac = { }; 844 + ssize_t ret; 845 + 846 + ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac); 847 + if (ret < 0) 848 + return ret; 849 + 850 + ret = xfs_ilock_iocb(iocb, iolock); 851 + if (ret) 852 + goto out_unreserve; 853 + 854 + ret = xfs_file_write_checks(iocb, from, &iolock, &ac); 855 + if (ret) 856 + goto out_unlock; 857 + 858 + /* 859 + * Truncate the iter to the length that we were actually able to 860 + * 
allocate blocks for. This needs to happen after 861 + * xfs_file_write_checks, because that assigns ki_pos for O_APPEND 862 + * writes. 863 + */ 864 + iov_iter_truncate(from, 865 + XFS_FSB_TO_B(mp, ac.reserved_blocks) - 866 + (iocb->ki_pos & mp->m_blockmask)); 867 + if (!iov_iter_count(from)) 868 + goto out_unlock; 869 + 870 + retry: 871 + trace_xfs_file_buffered_write(iocb, from); 872 + ret = iomap_file_buffered_write(iocb, from, 873 + &xfs_buffered_write_iomap_ops, &ac); 874 + if (ret == -ENOSPC && !cleared_space) { 875 + /* 876 + * Kick off writeback to convert delalloc space and release the 877 + * usually too pessimistic indirect block reservations. 878 + */ 879 + xfs_flush_inodes(mp); 880 + cleared_space = true; 881 + goto retry; 882 + } 883 + 884 + out_unlock: 885 + xfs_iunlock(ip, iolock); 886 + out_unreserve: 887 + xfs_zoned_space_unreserve(ip, &ac); 888 + if (ret > 0) { 889 + XFS_STATS_ADD(mp, xs_write_bytes, ret); 947 890 ret = generic_write_sync(iocb, ret); 948 891 } 949 892 return ret; ··· 1057 878 return ret; 1058 879 } 1059 880 881 + if (xfs_is_zoned_inode(ip)) 882 + return xfs_file_buffered_write_zoned(iocb, from); 1060 883 return xfs_file_buffered_write(iocb, from); 1061 884 } 1062 885 ··· 1113 932 xfs_falloc_collapse_range( 1114 933 struct file *file, 1115 934 loff_t offset, 1116 - loff_t len) 935 + loff_t len, 936 + struct xfs_zone_alloc_ctx *ac) 1117 937 { 1118 938 struct inode *inode = file_inode(file); 1119 939 loff_t new_size = i_size_read(inode) - len; ··· 1130 948 if (offset + len >= i_size_read(inode)) 1131 949 return -EINVAL; 1132 950 1133 - error = xfs_collapse_file_space(XFS_I(inode), offset, len); 951 + error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); 1134 952 if (error) 1135 953 return error; 1136 954 return xfs_falloc_setsize(file, new_size); ··· 1186 1004 struct file *file, 1187 1005 int mode, 1188 1006 loff_t offset, 1189 - loff_t len) 1007 + loff_t len, 1008 + struct xfs_zone_alloc_ctx *ac) 1190 1009 { 1191 1010 
struct inode *inode = file_inode(file); 1192 1011 unsigned int blksize = i_blocksize(inode); ··· 1200 1017 if (error) 1201 1018 return error; 1202 1019 1203 - error = xfs_free_file_space(XFS_I(inode), offset, len); 1020 + error = xfs_free_file_space(XFS_I(inode), offset, len, ac); 1204 1021 if (error) 1205 1022 return error; 1206 1023 ··· 1271 1088 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) 1272 1089 1273 1090 STATIC long 1274 - xfs_file_fallocate( 1091 + __xfs_file_fallocate( 1275 1092 struct file *file, 1276 1093 int mode, 1277 1094 loff_t offset, 1278 - loff_t len) 1095 + loff_t len, 1096 + struct xfs_zone_alloc_ctx *ac) 1279 1097 { 1280 1098 struct inode *inode = file_inode(file); 1281 1099 struct xfs_inode *ip = XFS_I(inode); 1282 1100 long error; 1283 1101 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 1284 - 1285 - if (!S_ISREG(inode->i_mode)) 1286 - return -EINVAL; 1287 - if (mode & ~XFS_FALLOC_FL_SUPPORTED) 1288 - return -EOPNOTSUPP; 1289 1102 1290 1103 xfs_ilock(ip, iolock); 1291 1104 error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); ··· 1303 1124 1304 1125 switch (mode & FALLOC_FL_MODE_MASK) { 1305 1126 case FALLOC_FL_PUNCH_HOLE: 1306 - error = xfs_free_file_space(ip, offset, len); 1127 + error = xfs_free_file_space(ip, offset, len, ac); 1307 1128 break; 1308 1129 case FALLOC_FL_COLLAPSE_RANGE: 1309 - error = xfs_falloc_collapse_range(file, offset, len); 1130 + error = xfs_falloc_collapse_range(file, offset, len, ac); 1310 1131 break; 1311 1132 case FALLOC_FL_INSERT_RANGE: 1312 1133 error = xfs_falloc_insert_range(file, offset, len); 1313 1134 break; 1314 1135 case FALLOC_FL_ZERO_RANGE: 1315 - error = xfs_falloc_zero_range(file, mode, offset, len); 1136 + error = xfs_falloc_zero_range(file, mode, offset, len, ac); 1316 1137 break; 1317 1138 case FALLOC_FL_UNSHARE_RANGE: 1318 1139 error = xfs_falloc_unshare_range(file, mode, offset, len); ··· 1331 1152 out_unlock: 1332 1153 xfs_iunlock(ip, iolock); 1333 1154 return error; 1155 + } 1156 + 
1157 + static long 1158 + xfs_file_zoned_fallocate( 1159 + struct file *file, 1160 + int mode, 1161 + loff_t offset, 1162 + loff_t len) 1163 + { 1164 + struct xfs_zone_alloc_ctx ac = { }; 1165 + struct xfs_inode *ip = XFS_I(file_inode(file)); 1166 + int error; 1167 + 1168 + error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac); 1169 + if (error) 1170 + return error; 1171 + error = __xfs_file_fallocate(file, mode, offset, len, &ac); 1172 + xfs_zoned_space_unreserve(ip, &ac); 1173 + return error; 1174 + } 1175 + 1176 + static long 1177 + xfs_file_fallocate( 1178 + struct file *file, 1179 + int mode, 1180 + loff_t offset, 1181 + loff_t len) 1182 + { 1183 + struct inode *inode = file_inode(file); 1184 + 1185 + if (!S_ISREG(inode->i_mode)) 1186 + return -EINVAL; 1187 + if (mode & ~XFS_FALLOC_FL_SUPPORTED) 1188 + return -EOPNOTSUPP; 1189 + 1190 + /* 1191 + * For zoned file systems, zeroing the first and last block of a hole 1192 + * punch requires allocating a new block to rewrite the remaining data 1193 + * and new zeroes out of place. Get a reservations for those before 1194 + * taking the iolock. Dip into the reserved pool because we are 1195 + * expected to be able to punch a hole even on a completely full 1196 + * file system. 1197 + */ 1198 + if (xfs_is_zoned_inode(XFS_I(inode)) && 1199 + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | 1200 + FALLOC_FL_COLLAPSE_RANGE))) 1201 + return xfs_file_zoned_fallocate(file, mode, offset, len); 1202 + return __xfs_file_fallocate(file, mode, offset, len, NULL); 1334 1203 } 1335 1204 1336 1205 STATIC int ··· 1574 1347 * blocks. This avoids open/read/close workloads from removing EOF 1575 1348 * blocks that other writers depend upon to reduce fragmentation. 1576 1349 * 1350 + * Inodes on the zoned RT device never have preallocations, so skip 1351 + * taking the locks below. 
1352 + */ 1353 + if (!inode->i_nlink || 1354 + !(file->f_mode & FMODE_WRITE) || 1355 + (ip->i_diflags & XFS_DIFLAG_APPEND) || 1356 + xfs_is_zoned_inode(ip)) 1357 + return 0; 1358 + 1359 + /* 1577 1360 * If we can't get the iolock just skip truncating the blocks past EOF 1578 1361 * because we could deadlock with the mmap_lock otherwise. We'll get 1579 1362 * another chance to drop them once the last reference to the inode is 1580 1363 * dropped, so we'll never leak blocks permanently. 1581 1364 */ 1582 - if (inode->i_nlink && 1583 - (file->f_mode & FMODE_WRITE) && 1584 - !(ip->i_diflags & XFS_DIFLAG_APPEND) && 1585 - !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && 1365 + if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && 1586 1366 xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1587 1367 if (xfs_can_free_eofblocks(ip) && 1588 1368 !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) ··· 1703 1469 * i_lock (XFS - extent map serialisation) 1704 1470 */ 1705 1471 static vm_fault_t 1706 - xfs_write_fault( 1472 + __xfs_write_fault( 1707 1473 struct vm_fault *vmf, 1708 - unsigned int order) 1474 + unsigned int order, 1475 + struct xfs_zone_alloc_ctx *ac) 1709 1476 { 1710 1477 struct inode *inode = file_inode(vmf->vma->vm_file); 1711 1478 struct xfs_inode *ip = XFS_I(inode); ··· 1734 1499 ret = xfs_dax_fault_locked(vmf, order, true); 1735 1500 else 1736 1501 ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, 1737 - NULL); 1502 + ac); 1738 1503 xfs_iunlock(ip, lock_mode); 1739 1504 1740 1505 sb_end_pagefault(inode->i_sb); 1741 1506 return ret; 1507 + } 1508 + 1509 + static vm_fault_t 1510 + xfs_write_fault_zoned( 1511 + struct vm_fault *vmf, 1512 + unsigned int order) 1513 + { 1514 + struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); 1515 + unsigned int len = folio_size(page_folio(vmf->page)); 1516 + struct xfs_zone_alloc_ctx ac = { }; 1517 + int error; 1518 + vm_fault_t ret; 1519 + 1520 + /* 1521 + * This could over-allocate as it doesn't check for 
truncation. 1522 + * 1523 + * But as the overallocation is limited to less than a folio and will be 1524 + * release instantly that's just fine. 1525 + */ 1526 + error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0, 1527 + &ac); 1528 + if (error < 0) 1529 + return vmf_fs_error(error); 1530 + ret = __xfs_write_fault(vmf, order, &ac); 1531 + xfs_zoned_space_unreserve(ip, &ac); 1532 + return ret; 1533 + } 1534 + 1535 + static vm_fault_t 1536 + xfs_write_fault( 1537 + struct vm_fault *vmf, 1538 + unsigned int order) 1539 + { 1540 + if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file)))) 1541 + return xfs_write_fault_zoned(vmf, order); 1542 + return __xfs_write_fault(vmf, order, NULL); 1742 1543 } 1743 1544 1744 1545 static inline bool
+68 -18
fs/xfs/xfs_fsmap.c
··· 879 879 struct xfs_mount *mp = tp->t_mountp; 880 880 struct xfs_rtgroup *rtg = NULL; 881 881 struct xfs_btree_cur *bt_cur = NULL; 882 + xfs_daddr_t rtstart_daddr; 882 883 xfs_rtblock_t start_rtb; 883 884 xfs_rtblock_t end_rtb; 884 885 xfs_rgnumber_t start_rg, end_rg; 885 886 uint64_t eofs; 886 887 int error = 0; 887 888 888 - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 889 + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); 889 890 if (keys[0].fmr_physical >= eofs) 890 891 return 0; 891 - start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical); 892 - end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); 892 + 893 + rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); 894 + if (keys[0].fmr_physical < rtstart_daddr) { 895 + struct xfs_fsmap_irec frec = { 896 + .owner = XFS_RMAP_OWN_FS, 897 + .len_daddr = rtstart_daddr, 898 + }; 899 + 900 + /* Adjust the low key if we are continuing from where we left off. */ 901 + if (keys[0].fmr_length > 0) { 902 + info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length; 903 + return 0; 904 + } 905 + 906 + /* Fabricate an rmap entry for space occupied by the data dev */ 907 + error = xfs_getfsmap_helper(tp, info, &frec); 908 + if (error) 909 + return error; 910 + } 911 + 912 + start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical); 913 + end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + 914 + min(eofs - 1, keys[1].fmr_physical)); 893 915 894 916 info->missing_owner = XFS_FMR_OWN_FREE; 895 917 ··· 1026 1004 } 1027 1005 #endif /* CONFIG_XFS_RT */ 1028 1006 1007 + static uint32_t 1008 + xfs_getfsmap_device( 1009 + struct xfs_mount *mp, 1010 + enum xfs_device dev) 1011 + { 1012 + if (mp->m_sb.sb_rtstart) 1013 + return dev; 1014 + 1015 + switch (dev) { 1016 + case XFS_DEV_DATA: 1017 + return new_encode_dev(mp->m_ddev_targp->bt_dev); 1018 + case XFS_DEV_LOG: 1019 + return new_encode_dev(mp->m_logdev_targp->bt_dev); 1020 + case XFS_DEV_RT: 1021 + if (!mp->m_rtdev_targp) 1022 + 
break; 1023 + return new_encode_dev(mp->m_rtdev_targp->bt_dev); 1024 + } 1025 + 1026 + return -1; 1027 + } 1028 + 1029 1029 /* Do we recognize the device? */ 1030 1030 STATIC bool 1031 1031 xfs_getfsmap_is_valid_device( 1032 1032 struct xfs_mount *mp, 1033 1033 struct xfs_fsmap *fm) 1034 1034 { 1035 - if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || 1036 - fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) 1037 - return true; 1038 - if (mp->m_logdev_targp && 1039 - fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) 1040 - return true; 1041 - if (mp->m_rtdev_targp && 1042 - fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) 1043 - return true; 1044 - return false; 1035 + return fm->fmr_device == 0 || 1036 + fm->fmr_device == UINT_MAX || 1037 + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) || 1038 + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) || 1039 + (mp->m_rtdev_targp && 1040 + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT)); 1045 1041 } 1046 1042 1047 1043 /* Ensure that the low key is less than the high key. */ ··· 1166 1126 /* Set up our device handlers. 
*/ 1167 1127 memset(handlers, 0, sizeof(handlers)); 1168 1128 handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1169 - handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); 1129 + handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA); 1170 1130 if (use_rmap) 1171 1131 handlers[0].fn = xfs_getfsmap_datadev_rmapbt; 1172 1132 else ··· 1174 1134 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1175 1135 handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, 1176 1136 mp->m_sb.sb_logblocks); 1177 - handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); 1137 + handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG); 1178 1138 handlers[1].fn = xfs_getfsmap_logdev; 1179 1139 } 1180 1140 #ifdef CONFIG_XFS_RT 1181 - if (mp->m_rtdev_targp) { 1141 + /* 1142 + * For zoned file systems there is no rtbitmap, so only support fsmap 1143 + * if the callers is privileged enough to use the full rmap version. 1144 + */ 1145 + if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) { 1182 1146 handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 1183 - handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); 1147 + handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT); 1184 1148 if (use_rmap) 1185 1149 handlers[2].fn = xfs_getfsmap_rtdev_rmapbt; 1186 1150 else ··· 1274 1230 1275 1231 if (tp) 1276 1232 xfs_trans_cancel(tp); 1277 - head->fmh_oflags = FMH_OF_DEV_T; 1233 + 1234 + /* 1235 + * For internal RT device we need to report different synthetic devices 1236 + * for a single physical device, and thus can't report the actual dev_t. 1237 + */ 1238 + if (!mp->m_sb.sb_rtstart) 1239 + head->fmh_oflags = FMH_OF_DEV_T; 1278 1240 return error; 1279 1241 } 1280 1242
+36 -31
fs/xfs/xfs_fsops.c
··· 24 24 #include "xfs_rtalloc.h" 25 25 #include "xfs_rtrmap_btree.h" 26 26 #include "xfs_rtrefcount_btree.h" 27 + #include "xfs_metafile.h" 27 28 28 29 /* 29 30 * Write new AG headers to disk. Non-transactional, but need to be ··· 111 110 if (nb > mp->m_sb.sb_dblocks) { 112 111 error = xfs_buf_read_uncached(mp->m_ddev_targp, 113 112 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 114 - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); 113 + XFS_FSS_TO_BB(mp, 1), &bp, NULL); 115 114 if (error) 116 115 return error; 117 116 xfs_buf_relse(bp); ··· 301 300 struct xfs_mount *mp, 302 301 struct xfs_growfs_data *in) 303 302 { 304 - int error = 0; 303 + int error; 305 304 306 305 if (!capable(CAP_SYS_ADMIN)) 307 306 return -EPERM; 308 307 if (!mutex_trylock(&mp->m_growlock)) 309 308 return -EWOULDBLOCK; 310 309 310 + /* we can't grow the data section when an internal RT section exists */ 311 + if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) { 312 + error = -EINVAL; 313 + goto out_unlock; 314 + } 315 + 311 316 /* update imaxpct separately to the physical grow of the filesystem */ 312 317 if (in->imaxpct != mp->m_sb.sb_imax_pct) { 313 318 error = xfs_growfs_imaxpct(mp, in->imaxpct); 314 319 if (error) 315 - goto out_error; 320 + goto out_unlock; 316 321 } 317 322 318 323 if (in->newblocks != mp->m_sb.sb_dblocks) { 319 324 error = xfs_growfs_data_private(mp, in); 320 325 if (error) 321 - goto out_error; 326 + goto out_unlock; 322 327 } 323 328 324 329 /* Post growfs calculations needed to reflect new state in operations */ ··· 338 331 /* Update secondary superblocks now the physical grow has completed */ 339 332 error = xfs_update_secondary_sbs(mp); 340 333 341 - out_error: 342 334 /* 343 - * Increment the generation unconditionally, the error could be from 344 - * updating the secondary superblocks, in which case the new size 345 - * is live already. 
335 + * Increment the generation unconditionally, after trying to update the 336 + * secondary superblocks, as the new size is live already at this point. 346 337 */ 347 338 mp->m_generation++; 339 + out_unlock: 348 340 mutex_unlock(&mp->m_growlock); 349 341 return error; 350 342 } ··· 372 366 int 373 367 xfs_reserve_blocks( 374 368 struct xfs_mount *mp, 369 + enum xfs_free_counter ctr, 375 370 uint64_t request) 376 371 { 377 372 int64_t lcounter, delta; 378 373 int64_t fdblks_delta = 0; 379 374 int64_t free; 380 375 int error = 0; 376 + 377 + ASSERT(ctr < XC_FREE_NR); 381 378 382 379 /* 383 380 * With per-cpu counters, this becomes an interesting problem. we need ··· 400 391 * counters directly since we shouldn't have any problems unreserving 401 392 * space. 402 393 */ 403 - if (mp->m_resblks > request) { 404 - lcounter = mp->m_resblks_avail - request; 394 + if (mp->m_free[ctr].res_total > request) { 395 + lcounter = mp->m_free[ctr].res_avail - request; 405 396 if (lcounter > 0) { /* release unused blocks */ 406 397 fdblks_delta = lcounter; 407 - mp->m_resblks_avail -= lcounter; 398 + mp->m_free[ctr].res_avail -= lcounter; 408 399 } 409 - mp->m_resblks = request; 400 + mp->m_free[ctr].res_total = request; 410 401 if (fdblks_delta) { 411 402 spin_unlock(&mp->m_sb_lock); 412 - xfs_add_fdblocks(mp, fdblks_delta); 403 + xfs_add_freecounter(mp, ctr, fdblks_delta); 413 404 spin_lock(&mp->m_sb_lock); 414 405 } 415 406 ··· 418 409 419 410 /* 420 411 * If the request is larger than the current reservation, reserve the 421 - * blocks before we update the reserve counters. Sample m_fdblocks and 412 + * blocks before we update the reserve counters. Sample m_free and 422 413 * perform a partial reservation if the request exceeds free space. 423 414 * 424 415 * The code below estimates how many blocks it can request from ··· 428 419 * space to fill it because mod_fdblocks will refill an undersized 429 420 * reserve when it can. 
430 421 */ 431 - free = percpu_counter_sum(&mp->m_fdblocks) - 432 - xfs_fdblocks_unavailable(mp); 433 - delta = request - mp->m_resblks; 434 - mp->m_resblks = request; 422 + free = xfs_sum_freecounter_raw(mp, ctr) - 423 + xfs_freecounter_unavailable(mp, ctr); 424 + delta = request - mp->m_free[ctr].res_total; 425 + mp->m_free[ctr].res_total = request; 435 426 if (delta > 0 && free > 0) { 436 427 /* 437 428 * We'll either succeed in getting space from the free block ··· 445 436 */ 446 437 fdblks_delta = min(free, delta); 447 438 spin_unlock(&mp->m_sb_lock); 448 - error = xfs_dec_fdblocks(mp, fdblks_delta, 0); 439 + error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0); 449 440 if (!error) 450 - xfs_add_fdblocks(mp, fdblks_delta); 441 + xfs_add_freecounter(mp, ctr, fdblks_delta); 451 442 spin_lock(&mp->m_sb_lock); 452 443 } 453 444 out: ··· 567 558 return error; 568 559 } 569 560 570 - if (xfs_has_realtime(mp)) { 571 - err2 = xfs_rt_resv_init(mp); 572 - if (err2 && err2 != -ENOSPC) { 573 - xfs_warn(mp, 574 - "Error %d reserving realtime metadata reserve pool.", err2); 575 - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 576 - } 561 + err2 = xfs_metafile_resv_init(mp); 562 + if (err2 && err2 != -ENOSPC) { 563 + xfs_warn(mp, 564 + "Error %d reserving realtime metadata reserve pool.", err2); 565 + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 577 566 578 - if (err2 && !error) 567 + if (!error) 579 568 error = err2; 580 569 } 581 570 ··· 589 582 { 590 583 struct xfs_perag *pag = NULL; 591 584 592 - if (xfs_has_realtime(mp)) 593 - xfs_rt_resv_free(mp); 594 - 585 + xfs_metafile_resv_free(mp); 595 586 while ((pag = xfs_perag_next(mp, pag))) 596 587 xfs_ag_resv_free(pag); 597 588 }
+2 -1
fs/xfs/xfs_fsops.h
··· 8 8 9 9 int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); 10 10 int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); 11 - int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); 11 + int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt, 12 + uint64_t request); 12 13 int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); 13 14 14 15 int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+3 -3
fs/xfs/xfs_icache.c
··· 2073 2073 { 2074 2074 struct xfs_mount *mp = ip->i_mount; 2075 2075 2076 - if (!XFS_IS_REALTIME_INODE(ip)) 2076 + if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp)) 2077 2077 return false; 2078 2078 2079 - if (__percpu_counter_compare(&mp->m_frextents, 2079 + if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS, 2080 2080 mp->m_low_rtexts[XFS_LOWSP_5_PCNT], 2081 2081 XFS_FDBLOCKS_BATCH) < 0) 2082 2082 return true; ··· 2104 2104 if (items > mp->m_ino_geo.inodes_per_cluster) 2105 2105 return true; 2106 2106 2107 - if (__percpu_counter_compare(&mp->m_fdblocks, 2107 + if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS, 2108 2108 mp->m_low_space[XFS_LOWSP_5_PCNT], 2109 2109 XFS_FDBLOCKS_BATCH) < 0) 2110 2110 return true;
+3 -3
fs/xfs/xfs_inode.c
··· 1721 1721 * to mark all the active inodes on the buffer stale. 1722 1722 */ 1723 1723 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 1724 - mp->m_bsize * igeo->blocks_per_cluster, 1725 - XBF_UNMAPPED, &bp); 1724 + mp->m_bsize * igeo->blocks_per_cluster, 0, &bp); 1726 1725 if (error) 1727 1726 return error; 1728 1727 ··· 3073 3074 xfs_is_always_cow_inode( 3074 3075 const struct xfs_inode *ip) 3075 3076 { 3076 - return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); 3077 + return xfs_is_zoned_inode(ip) || 3078 + (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount)); 3077 3079 }
+14 -14
fs/xfs/xfs_inode.h
··· 25 25 typedef struct xfs_inode { 26 26 /* Inode linking and identification information. */ 27 27 struct xfs_mount *i_mount; /* fs mount struct ptr */ 28 - union { 29 - struct { 30 - struct xfs_dquot *i_udquot; /* user dquot */ 31 - struct xfs_dquot *i_gdquot; /* group dquot */ 32 - struct xfs_dquot *i_pdquot; /* project dquot */ 33 - }; 34 - 35 - /* 36 - * Space that has been set aside to accomodate expansions of a 37 - * metadata btree rooted in this file. 38 - */ 39 - uint64_t i_meta_resv_asked; 40 - }; 28 + struct xfs_dquot *i_udquot; /* user dquot */ 29 + struct xfs_dquot *i_gdquot; /* group dquot */ 30 + struct xfs_dquot *i_pdquot; /* project dquot */ 41 31 42 32 /* Inode location stuff */ 43 33 xfs_ino_t i_ino; /* inode number (agno/agino)*/ ··· 59 69 xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ 60 70 prid_t i_projid; /* owner's project id */ 61 71 xfs_extlen_t i_extsize; /* basic/minimum extent size */ 62 - /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ 72 + /* 73 + * i_used_blocks is used for zoned rtrmap inodes, 74 + * i_cowextsize is used for other v3 inodes, 75 + * i_flushiter for v1/2 inodes 76 + */ 63 77 union { 78 + uint32_t i_used_blocks; /* used blocks in RTG */ 64 79 xfs_extlen_t i_cowextsize; /* basic cow extent size */ 65 80 uint16_t i_flushiter; /* incremented on flush */ 66 81 }; ··· 302 307 return ip->i_ino == mp->m_sb.sb_rbmino || 303 308 ip->i_ino == mp->m_sb.sb_rsumino || 304 309 xfs_is_quota_inode(&mp->m_sb, ip->i_ino); 310 + } 311 + 312 + static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip) 313 + { 314 + return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip); 305 315 } 306 316 307 317 bool xfs_is_always_cow_inode(const struct xfs_inode *ip);
+1
fs/xfs/xfs_inode_item.c
··· 596 596 to->di_changecount = inode_peek_iversion(inode); 597 597 to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); 598 598 to->di_flags2 = ip->i_diflags2; 599 + /* also covers the di_used_blocks union arm: */ 599 600 to->di_cowextsize = ip->i_cowextsize; 600 601 to->di_ino = ip->i_ino; 601 602 to->di_lsn = lsn;
+1
fs/xfs/xfs_inode_item_recover.c
··· 203 203 to->di_crtime = xfs_log_dinode_to_disk_ts(from, 204 204 from->di_crtime); 205 205 to->di_flags2 = cpu_to_be64(from->di_flags2); 206 + /* also covers the di_used_blocks union arm: */ 206 207 to->di_cowextsize = cpu_to_be32(from->di_cowextsize); 207 208 to->di_ino = cpu_to_be64(from->di_ino); 208 209 to->di_lsn = cpu_to_be64(lsn);
+6 -6
fs/xfs/xfs_ioctl.c
··· 1131 1131 error = mnt_want_write_file(filp); 1132 1132 if (error) 1133 1133 return error; 1134 - error = xfs_reserve_blocks(mp, fsop.resblks); 1134 + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks); 1135 1135 mnt_drop_write_file(filp); 1136 1136 if (error) 1137 1137 return error; 1138 1138 } 1139 1139 1140 1140 spin_lock(&mp->m_sb_lock); 1141 - fsop.resblks = mp->m_resblks; 1142 - fsop.resblks_avail = mp->m_resblks_avail; 1141 + fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total; 1142 + fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail; 1143 1143 spin_unlock(&mp->m_sb_lock); 1144 1144 1145 1145 if (copy_to_user(arg, &fsop, sizeof(fsop))) ··· 1155 1155 struct xfs_fsop_counts out = { 1156 1156 .allocino = percpu_counter_read_positive(&mp->m_icount), 1157 1157 .freeino = percpu_counter_read_positive(&mp->m_ifree), 1158 - .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - 1159 - xfs_fdblocks_unavailable(mp), 1160 - .freertx = percpu_counter_read_positive(&mp->m_frextents), 1158 + .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) - 1159 + xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS), 1160 + .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS), 1161 1161 }; 1162 1162 1163 1163 if (copy_to_user(uarg, &out, sizeof(out)))
+521 -7
fs/xfs/xfs_iomap.c
··· 30 30 #include "xfs_reflink.h" 31 31 #include "xfs_health.h" 32 32 #include "xfs_rtbitmap.h" 33 + #include "xfs_icache.h" 34 + #include "xfs_zone_alloc.h" 33 35 34 36 #define XFS_ALLOC_ALIGN(mp, off) \ 35 37 (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) ··· 433 431 434 432 static int64_t 435 433 xfs_iomap_freesp( 436 - struct percpu_counter *counter, 434 + struct xfs_mount *mp, 435 + unsigned int idx, 437 436 uint64_t low_space[XFS_LOWSP_MAX], 438 437 int *shift) 439 438 { 440 439 int64_t freesp; 441 440 442 - freesp = percpu_counter_read_positive(counter); 441 + freesp = xfs_estimate_freecounter(mp, idx); 443 442 if (freesp < low_space[XFS_LOWSP_5_PCNT]) { 444 443 *shift = 2; 445 444 if (freesp < low_space[XFS_LOWSP_4_PCNT]) ··· 539 536 540 537 if (unlikely(XFS_IS_REALTIME_INODE(ip))) 541 538 freesp = xfs_rtbxlen_to_blen(mp, 542 - xfs_iomap_freesp(&mp->m_frextents, 539 + xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS, 543 540 mp->m_low_rtexts, &shift)); 544 541 else 545 - freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, 542 + freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space, 546 543 &shift); 547 544 548 545 /* ··· 969 966 .iomap_begin = xfs_direct_write_iomap_begin, 970 967 }; 971 968 969 + #ifdef CONFIG_XFS_RT 970 + /* 971 + * This is really simple. The space has already been reserved before taking the 972 + * IOLOCK, the actual block allocation is done just before submitting the bio 973 + * and only recorded in the extent map on I/O completion. 974 + */ 975 + static int 976 + xfs_zoned_direct_write_iomap_begin( 977 + struct inode *inode, 978 + loff_t offset, 979 + loff_t length, 980 + unsigned flags, 981 + struct iomap *iomap, 982 + struct iomap *srcmap) 983 + { 984 + struct xfs_inode *ip = XFS_I(inode); 985 + int error; 986 + 987 + ASSERT(!(flags & IOMAP_OVERWRITE_ONLY)); 988 + 989 + /* 990 + * Needs to be pushed down into the allocator so that only writes into 991 + * a single zone can be supported. 
992 + */ 993 + if (flags & IOMAP_NOWAIT) 994 + return -EAGAIN; 995 + 996 + /* 997 + * Ensure the extent list is in memory in so that we don't have to do 998 + * read it from the I/O completion handler. 999 + */ 1000 + if (xfs_need_iread_extents(&ip->i_df)) { 1001 + xfs_ilock(ip, XFS_ILOCK_EXCL); 1002 + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); 1003 + xfs_iunlock(ip, XFS_ILOCK_EXCL); 1004 + if (error) 1005 + return error; 1006 + } 1007 + 1008 + iomap->type = IOMAP_MAPPED; 1009 + iomap->flags = IOMAP_F_DIRTY; 1010 + iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev; 1011 + iomap->offset = offset; 1012 + iomap->length = length; 1013 + iomap->flags = IOMAP_F_ANON_WRITE; 1014 + return 0; 1015 + } 1016 + 1017 + const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { 1018 + .iomap_begin = xfs_zoned_direct_write_iomap_begin, 1019 + }; 1020 + #endif /* CONFIG_XFS_RT */ 1021 + 972 1022 static int 973 1023 xfs_dax_write_iomap_end( 974 1024 struct inode *inode, ··· 1046 990 .iomap_begin = xfs_direct_write_iomap_begin, 1047 991 .iomap_end = xfs_dax_write_iomap_end, 1048 992 }; 993 + 994 + /* 995 + * Convert a hole to a delayed allocation. 
996 + */ 997 + static void 998 + xfs_bmap_add_extent_hole_delay( 999 + struct xfs_inode *ip, /* incore inode pointer */ 1000 + int whichfork, 1001 + struct xfs_iext_cursor *icur, 1002 + struct xfs_bmbt_irec *new) /* new data to add to file extents */ 1003 + { 1004 + struct xfs_ifork *ifp; /* inode fork pointer */ 1005 + xfs_bmbt_irec_t left; /* left neighbor extent entry */ 1006 + xfs_filblks_t newlen=0; /* new indirect size */ 1007 + xfs_filblks_t oldlen=0; /* old indirect size */ 1008 + xfs_bmbt_irec_t right; /* right neighbor extent entry */ 1009 + uint32_t state = xfs_bmap_fork_to_state(whichfork); 1010 + xfs_filblks_t temp; /* temp for indirect calculations */ 1011 + 1012 + ifp = xfs_ifork_ptr(ip, whichfork); 1013 + ASSERT(isnullstartblock(new->br_startblock)); 1014 + 1015 + /* 1016 + * Check and set flags if this segment has a left neighbor 1017 + */ 1018 + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { 1019 + state |= BMAP_LEFT_VALID; 1020 + if (isnullstartblock(left.br_startblock)) 1021 + state |= BMAP_LEFT_DELAY; 1022 + } 1023 + 1024 + /* 1025 + * Check and set flags if the current (right) segment exists. 1026 + * If it doesn't exist, we're converting the hole at end-of-file. 1027 + */ 1028 + if (xfs_iext_get_extent(ifp, icur, &right)) { 1029 + state |= BMAP_RIGHT_VALID; 1030 + if (isnullstartblock(right.br_startblock)) 1031 + state |= BMAP_RIGHT_DELAY; 1032 + } 1033 + 1034 + /* 1035 + * Set contiguity flags on the left and right neighbors. 1036 + * Don't let extents get too large, even if the pieces are contiguous. 
1037 + */ 1038 + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && 1039 + left.br_startoff + left.br_blockcount == new->br_startoff && 1040 + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) 1041 + state |= BMAP_LEFT_CONTIG; 1042 + 1043 + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && 1044 + new->br_startoff + new->br_blockcount == right.br_startoff && 1045 + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && 1046 + (!(state & BMAP_LEFT_CONTIG) || 1047 + (left.br_blockcount + new->br_blockcount + 1048 + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) 1049 + state |= BMAP_RIGHT_CONTIG; 1050 + 1051 + /* 1052 + * Switch out based on the contiguity flags. 1053 + */ 1054 + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { 1055 + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: 1056 + /* 1057 + * New allocation is contiguous with delayed allocations 1058 + * on the left and on the right. 1059 + * Merge all three into a single extent record. 1060 + */ 1061 + temp = left.br_blockcount + new->br_blockcount + 1062 + right.br_blockcount; 1063 + 1064 + oldlen = startblockval(left.br_startblock) + 1065 + startblockval(new->br_startblock) + 1066 + startblockval(right.br_startblock); 1067 + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1068 + oldlen); 1069 + left.br_startblock = nullstartblock(newlen); 1070 + left.br_blockcount = temp; 1071 + 1072 + xfs_iext_remove(ip, icur, state); 1073 + xfs_iext_prev(ifp, icur); 1074 + xfs_iext_update_extent(ip, state, icur, &left); 1075 + break; 1076 + 1077 + case BMAP_LEFT_CONTIG: 1078 + /* 1079 + * New allocation is contiguous with a delayed allocation 1080 + * on the left. 1081 + * Merge the new allocation with the left neighbor. 
1082 + */ 1083 + temp = left.br_blockcount + new->br_blockcount; 1084 + 1085 + oldlen = startblockval(left.br_startblock) + 1086 + startblockval(new->br_startblock); 1087 + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1088 + oldlen); 1089 + left.br_blockcount = temp; 1090 + left.br_startblock = nullstartblock(newlen); 1091 + 1092 + xfs_iext_prev(ifp, icur); 1093 + xfs_iext_update_extent(ip, state, icur, &left); 1094 + break; 1095 + 1096 + case BMAP_RIGHT_CONTIG: 1097 + /* 1098 + * New allocation is contiguous with a delayed allocation 1099 + * on the right. 1100 + * Merge the new allocation with the right neighbor. 1101 + */ 1102 + temp = new->br_blockcount + right.br_blockcount; 1103 + oldlen = startblockval(new->br_startblock) + 1104 + startblockval(right.br_startblock); 1105 + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1106 + oldlen); 1107 + right.br_startoff = new->br_startoff; 1108 + right.br_startblock = nullstartblock(newlen); 1109 + right.br_blockcount = temp; 1110 + xfs_iext_update_extent(ip, state, icur, &right); 1111 + break; 1112 + 1113 + case 0: 1114 + /* 1115 + * New allocation is not contiguous with another 1116 + * delayed allocation. 1117 + * Insert a new entry. 1118 + */ 1119 + oldlen = newlen = 0; 1120 + xfs_iext_insert(ip, icur, new, state); 1121 + break; 1122 + } 1123 + if (oldlen != newlen) { 1124 + ASSERT(oldlen > newlen); 1125 + xfs_add_fdblocks(ip->i_mount, oldlen - newlen); 1126 + 1127 + /* 1128 + * Nothing to do for disk quota accounting here. 1129 + */ 1130 + xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); 1131 + } 1132 + } 1133 + 1134 + /* 1135 + * Add a delayed allocation extent to an inode. Blocks are reserved from the 1136 + * global pool and the extent inserted into the inode in-core extent tree. 1137 + * 1138 + * On entry, got refers to the first extent beyond the offset of the extent to 1139 + * allocate or eof is specified if no such extent exists. 
On return, got refers 1140 + * to the extent record that was inserted to the inode fork. 1141 + * 1142 + * Note that the allocated extent may have been merged with contiguous extents 1143 + * during insertion into the inode fork. Thus, got does not reflect the current 1144 + * state of the inode fork on return. If necessary, the caller can use lastx to 1145 + * look up the updated record in the inode fork. 1146 + */ 1147 + static int 1148 + xfs_bmapi_reserve_delalloc( 1149 + struct xfs_inode *ip, 1150 + int whichfork, 1151 + xfs_fileoff_t off, 1152 + xfs_filblks_t len, 1153 + xfs_filblks_t prealloc, 1154 + struct xfs_bmbt_irec *got, 1155 + struct xfs_iext_cursor *icur, 1156 + int eof) 1157 + { 1158 + struct xfs_mount *mp = ip->i_mount; 1159 + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 1160 + xfs_extlen_t alen; 1161 + xfs_extlen_t indlen; 1162 + uint64_t fdblocks; 1163 + int error; 1164 + xfs_fileoff_t aoff; 1165 + bool use_cowextszhint = 1166 + whichfork == XFS_COW_FORK && !prealloc; 1167 + 1168 + retry: 1169 + /* 1170 + * Cap the alloc length. Keep track of prealloc so we know whether to 1171 + * tag the inode before we return. 1172 + */ 1173 + aoff = off; 1174 + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); 1175 + if (!eof) 1176 + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); 1177 + if (prealloc && alen >= len) 1178 + prealloc = alen - len; 1179 + 1180 + /* 1181 + * If we're targetting the COW fork but aren't creating a speculative 1182 + * posteof preallocation, try to expand the reservation to align with 1183 + * the COW extent size hint if there's sufficient free space. 1184 + * 1185 + * Unlike the data fork, the CoW cancellation functions will free all 1186 + * the reservations at inactivation, so we don't require that every 1187 + * delalloc reservation have a dirty pagecache. 
1188 + */ 1189 + if (use_cowextszhint) { 1190 + struct xfs_bmbt_irec prev; 1191 + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); 1192 + 1193 + if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) 1194 + prev.br_startoff = NULLFILEOFF; 1195 + 1196 + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, 1197 + 1, 0, &aoff, &alen); 1198 + ASSERT(!error); 1199 + } 1200 + 1201 + /* 1202 + * Make a transaction-less quota reservation for delayed allocation 1203 + * blocks. This number gets adjusted later. We return if we haven't 1204 + * allocated blocks already inside this loop. 1205 + */ 1206 + error = xfs_quota_reserve_blkres(ip, alen); 1207 + if (error) 1208 + goto out; 1209 + 1210 + /* 1211 + * Split changing sb for alen and indlen since they could be coming 1212 + * from different places. 1213 + */ 1214 + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); 1215 + ASSERT(indlen > 0); 1216 + 1217 + fdblocks = indlen; 1218 + if (XFS_IS_REALTIME_INODE(ip)) { 1219 + ASSERT(!xfs_is_zoned_inode(ip)); 1220 + error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); 1221 + if (error) 1222 + goto out_unreserve_quota; 1223 + } else { 1224 + fdblocks += alen; 1225 + } 1226 + 1227 + error = xfs_dec_fdblocks(mp, fdblocks, false); 1228 + if (error) 1229 + goto out_unreserve_frextents; 1230 + 1231 + ip->i_delayed_blks += alen; 1232 + xfs_mod_delalloc(ip, alen, indlen); 1233 + 1234 + got->br_startoff = aoff; 1235 + got->br_startblock = nullstartblock(indlen); 1236 + got->br_blockcount = alen; 1237 + got->br_state = XFS_EXT_NORM; 1238 + 1239 + xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); 1240 + 1241 + /* 1242 + * Tag the inode if blocks were preallocated. Note that COW fork 1243 + * preallocation can occur at the start or end of the extent, even when 1244 + * prealloc == 0, so we must also check the aligned offset and length. 
1245 + */ 1246 + if (whichfork == XFS_DATA_FORK && prealloc) 1247 + xfs_inode_set_eofblocks_tag(ip); 1248 + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) 1249 + xfs_inode_set_cowblocks_tag(ip); 1250 + 1251 + return 0; 1252 + 1253 + out_unreserve_frextents: 1254 + if (XFS_IS_REALTIME_INODE(ip)) 1255 + xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); 1256 + out_unreserve_quota: 1257 + if (XFS_IS_QUOTA_ON(mp)) 1258 + xfs_quota_unreserve_blkres(ip, alen); 1259 + out: 1260 + if (error == -ENOSPC || error == -EDQUOT) { 1261 + trace_xfs_delalloc_enospc(ip, off, len); 1262 + 1263 + if (prealloc || use_cowextszhint) { 1264 + /* retry without any preallocation */ 1265 + use_cowextszhint = false; 1266 + prealloc = 0; 1267 + goto retry; 1268 + } 1269 + } 1270 + return error; 1271 + } 1272 + 1273 + static int 1274 + xfs_zoned_buffered_write_iomap_begin( 1275 + struct inode *inode, 1276 + loff_t offset, 1277 + loff_t count, 1278 + unsigned flags, 1279 + struct iomap *iomap, 1280 + struct iomap *srcmap) 1281 + { 1282 + struct iomap_iter *iter = 1283 + container_of(iomap, struct iomap_iter, iomap); 1284 + struct xfs_zone_alloc_ctx *ac = iter->private; 1285 + struct xfs_inode *ip = XFS_I(inode); 1286 + struct xfs_mount *mp = ip->i_mount; 1287 + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 1288 + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); 1289 + u16 iomap_flags = IOMAP_F_SHARED; 1290 + unsigned int lockmode = XFS_ILOCK_EXCL; 1291 + xfs_filblks_t count_fsb; 1292 + xfs_extlen_t indlen; 1293 + struct xfs_bmbt_irec got; 1294 + struct xfs_iext_cursor icur; 1295 + int error = 0; 1296 + 1297 + ASSERT(!xfs_get_extsz_hint(ip)); 1298 + ASSERT(!(flags & IOMAP_UNSHARE)); 1299 + ASSERT(ac); 1300 + 1301 + if (xfs_is_shutdown(mp)) 1302 + return -EIO; 1303 + 1304 + error = xfs_qm_dqattach(ip); 1305 + if (error) 1306 + return error; 1307 + 1308 + error = xfs_ilock_for_iomap(ip, flags, &lockmode); 1309 + if (error) 1310 + return error; 1311 
+ 1312 + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || 1313 + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { 1314 + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); 1315 + error = -EFSCORRUPTED; 1316 + goto out_unlock; 1317 + } 1318 + 1319 + XFS_STATS_INC(mp, xs_blk_mapw); 1320 + 1321 + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); 1322 + if (error) 1323 + goto out_unlock; 1324 + 1325 + /* 1326 + * For zeroing operations check if there is any data to zero first. 1327 + * 1328 + * For regular writes we always need to allocate new blocks, but need to 1329 + * provide the source mapping when the range is unaligned to support 1330 + * read-modify-write of the whole block in the page cache. 1331 + * 1332 + * In either case we need to limit the reported range to the boundaries 1333 + * of the source map in the data fork. 1334 + */ 1335 + if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) || 1336 + !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) || 1337 + (flags & IOMAP_ZERO)) { 1338 + struct xfs_bmbt_irec smap; 1339 + struct xfs_iext_cursor scur; 1340 + 1341 + if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur, 1342 + &smap)) 1343 + smap.br_startoff = end_fsb; /* fake hole until EOF */ 1344 + if (smap.br_startoff > offset_fsb) { 1345 + /* 1346 + * We never need to allocate blocks for zeroing a hole. 
1347 + */ 1348 + if (flags & IOMAP_ZERO) { 1349 + xfs_hole_to_iomap(ip, iomap, offset_fsb, 1350 + smap.br_startoff); 1351 + goto out_unlock; 1352 + } 1353 + end_fsb = min(end_fsb, smap.br_startoff); 1354 + } else { 1355 + end_fsb = min(end_fsb, 1356 + smap.br_startoff + smap.br_blockcount); 1357 + xfs_trim_extent(&smap, offset_fsb, 1358 + end_fsb - offset_fsb); 1359 + error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0, 1360 + xfs_iomap_inode_sequence(ip, 0)); 1361 + if (error) 1362 + goto out_unlock; 1363 + } 1364 + } 1365 + 1366 + if (!ip->i_cowfp) 1367 + xfs_ifork_init_cow(ip); 1368 + 1369 + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) 1370 + got.br_startoff = end_fsb; 1371 + if (got.br_startoff <= offset_fsb) { 1372 + trace_xfs_reflink_cow_found(ip, &got); 1373 + goto done; 1374 + } 1375 + 1376 + /* 1377 + * Cap the maximum length to keep the chunks of work done here somewhat 1378 + * symmetric with the work writeback does. 1379 + */ 1380 + end_fsb = min(end_fsb, got.br_startoff); 1381 + count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN, 1382 + XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); 1383 + 1384 + /* 1385 + * The block reservation is supposed to cover all blocks that the 1386 + * operation could possible write, but there is a nasty corner case 1387 + * where blocks could be stolen from underneath us: 1388 + * 1389 + * 1) while this thread iterates over a larger buffered write, 1390 + * 2) another thread is causing a write fault that calls into 1391 + * ->page_mkwrite in range this thread writes to, using up the 1392 + * delalloc reservation created by a previous call to this function. 1393 + * 3) another thread does direct I/O on the range that the write fault 1394 + * happened on, which causes writeback of the dirty data. 
1395 + * 4) this then set the stale flag, which cuts the current iomap 1396 + * iteration short, causing the new call to ->iomap_begin that gets 1397 + * us here again, but now without a sufficient reservation. 1398 + * 1399 + * This is a very unusual I/O pattern, and nothing but generic/095 is 1400 + * known to hit it. There's not really much we can do here, so turn this 1401 + * into a short write. 1402 + */ 1403 + if (count_fsb > ac->reserved_blocks) { 1404 + xfs_warn_ratelimited(mp, 1405 + "Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O", 1406 + ip->i_ino, current->comm); 1407 + count_fsb = ac->reserved_blocks; 1408 + if (!count_fsb) { 1409 + error = -EIO; 1410 + goto out_unlock; 1411 + } 1412 + } 1413 + 1414 + error = xfs_quota_reserve_blkres(ip, count_fsb); 1415 + if (error) 1416 + goto out_unlock; 1417 + 1418 + indlen = xfs_bmap_worst_indlen(ip, count_fsb); 1419 + error = xfs_dec_fdblocks(mp, indlen, false); 1420 + if (error) 1421 + goto out_unlock; 1422 + ip->i_delayed_blks += count_fsb; 1423 + xfs_mod_delalloc(ip, count_fsb, indlen); 1424 + 1425 + got.br_startoff = offset_fsb; 1426 + got.br_startblock = nullstartblock(indlen); 1427 + got.br_blockcount = count_fsb; 1428 + got.br_state = XFS_EXT_NORM; 1429 + xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got); 1430 + ac->reserved_blocks -= count_fsb; 1431 + iomap_flags |= IOMAP_F_NEW; 1432 + 1433 + trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb), 1434 + XFS_COW_FORK, &got); 1435 + done: 1436 + error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags, 1437 + xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED)); 1438 + out_unlock: 1439 + xfs_iunlock(ip, lockmode); 1440 + return error; 1441 + } 1049 1442 1050 1443 static int 1051 1444 xfs_buffered_write_iomap_begin( ··· 1521 1016 1522 1017 if (xfs_is_shutdown(mp)) 1523 1018 return -EIO; 1019 + 1020 + if (xfs_is_zoned_inode(ip)) 1021 + return xfs_zoned_buffered_write_iomap_begin(inode, offset, 
1022 + count, flags, iomap, srcmap); 1524 1023 1525 1024 /* we can't use delayed allocations when using extent size hints */ 1526 1025 if (xfs_get_extsz_hint(ip)) ··· 1758 1249 loff_t length, 1759 1250 struct iomap *iomap) 1760 1251 { 1252 + struct iomap_iter *iter = 1253 + container_of(iomap, struct iomap_iter, iomap); 1254 + 1761 1255 xfs_bmap_punch_delalloc_range(XFS_I(inode), 1762 1256 (iomap->flags & IOMAP_F_SHARED) ? 1763 1257 XFS_COW_FORK : XFS_DATA_FORK, 1764 - offset, offset + length); 1258 + offset, offset + length, iter->private); 1765 1259 } 1766 1260 1767 1261 static int ··· 2001 1489 struct xfs_inode *ip, 2002 1490 loff_t pos, 2003 1491 loff_t len, 1492 + struct xfs_zone_alloc_ctx *ac, 2004 1493 bool *did_zero) 2005 1494 { 2006 1495 struct inode *inode = VFS_I(ip); ··· 2012 1499 return dax_zero_range(inode, pos, len, did_zero, 2013 1500 &xfs_dax_write_iomap_ops); 2014 1501 return iomap_zero_range(inode, pos, len, did_zero, 2015 - &xfs_buffered_write_iomap_ops, NULL); 1502 + &xfs_buffered_write_iomap_ops, ac); 2016 1503 } 2017 1504 2018 1505 int 2019 1506 xfs_truncate_page( 2020 1507 struct xfs_inode *ip, 2021 1508 loff_t pos, 1509 + struct xfs_zone_alloc_ctx *ac, 2022 1510 bool *did_zero) 2023 1511 { 2024 1512 struct inode *inode = VFS_I(ip); ··· 2028 1514 return dax_truncate_page(inode, pos, did_zero, 2029 1515 &xfs_dax_write_iomap_ops); 2030 1516 return iomap_truncate_page(inode, pos, did_zero, 2031 - &xfs_buffered_write_iomap_ops, NULL); 1517 + &xfs_buffered_write_iomap_ops, ac); 2032 1518 }
+5 -2
fs/xfs/xfs_iomap.h
··· 10 10 11 11 struct xfs_inode; 12 12 struct xfs_bmbt_irec; 13 + struct xfs_zone_alloc_ctx; 13 14 14 15 int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, 15 16 xfs_fileoff_t count_fsb, unsigned int flags, ··· 25 24 u16 iomap_flags, u64 sequence_cookie); 26 25 27 26 int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, 28 - bool *did_zero); 29 - int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); 27 + struct xfs_zone_alloc_ctx *ac, bool *did_zero); 28 + int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, 29 + struct xfs_zone_alloc_ctx *ac, bool *did_zero); 30 30 31 31 static inline xfs_filblks_t 32 32 xfs_aligned_fsb_count( ··· 51 49 52 50 extern const struct iomap_ops xfs_buffered_write_iomap_ops; 53 51 extern const struct iomap_ops xfs_direct_write_iomap_ops; 52 + extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops; 54 53 extern const struct iomap_ops xfs_read_iomap_ops; 55 54 extern const struct iomap_ops xfs_seek_iomap_ops; 56 55 extern const struct iomap_ops xfs_xattr_iomap_ops;
+29 -2
fs/xfs/xfs_iops.c
··· 29 29 #include "xfs_xattr.h" 30 30 #include "xfs_file.h" 31 31 #include "xfs_bmap.h" 32 + #include "xfs_zone_alloc.h" 32 33 33 34 #include <linux/posix_acl.h> 34 35 #include <linux/security.h> ··· 855 854 uint lock_flags = 0; 856 855 uint resblks = 0; 857 856 bool did_zeroing = false; 857 + struct xfs_zone_alloc_ctx ac = { }; 858 858 859 859 xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); 860 860 ASSERT(S_ISREG(inode->i_mode)); ··· 892 890 inode_dio_wait(inode); 893 891 894 892 /* 893 + * Normally xfs_zoned_space_reserve is supposed to be called outside the 894 + * IOLOCK. For truncate we can't do that since ->setattr is called with 895 + * it already held by the VFS. So for now chicken out and try to 896 + * allocate space under it. 897 + * 898 + * To avoid deadlocks this means we can't block waiting for space, which 899 + * can lead to spurious -ENOSPC if there are no directly available 900 + * blocks. We mitigate this a bit by allowing zeroing to dip into the 901 + * reserved pool, but eventually the VFS calling convention needs to 902 + * change. 903 + */ 904 + if (xfs_is_zoned_inode(ip)) { 905 + error = xfs_zoned_space_reserve(ip, 1, 906 + XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); 907 + if (error) { 908 + if (error == -EAGAIN) 909 + return -ENOSPC; 910 + return error; 911 + } 912 + } 913 + 914 + /* 895 915 * File data changes must be complete before we start the transaction to 896 916 * modify the inode. 
This needs to be done before joining the inode to 897 917 * the transaction because the inode cannot be unlocked once it is a ··· 926 902 if (newsize > oldsize) { 927 903 trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); 928 904 error = xfs_zero_range(ip, oldsize, newsize - oldsize, 929 - &did_zeroing); 905 + &ac, &did_zeroing); 930 906 } else { 931 - error = xfs_truncate_page(ip, newsize, &did_zeroing); 907 + error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing); 932 908 } 909 + 910 + if (xfs_is_zoned_inode(ip)) 911 + xfs_zoned_space_unreserve(ip, &ac); 933 912 934 913 if (error) 935 914 return error;
+4
fs/xfs/xfs_log.c
··· 20 20 #include "xfs_sysfs.h" 21 21 #include "xfs_sb.h" 22 22 #include "xfs_health.h" 23 + #include "xfs_zone_alloc.h" 23 24 24 25 struct kmem_cache *xfs_log_ticket_cache; 25 26 ··· 3541 3540 spin_unlock(&log->l_icloglock); 3542 3541 3543 3542 wake_up_var(&log->l_opstate); 3543 + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp)) 3544 + xfs_zoned_wake_all(log->l_mp); 3545 + 3544 3546 return log_error; 3545 3547 } 3546 3548
+4
fs/xfs/xfs_message.c
··· 173 173 .opstate = XFS_OPSTATE_WARNED_METADIR, 174 174 .name = "metadata directory tree", 175 175 }, 176 + [XFS_EXPERIMENTAL_ZONED] = { 177 + .opstate = XFS_OPSTATE_WARNED_ZONED, 178 + .name = "zoned RT device", 179 + }, 176 180 }; 177 181 ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX); 178 182 BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);
+1
fs/xfs/xfs_message.h
··· 99 99 XFS_EXPERIMENTAL_EXCHRANGE, 100 100 XFS_EXPERIMENTAL_PPTR, 101 101 XFS_EXPERIMENTAL_METADIR, 102 + XFS_EXPERIMENTAL_ZONED, 102 103 103 104 XFS_EXPERIMENTAL_MAX, 104 105 };
+126 -92
fs/xfs/xfs_mount.c
··· 40 40 #include "xfs_rtrmap_btree.h" 41 41 #include "xfs_rtrefcount_btree.h" 42 42 #include "scrub/stats.h" 43 + #include "xfs_zone_alloc.h" 43 44 44 45 static DEFINE_MUTEX(xfs_uuid_table_mutex); 45 46 static int xfs_uuid_table_size; ··· 186 185 */ 187 186 reread: 188 187 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 189 - BTOBB(sector_size), 0, &bp, buf_ops); 188 + BTOBB(sector_size), &bp, buf_ops); 190 189 if (error) { 191 190 if (loud) 192 191 xfs_warn(mp, "SB validate failed with error %d.", error); ··· 414 413 } 415 414 error = xfs_buf_read_uncached(mp->m_ddev_targp, 416 415 d - XFS_FSS_TO_BB(mp, 1), 417 - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); 416 + XFS_FSS_TO_BB(mp, 1), &bp, NULL); 418 417 if (error) { 419 418 xfs_warn(mp, "last sector read failed"); 420 419 return error; ··· 431 430 } 432 431 error = xfs_buf_read_uncached(mp->m_logdev_targp, 433 432 d - XFS_FSB_TO_BB(mp, 1), 434 - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); 433 + XFS_FSB_TO_BB(mp, 1), &bp, NULL); 435 434 if (error) { 436 435 xfs_warn(mp, "log device read failed"); 437 436 return error; ··· 462 461 return xfs_sync_sb(mp, false); 463 462 } 464 463 465 - uint64_t 466 - xfs_default_resblks(xfs_mount_t *mp) 467 - { 468 - uint64_t resblks; 464 + static const char *const xfs_free_pool_name[] = { 465 + [XC_FREE_BLOCKS] = "free blocks", 466 + [XC_FREE_RTEXTENTS] = "free rt extents", 467 + [XC_FREE_RTAVAILABLE] = "available rt extents", 468 + }; 469 469 470 - /* 471 - * We default to 5% or 8192 fsbs of space reserved, whichever is 472 - * smaller. This is intended to cover concurrent allocation 473 - * transactions when we initially hit enospc. These each require a 4 474 - * block reservation. Hence by default we cover roughly 2000 concurrent 475 - * allocation reservations. 
476 - */ 477 - resblks = mp->m_sb.sb_dblocks; 478 - do_div(resblks, 20); 479 - resblks = min_t(uint64_t, resblks, 8192); 480 - return resblks; 470 + uint64_t 471 + xfs_default_resblks( 472 + struct xfs_mount *mp, 473 + enum xfs_free_counter ctr) 474 + { 475 + switch (ctr) { 476 + case XC_FREE_BLOCKS: 477 + /* 478 + * Default to 5% or 8192 FSBs of space reserved, whichever is 479 + * smaller. 480 + * 481 + * This is intended to cover concurrent allocation transactions 482 + * when we initially hit ENOSPC. These each require a 4 block 483 + * reservation. Hence by default we cover roughly 2000 484 + * concurrent allocation reservations. 485 + */ 486 + return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL); 487 + case XC_FREE_RTEXTENTS: 488 + case XC_FREE_RTAVAILABLE: 489 + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) 490 + return xfs_zoned_default_resblks(mp, ctr); 491 + return 0; 492 + default: 493 + ASSERT(0); 494 + return 0; 495 + } 481 496 } 482 497 483 498 /* Ensure the summary counts are correct. */ ··· 560 543 * If we're mounting the rt volume after recovering the log, recompute 561 544 * frextents from the rtbitmap file to fix the inconsistency. 
562 545 */ 563 - if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { 546 + if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) { 564 547 error = xfs_rtalloc_reinit_frextents(mp); 565 548 if (error) 566 549 return error; ··· 695 678 uint quotamount = 0; 696 679 uint quotaflags = 0; 697 680 int error = 0; 681 + int i; 698 682 699 683 xfs_sb_mount_common(mp, sbp); 700 684 ··· 765 747 /* enable fail_at_unmount as default */ 766 748 mp->m_fail_unmount = true; 767 749 768 - super_set_sysfs_name_id(mp->m_super); 769 - 770 - error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, 771 - NULL, mp->m_super->s_id); 772 - if (error) 773 - goto out; 774 - 775 - error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, 776 - &mp->m_kobj, "stats"); 777 - if (error) 778 - goto out_remove_sysfs; 779 - 780 - xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); 781 - 782 - error = xfs_error_sysfs_init(mp); 750 + error = xfs_mount_sysfs_init(mp); 783 751 if (error) 784 752 goto out_remove_scrub_stats; 785 753 754 + xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); 755 + 786 756 error = xfs_errortag_init(mp); 787 757 if (error) 788 - goto out_remove_error_sysfs; 758 + goto out_remove_sysfs; 789 759 790 760 error = xfs_uuid_mount(mp); 791 761 if (error) ··· 1037 1031 if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) 1038 1032 xfs_log_clean(mp); 1039 1033 1034 + if (xfs_has_zoned(mp)) { 1035 + error = xfs_mount_zones(mp); 1036 + if (error) 1037 + goto out_rtunmount; 1038 + } 1039 + 1040 1040 /* 1041 1041 * Complete the quota initialisation, post-log-replay component. 1042 1042 */ ··· 1058 1046 * privileged transactions. This is needed so that transaction 1059 1047 * space required for critical operations can dip into this pool 1060 1048 * when at ENOSPC. This is needed for operations like create with 1061 - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations 1062 - * are not allowed to use this reserved space. 
1049 + * attr, unwritten extent conversion at ENOSPC, garbage collection 1050 + * etc. Data allocations are not allowed to use this reserved space. 1063 1051 * 1064 1052 * This may drive us straight to ENOSPC on mount, but that implies 1065 1053 * we were already there on the last unmount. Warn if this occurs. 1066 1054 */ 1067 1055 if (!xfs_is_readonly(mp)) { 1068 - error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); 1069 - if (error) 1070 - xfs_warn(mp, 1071 - "Unable to allocate reserve blocks. Continuing without reserve pool."); 1056 + for (i = 0; i < XC_FREE_NR; i++) { 1057 + error = xfs_reserve_blocks(mp, i, 1058 + xfs_default_resblks(mp, i)); 1059 + if (error) 1060 + xfs_warn(mp, 1061 + "Unable to allocate reserve blocks. Continuing without reserve pool for %s.", 1062 + xfs_free_pool_name[i]); 1063 + } 1072 1064 1073 1065 /* Reserve AG blocks for future btree expansion. */ 1074 1066 error = xfs_fs_reserve_ag_blocks(mp); 1075 1067 if (error && error != -ENOSPC) 1076 1068 goto out_agresv; 1069 + 1070 + xfs_zone_gc_start(mp); 1077 1071 } 1078 1072 1079 1073 return 0; ··· 1087 1069 out_agresv: 1088 1070 xfs_fs_unreserve_ag_blocks(mp); 1089 1071 xfs_qm_unmount_quotas(mp); 1072 + if (xfs_has_zoned(mp)) 1073 + xfs_unmount_zones(mp); 1090 1074 out_rtunmount: 1091 1075 xfs_rtunmount_inodes(mp); 1092 1076 out_rele_rip: ··· 1136 1116 xfs_uuid_unmount(mp); 1137 1117 out_remove_errortag: 1138 1118 xfs_errortag_del(mp); 1139 - out_remove_error_sysfs: 1140 - xfs_error_sysfs_del(mp); 1119 + out_remove_sysfs: 1120 + xfs_mount_sysfs_del(mp); 1141 1121 out_remove_scrub_stats: 1142 1122 xchk_stats_unregister(mp->m_scrub_stats); 1143 - xfs_sysfs_del(&mp->m_stats.xs_kobj); 1144 - out_remove_sysfs: 1145 - xfs_sysfs_del(&mp->m_kobj); 1146 1123 out: 1147 1124 return error; 1148 1125 } ··· 1165 1148 xfs_inodegc_flush(mp); 1166 1149 1167 1150 xfs_blockgc_stop(mp); 1151 + if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate)) 1152 + xfs_zone_gc_stop(mp); 1168 1153 
xfs_fs_unreserve_ag_blocks(mp); 1169 1154 xfs_qm_unmount_quotas(mp); 1155 + if (xfs_has_zoned(mp)) 1156 + xfs_unmount_zones(mp); 1170 1157 xfs_rtunmount_inodes(mp); 1171 1158 xfs_irele(mp->m_rootip); 1172 1159 if (mp->m_metadirip) ··· 1194 1173 * we only every apply deltas to the superblock and hence the incore 1195 1174 * value does not matter.... 1196 1175 */ 1197 - error = xfs_reserve_blocks(mp, 0); 1176 + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0); 1198 1177 if (error) 1199 1178 xfs_warn(mp, "Unable to free reserved block pool. " 1200 1179 "Freespace may not be correct on next mount."); ··· 1216 1195 xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); 1217 1196 xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); 1218 1197 xfs_errortag_del(mp); 1219 - xfs_error_sysfs_del(mp); 1220 1198 xchk_stats_unregister(mp->m_scrub_stats); 1221 - xfs_sysfs_del(&mp->m_stats.xs_kobj); 1222 - xfs_sysfs_del(&mp->m_kobj); 1199 + xfs_mount_sysfs_del(mp); 1223 1200 } 1224 1201 1225 1202 /* ··· 1239 1220 return true; 1240 1221 } 1241 1222 1223 + /* 1224 + * Estimate the amount of free space that is not available to userspace and is 1225 + * not explicitly reserved from the incore fdblocks. 
This includes: 1226 + * 1227 + * - The minimum number of blocks needed to support splitting a bmap btree 1228 + * - The blocks currently in use by the freespace btrees because they record 1229 + * the actual blocks that will fill per-AG metadata space reservations 1230 + */ 1231 + uint64_t 1232 + xfs_freecounter_unavailable( 1233 + struct xfs_mount *mp, 1234 + enum xfs_free_counter ctr) 1235 + { 1236 + if (ctr != XC_FREE_BLOCKS) 1237 + return 0; 1238 + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); 1239 + } 1240 + 1242 1241 void 1243 1242 xfs_add_freecounter( 1244 1243 struct xfs_mount *mp, 1245 - struct percpu_counter *counter, 1244 + enum xfs_free_counter ctr, 1246 1245 uint64_t delta) 1247 1246 { 1248 - bool has_resv_pool = (counter == &mp->m_fdblocks); 1247 + struct xfs_freecounter *counter = &mp->m_free[ctr]; 1249 1248 uint64_t res_used; 1250 1249 1251 1250 /* 1252 1251 * If the reserve pool is depleted, put blocks back into it first. 1253 1252 * Most of the time the pool is full. 1254 1253 */ 1255 - if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { 1256 - percpu_counter_add(counter, delta); 1254 + if (likely(counter->res_avail == counter->res_total)) { 1255 + percpu_counter_add(&counter->count, delta); 1257 1256 return; 1258 1257 } 1259 1258 1260 1259 spin_lock(&mp->m_sb_lock); 1261 - res_used = mp->m_resblks - mp->m_resblks_avail; 1260 + res_used = counter->res_total - counter->res_avail; 1262 1261 if (res_used > delta) { 1263 - mp->m_resblks_avail += delta; 1262 + counter->res_avail += delta; 1264 1263 } else { 1265 1264 delta -= res_used; 1266 - mp->m_resblks_avail = mp->m_resblks; 1267 - percpu_counter_add(counter, delta); 1265 + counter->res_avail = counter->res_total; 1266 + percpu_counter_add(&counter->count, delta); 1268 1267 } 1269 1268 spin_unlock(&mp->m_sb_lock); 1270 1269 } 1271 1270 1271 + 1272 + /* Adjust in-core free blocks or RT extents. 
*/ 1272 1273 int 1273 1274 xfs_dec_freecounter( 1274 1275 struct xfs_mount *mp, 1275 - struct percpu_counter *counter, 1276 + enum xfs_free_counter ctr, 1276 1277 uint64_t delta, 1277 1278 bool rsvd) 1278 1279 { 1279 - int64_t lcounter; 1280 - uint64_t set_aside = 0; 1280 + struct xfs_freecounter *counter = &mp->m_free[ctr]; 1281 1281 s32 batch; 1282 - bool has_resv_pool; 1283 1282 1284 - ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); 1285 - has_resv_pool = (counter == &mp->m_fdblocks); 1286 - if (rsvd) 1287 - ASSERT(has_resv_pool); 1283 + ASSERT(ctr < XC_FREE_NR); 1288 1284 1289 1285 /* 1290 1286 * Taking blocks away, need to be more accurate the closer we ··· 1309 1275 * then make everything serialise as we are real close to 1310 1276 * ENOSPC. 1311 1277 */ 1312 - if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, 1278 + if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH, 1313 1279 XFS_FDBLOCKS_BATCH) < 0) 1314 1280 batch = 1; 1315 1281 else ··· 1326 1292 * problems (i.e. transaction abort, pagecache discards, etc.) than 1327 1293 * slightly premature -ENOSPC. 1328 1294 */ 1329 - if (has_resv_pool) 1330 - set_aside = xfs_fdblocks_unavailable(mp); 1331 - percpu_counter_add_batch(counter, -((int64_t)delta), batch); 1332 - if (__percpu_counter_compare(counter, set_aside, 1333 - XFS_FDBLOCKS_BATCH) >= 0) { 1334 - /* we had space! */ 1335 - return 0; 1336 - } 1337 - 1338 - /* 1339 - * lock up the sb for dipping into reserves before releasing the space 1340 - * that took us to ENOSPC. 
1341 - */ 1342 - spin_lock(&mp->m_sb_lock); 1343 - percpu_counter_add(counter, delta); 1344 - if (!has_resv_pool || !rsvd) 1345 - goto fdblocks_enospc; 1346 - 1347 - lcounter = (long long)mp->m_resblks_avail - delta; 1348 - if (lcounter >= 0) { 1349 - mp->m_resblks_avail = lcounter; 1350 - spin_unlock(&mp->m_sb_lock); 1351 - return 0; 1352 - } 1353 - xfs_warn_once(mp, 1295 + percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch); 1296 + if (__percpu_counter_compare(&counter->count, 1297 + xfs_freecounter_unavailable(mp, ctr), 1298 + XFS_FDBLOCKS_BATCH) < 0) { 1299 + /* 1300 + * Lock up the sb for dipping into reserves before releasing the 1301 + * space that took us to ENOSPC. 1302 + */ 1303 + spin_lock(&mp->m_sb_lock); 1304 + percpu_counter_add(&counter->count, delta); 1305 + if (!rsvd) 1306 + goto fdblocks_enospc; 1307 + if (delta > counter->res_avail) { 1308 + if (ctr == XC_FREE_BLOCKS) 1309 + xfs_warn_once(mp, 1354 1310 "Reserve blocks depleted! Consider increasing reserve pool size."); 1311 + goto fdblocks_enospc; 1312 + } 1313 + counter->res_avail -= delta; 1314 + trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_); 1315 + spin_unlock(&mp->m_sb_lock); 1316 + } 1317 + 1318 + /* we had space! */ 1319 + return 0; 1355 1320 1356 1321 fdblocks_enospc: 1322 + trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_); 1357 1323 spin_unlock(&mp->m_sb_lock); 1358 1324 return -ENOSPC; 1359 1325 }
+108 -23
fs/xfs/xfs_mount.h
··· 98 98 uint8_t blklog; 99 99 100 100 /* 101 + * Zoned devices can have gaps beyond the usable capacity of a zone and 102 + * the end in the LBA/daddr address space. In other words, the hardware 103 + * equivalent to the RT groups already takes care of the power of 2 104 + * alignment for us. In this case the sparse FSB/RTB address space maps 105 + * 1:1 to the device address space. 106 + */ 107 + bool has_daddr_gaps; 108 + 109 + /* 101 110 * Mask to extract the group-relative block number from a FSB. 102 111 * For a pre-rtgroups filesystem we pretend to have one very large 103 112 * rtgroup, so this mask must be 64-bit. 104 113 */ 105 114 uint64_t blkmask; 115 + 116 + /* 117 + * Start of the first group in the device. This is used to support a 118 + * RT device following the data device on the same block device for 119 + * SMR hard drives. 120 + */ 121 + xfs_fsblock_t start_fsb; 122 + }; 123 + 124 + struct xfs_freecounter { 125 + /* free blocks for general use: */ 126 + struct percpu_counter count; 127 + 128 + /* total reserved blocks: */ 129 + uint64_t res_total; 130 + 131 + /* available reserved blocks: */ 132 + uint64_t res_avail; 133 + 134 + /* reserved blks @ remount,ro: */ 135 + uint64_t res_saved; 106 136 }; 107 137 108 138 /* ··· 228 198 bool m_fail_unmount; 229 199 bool m_finobt_nores; /* no per-AG finobt resv. */ 230 200 bool m_update_sb; /* sb needs update in mount */ 201 + unsigned int m_max_open_zones; 231 202 232 203 /* 233 204 * Bitsets of per-fs metadata that have been checked and/or are sick. 
··· 253 222 spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ 254 223 struct percpu_counter m_icount; /* allocated inodes counter */ 255 224 struct percpu_counter m_ifree; /* free inodes counter */ 256 - struct percpu_counter m_fdblocks; /* free block counter */ 257 - struct percpu_counter m_frextents; /* free rt extent counter */ 225 + 226 + struct xfs_freecounter m_free[XC_FREE_NR]; 258 227 259 228 /* 260 229 * Count of data device blocks reserved for delayed allocations, ··· 276 245 atomic64_t m_allocbt_blks; 277 246 278 247 struct xfs_groups m_groups[XG_TYPE_MAX]; 279 - uint64_t m_resblks; /* total reserved blocks */ 280 - uint64_t m_resblks_avail;/* available reserved blocks */ 281 - uint64_t m_resblks_save; /* reserved blks @ remount,ro */ 282 248 struct delayed_work m_reclaim_work; /* background inode reclaim */ 249 + struct xfs_zone_info *m_zone_info; /* zone allocator information */ 283 250 struct dentry *m_debugfs; /* debugfs parent */ 284 251 struct xfs_kobj m_kobj; 285 252 struct xfs_kobj m_error_kobj; ··· 287 258 #ifdef CONFIG_XFS_ONLINE_SCRUB_STATS 288 259 struct xchk_stats *m_scrub_stats; 289 260 #endif 261 + struct xfs_kobj m_zoned_kobj; 290 262 xfs_agnumber_t m_agfrotor; /* last ag where space found */ 291 263 atomic_t m_agirotor; /* last ag dir inode alloced */ 292 264 atomic_t m_rtgrotor; /* last rtgroup rtpicked */ 265 + 266 + struct mutex m_metafile_resv_lock; 267 + uint64_t m_metafile_resv_target; 268 + uint64_t m_metafile_resv_used; 269 + uint64_t m_metafile_resv_avail; 293 270 294 271 /* Memory shrinker to throttle and reprioritize inodegc */ 295 272 struct shrinker *m_inodegc_shrinker; ··· 371 336 #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ 372 337 #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ 373 338 #define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */ 339 + #define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */ 374 340 375 341 /* Mount features */ 342 + #define 
XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ 376 343 #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ 377 344 #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ 378 345 #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ ··· 429 392 __XFS_HAS_FEAT(large_extent_counts, NREXT64) 430 393 __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) 431 394 __XFS_HAS_FEAT(metadir, METADIR) 395 + __XFS_HAS_FEAT(zoned, ZONED) 396 + __XFS_HAS_FEAT(nolifetime, NOLIFETIME) 432 397 433 398 static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) 434 399 { ··· 441 402 static inline bool xfs_has_rtsb(const struct xfs_mount *mp) 442 403 { 443 404 /* all rtgroups filesystems with an rt section have an rtsb */ 444 - return xfs_has_rtgroups(mp) && xfs_has_realtime(mp); 405 + return xfs_has_rtgroups(mp) && 406 + xfs_has_realtime(mp) && 407 + !xfs_has_zoned(mp); 445 408 } 446 409 447 410 static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp) ··· 456 415 { 457 416 return xfs_has_metadir(mp) && xfs_has_realtime(mp) && 458 417 xfs_has_reflink(mp); 418 + } 419 + 420 + static inline bool xfs_has_nonzoned(const struct xfs_mount *mp) 421 + { 422 + return !xfs_has_zoned(mp); 459 423 } 460 424 461 425 /* ··· 566 520 #define XFS_OPSTATE_WARNED_METADIR 17 567 521 /* Filesystem should use qflags to determine quotaon status */ 568 522 #define XFS_OPSTATE_RESUMING_QUOTAON 18 523 + /* Kernel has logged a warning about zoned RT device being used on this fs. 
*/ 524 + #define XFS_OPSTATE_WARNED_ZONED 19 525 + /* (Zoned) GC is in progress */ 526 + #define XFS_OPSTATE_ZONEGC_RUNNING 20 569 527 570 528 #define __XFS_IS_OPSTATE(name, NAME) \ 571 529 static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ ··· 614 564 #endif /* CONFIG_XFS_QUOTA */ 615 565 __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) 616 566 __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) 567 + __XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING) 617 568 618 569 static inline bool 619 570 xfs_should_warn(struct xfs_mount *mp, long nr) ··· 684 633 } 685 634 686 635 extern void xfs_uuid_table_free(void); 687 - extern uint64_t xfs_default_resblks(xfs_mount_t *mp); 636 + uint64_t xfs_default_resblks(struct xfs_mount *mp, 637 + enum xfs_free_counter ctr); 688 638 extern int xfs_mountfs(xfs_mount_t *mp); 689 639 extern void xfs_unmountfs(xfs_mount_t *); 690 640 ··· 698 646 */ 699 647 #define XFS_FDBLOCKS_BATCH 1024 700 648 649 + uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp, 650 + enum xfs_free_counter ctr); 651 + 701 652 /* 702 - * Estimate the amount of free space that is not available to userspace and is 703 - * not explicitly reserved from the incore fdblocks. This includes: 704 - * 705 - * - The minimum number of blocks needed to support splitting a bmap btree 706 - * - The blocks currently in use by the freespace btrees because they record 707 - * the actual blocks that will fill per-AG metadata space reservations 653 + * Sum up the freecount, but never return negative values. 
708 654 */ 709 - static inline uint64_t 710 - xfs_fdblocks_unavailable( 711 - struct xfs_mount *mp) 655 + static inline s64 xfs_sum_freecounter(struct xfs_mount *mp, 656 + enum xfs_free_counter ctr) 712 657 { 713 - return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); 658 + return percpu_counter_sum_positive(&mp->m_free[ctr].count); 714 659 } 715 660 716 - int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, 661 + /* 662 + * Same as above, but does return negative values. Mostly useful for 663 + * special cases like repair and tracing. 664 + */ 665 + static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp, 666 + enum xfs_free_counter ctr) 667 + { 668 + return percpu_counter_sum(&mp->m_free[ctr].count); 669 + } 670 + 671 + /* 672 + * This just provides and estimate without the cpu-local updates, use 673 + * xfs_sum_freecounter for the exact value. 674 + */ 675 + static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp, 676 + enum xfs_free_counter ctr) 677 + { 678 + return percpu_counter_read_positive(&mp->m_free[ctr].count); 679 + } 680 + 681 + static inline int xfs_compare_freecounter(struct xfs_mount *mp, 682 + enum xfs_free_counter ctr, s64 rhs, s32 batch) 683 + { 684 + return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch); 685 + } 686 + 687 + static inline void xfs_set_freecounter(struct xfs_mount *mp, 688 + enum xfs_free_counter ctr, uint64_t val) 689 + { 690 + percpu_counter_set(&mp->m_free[ctr].count, val); 691 + } 692 + 693 + int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, 717 694 uint64_t delta, bool rsvd); 718 - void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, 695 + void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, 719 696 uint64_t delta); 720 697 721 698 static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, 722 699 bool reserved) 723 700 { 724 - return xfs_dec_freecounter(mp, &mp->m_fdblocks, 
delta, reserved); 701 + return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved); 725 702 } 726 703 727 704 static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) 728 705 { 729 - xfs_add_freecounter(mp, &mp->m_fdblocks, delta); 706 + xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta); 730 707 } 731 708 732 709 static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) 733 710 { 734 - return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); 711 + return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false); 735 712 } 736 713 737 714 static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) 738 715 { 739 - xfs_add_freecounter(mp, &mp->m_frextents, delta); 716 + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta); 740 717 } 741 718 742 719 extern int xfs_readsb(xfs_mount_t *, int); ··· 787 706 bool xfs_clear_incompat_log_features(struct xfs_mount *mp); 788 707 void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, 789 708 int64_t ind_delta); 709 + static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) 710 + { 711 + percpu_counter_add(&mp->m_delalloc_blks, delta); 712 + } 790 713 791 714 #endif /* __XFS_MOUNT_H__ */
+2 -1
fs/xfs/xfs_qm.c
··· 1711 1711 * immediately. We only support rtquota if rtgroups are enabled to 1712 1712 * avoid problems with older kernels. 1713 1713 */ 1714 - if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) { 1714 + if (mp->m_sb.sb_rextents && 1715 + (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) { 1715 1716 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); 1716 1717 mp->m_qflags = 0; 1717 1718 goto write_changes;
+6 -12
fs/xfs/xfs_reflink.c
··· 235 235 int error = 0; 236 236 237 237 /* Holes, unwritten, and delalloc extents cannot be shared */ 238 - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { 238 + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) { 239 239 *shared = false; 240 240 return 0; 241 241 } ··· 651 651 652 652 if (isnullstartblock(del.br_startblock)) { 653 653 xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, 654 - &del); 654 + &del, 0); 655 655 } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { 656 656 ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); 657 657 ··· 1207 1207 if (!xfs_has_rmapbt(mp)) 1208 1208 return 0; 1209 1209 if (XFS_IS_REALTIME_INODE(ip)) { 1210 - struct xfs_rtgroup *rtg; 1211 - xfs_rgnumber_t rgno; 1212 - 1213 - rgno = xfs_rtb_to_rgno(mp, fsb); 1214 - rtg = xfs_rtgroup_get(mp, rgno); 1215 - if (xfs_metafile_resv_critical(rtg_rmap(rtg))) 1216 - error = -ENOSPC; 1217 - xfs_rtgroup_put(rtg); 1218 - return error; 1210 + if (xfs_metafile_resv_critical(mp)) 1211 + return -ENOSPC; 1212 + return 0; 1219 1213 } 1220 1214 1221 1215 agno = XFS_FSB_TO_AGNO(mp, fsb); ··· 1532 1538 return 0; 1533 1539 1534 1540 trace_xfs_zero_eof(ip, isize, pos - isize); 1535 - return xfs_zero_range(ip, isize, pos - isize, NULL); 1541 + return xfs_zero_range(ip, isize, pos - isize, NULL, NULL); 1536 1542 } 1537 1543 1538 1544 /*
+145 -99
fs/xfs/xfs_rtalloc.c
··· 33 33 #include "xfs_trace.h" 34 34 #include "xfs_rtrefcount_btree.h" 35 35 #include "xfs_reflink.h" 36 + #include "xfs_zone_alloc.h" 36 37 37 38 /* 38 39 * Return whether there are any free extents in the size range given ··· 664 663 665 664 for (i = 0; i < XFS_RTGI_MAX; i++) 666 665 xfs_rtginode_irele(&rtg->rtg_inodes[i]); 667 - kvfree(rtg->rtg_rsum_cache); 666 + if (!xfs_has_zoned(rtg_mount(rtg))) 667 + kvfree(rtg->rtg_rsum_cache); 668 668 } 669 669 670 670 static int ··· 839 837 return 0; 840 838 841 839 error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1), 842 - 0, &rtsb_bp); 840 + &rtsb_bp); 843 841 if (error) 844 842 return error; 845 843 ··· 857 855 if (rtg_rmap(args->rtg) != NULL) 858 856 error = xfs_rtrmapbt_init_rtsb(nargs->mp, args->rtg, args->tp); 859 857 858 + return error; 859 + } 860 + 861 + static void 862 + xfs_growfs_rt_sb_fields( 863 + struct xfs_trans *tp, 864 + const struct xfs_mount *nmp) 865 + { 866 + struct xfs_mount *mp = tp->t_mountp; 867 + 868 + if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) 869 + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, 870 + nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); 871 + if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) 872 + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, 873 + nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); 874 + if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) 875 + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, 876 + nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); 877 + if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) 878 + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, 879 + nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); 880 + if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) 881 + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, 882 + nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); 883 + if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) 884 + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT, 885 + nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); 886 + } 887 + 888 + static int 889 + 
xfs_growfs_rt_zoned( 890 + struct xfs_rtgroup *rtg, 891 + xfs_rfsblock_t nrblocks) 892 + { 893 + struct xfs_mount *mp = rtg_mount(rtg); 894 + struct xfs_mount *nmp; 895 + struct xfs_trans *tp; 896 + xfs_rtbxlen_t freed_rtx; 897 + int error; 898 + 899 + /* 900 + * Calculate new sb and mount fields for this round. Also ensure the 901 + * rtg_extents value is uptodate as the rtbitmap code relies on it. 902 + */ 903 + nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, 904 + mp->m_sb.sb_rextsize); 905 + if (!nmp) 906 + return -ENOMEM; 907 + freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; 908 + 909 + xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg), 910 + nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents); 911 + 912 + error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp); 913 + if (error) 914 + goto out_free; 915 + 916 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 917 + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); 918 + 919 + xfs_growfs_rt_sb_fields(tp, nmp); 920 + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx); 921 + 922 + error = xfs_trans_commit(tp); 923 + if (error) 924 + goto out_free; 925 + 926 + /* 927 + * Ensure the mount RT feature flag is now set, and compute new 928 + * maxlevels for rt btrees. 929 + */ 930 + mp->m_features |= XFS_FEAT_REALTIME; 931 + xfs_rtrmapbt_compute_maxlevels(mp); 932 + xfs_rtrefcountbt_compute_maxlevels(mp); 933 + xfs_zoned_add_available(mp, freed_rtx); 934 + out_free: 935 + kfree(nmp); 860 936 return error; 861 937 } 862 938 ··· 1023 943 /* 1024 944 * Update superblock fields. 
1025 945 */ 1026 - if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) 1027 - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE, 1028 - nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); 1029 - if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) 1030 - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS, 1031 - nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); 1032 - if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) 1033 - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS, 1034 - nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); 1035 - if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) 1036 - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS, 1037 - nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); 1038 - if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) 1039 - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG, 1040 - nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); 1041 - if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) 1042 - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT, 1043 - nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); 946 + xfs_growfs_rt_sb_fields(args.tp, nmp); 1044 947 1045 948 /* 1046 949 * Free the new extent. 
··· 1190 1127 goto out_rele; 1191 1128 } 1192 1129 1130 + if (xfs_has_zoned(mp)) { 1131 + error = xfs_growfs_rt_zoned(rtg, nrblocks); 1132 + goto out_rele; 1133 + } 1134 + 1193 1135 error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks); 1194 1136 if (error) 1195 1137 goto out_rele; ··· 1212 1144 goto out_error; 1213 1145 } 1214 1146 1215 - if (old_rsum_cache) 1216 - kvfree(old_rsum_cache); 1217 - xfs_rtgroup_rele(rtg); 1218 - return 0; 1147 + kvfree(old_rsum_cache); 1148 + goto out_rele; 1219 1149 1220 1150 out_error: 1221 1151 /* ··· 1261 1195 1262 1196 if (min_logfsbs > mp->m_sb.sb_logblocks) 1263 1197 return -EINVAL; 1198 + 1199 + if (xfs_has_zoned(mp)) { 1200 + uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks; 1201 + uint32_t rem; 1202 + 1203 + if (rextsize != 1) 1204 + return -EINVAL; 1205 + div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem); 1206 + if (rem) { 1207 + xfs_warn(mp, 1208 + "new RT volume size (%lld) not aligned to RT group size (%d)", 1209 + mp->m_sb.sb_rblocks, gblocks); 1210 + return -EINVAL; 1211 + } 1212 + } 1213 + 1264 1214 return 0; 1265 1215 } 1266 1216 ··· 1331 1249 } 1332 1250 1333 1251 /* 1252 + * Read in the last block of the RT device to make sure it is accessible. 
1253 + */ 1254 + static int 1255 + xfs_rt_check_size( 1256 + struct xfs_mount *mp, 1257 + xfs_rfsblock_t last_block) 1258 + { 1259 + xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block); 1260 + struct xfs_buf *bp; 1261 + int error; 1262 + 1263 + if (XFS_BB_TO_FSB(mp, daddr) != last_block) { 1264 + xfs_warn(mp, "RT device size overflow: %llu != %llu", 1265 + XFS_BB_TO_FSB(mp, daddr), last_block); 1266 + return -EFBIG; 1267 + } 1268 + 1269 + error = xfs_buf_read_uncached(mp->m_rtdev_targp, 1270 + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr, 1271 + XFS_FSB_TO_BB(mp, 1), &bp, NULL); 1272 + if (error) 1273 + xfs_warn(mp, "cannot read last RT device sector (%lld)", 1274 + last_block); 1275 + else 1276 + xfs_buf_relse(bp); 1277 + return error; 1278 + } 1279 + 1280 + /* 1334 1281 * Grow the realtime area of the filesystem. 1335 1282 */ 1336 1283 int ··· 1370 1259 xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount; 1371 1260 xfs_rgnumber_t new_rgcount = 1; 1372 1261 xfs_rgnumber_t rgno; 1373 - struct xfs_buf *bp; 1374 1262 xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; 1375 1263 int error; 1376 1264 ··· 1412 1302 error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); 1413 1303 if (error) 1414 1304 goto out_unlock; 1415 - /* 1416 - * Read in the last block of the device, make sure it exists. 1417 - */ 1418 - error = xfs_buf_read_uncached(mp->m_rtdev_targp, 1419 - XFS_FSB_TO_BB(mp, in->newblocks - 1), 1420 - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); 1305 + 1306 + error = xfs_rt_check_size(mp, in->newblocks - 1); 1421 1307 if (error) 1422 1308 goto out_unlock; 1423 - xfs_buf_relse(bp); 1424 1309 1425 1310 /* 1426 1311 * Calculate new parameters. These are the final values to be reached. ··· 1481 1376 error = error2; 1482 1377 1483 1378 /* Reset the rt metadata btree space reservations. 
*/ 1484 - xfs_rt_resv_free(mp); 1485 - error2 = xfs_rt_resv_init(mp); 1379 + error2 = xfs_metafile_resv_init(mp); 1486 1380 if (error2 && error2 != -ENOSPC) 1487 1381 error = error2; 1488 1382 } ··· 1511 1407 1512 1408 /* m_blkbb_log is not set up yet */ 1513 1409 error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR, 1514 - mp->m_sb.sb_blocksize >> BBSHIFT, 0, &bp, 1410 + mp->m_sb.sb_blocksize >> BBSHIFT, &bp, 1515 1411 &xfs_rtsb_buf_ops); 1516 1412 if (error) { 1517 1413 xfs_warn(mp, "rt sb validate failed with error %d.", error); ··· 1548 1444 xfs_rtmount_init( 1549 1445 struct xfs_mount *mp) /* file system mount structure */ 1550 1446 { 1551 - struct xfs_buf *bp; /* buffer for last block of subvolume */ 1552 - xfs_daddr_t d; /* address of last block of subvolume */ 1553 - int error; 1554 - 1555 1447 if (mp->m_sb.sb_rblocks == 0) 1556 1448 return 0; 1557 1449 if (mp->m_rtdev_targp == NULL) { ··· 1558 1458 1559 1459 mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels); 1560 1460 1561 - /* 1562 - * Check that the realtime section is an ok size. 
1563 - */ 1564 - d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 1565 - if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { 1566 - xfs_warn(mp, "realtime mount -- %llu != %llu", 1567 - (unsigned long long) XFS_BB_TO_FSB(mp, d), 1568 - (unsigned long long) mp->m_sb.sb_rblocks); 1569 - return -EFBIG; 1570 - } 1571 - error = xfs_buf_read_uncached(mp->m_rtdev_targp, 1572 - d - XFS_FSB_TO_BB(mp, 1), 1573 - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); 1574 - if (error) { 1575 - xfs_warn(mp, "realtime device size check failed"); 1576 - return error; 1577 - } 1578 - xfs_buf_relse(bp); 1579 - return 0; 1461 + return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1); 1580 1462 } 1581 1463 1582 1464 static int ··· 1601 1519 spin_lock(&mp->m_sb_lock); 1602 1520 mp->m_sb.sb_frextents = val; 1603 1521 spin_unlock(&mp->m_sb_lock); 1604 - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); 1522 + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); 1605 1523 return 0; 1606 - } 1607 - 1608 - /* Free space reservations for rt metadata inodes. */ 1609 - void 1610 - xfs_rt_resv_free( 1611 - struct xfs_mount *mp) 1612 - { 1613 - struct xfs_rtgroup *rtg = NULL; 1614 - unsigned int i; 1615 - 1616 - while ((rtg = xfs_rtgroup_next(mp, rtg))) { 1617 - for (i = 0; i < XFS_RTGI_MAX; i++) 1618 - xfs_metafile_resv_free(rtg->rtg_inodes[i]); 1619 - } 1620 - } 1621 - 1622 - /* Reserve space for rt metadata inodes' space expansion. 
*/ 1623 - int 1624 - xfs_rt_resv_init( 1625 - struct xfs_mount *mp) 1626 - { 1627 - struct xfs_rtgroup *rtg = NULL; 1628 - xfs_filblks_t ask; 1629 - int error = 0; 1630 - 1631 - while ((rtg = xfs_rtgroup_next(mp, rtg))) { 1632 - int err2; 1633 - 1634 - ask = xfs_rtrmapbt_calc_reserves(mp); 1635 - err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); 1636 - if (err2 && !error) 1637 - error = err2; 1638 - 1639 - ask = xfs_rtrefcountbt_calc_reserves(mp); 1640 - err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); 1641 - if (err2 && !error) 1642 - error = err2; 1643 - } 1644 - 1645 - return error; 1646 1524 } 1647 1525 1648 1526 /* ··· 1655 1613 } 1656 1614 } 1657 1615 1616 + if (xfs_has_zoned(mp)) 1617 + return 0; 1658 1618 return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks); 1659 1619 } 1660 1620 ··· 2140 2096 bool initial_user_data = 2141 2097 ap->datatype & XFS_ALLOC_INITIAL_USER_DATA; 2142 2098 int error; 2099 + 2100 + ASSERT(!xfs_has_zoned(ap->tp->t_mountp)); 2143 2101 2144 2102 retry: 2145 2103 error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
-5
fs/xfs/xfs_rtalloc.h
··· 34 34 xfs_rtmount_inodes( 35 35 struct xfs_mount *mp); /* file system mount structure */ 36 36 37 - void xfs_rt_resv_free(struct xfs_mount *mp); 38 - int xfs_rt_resv_init(struct xfs_mount *mp); 39 - 40 37 /* 41 38 * Grow the realtime area of the filesystem. 42 39 */ ··· 62 65 } 63 66 # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS)) 64 67 # define xfs_rtunmount_inodes(m) 65 - # define xfs_rt_resv_free(mp) ((void)0) 66 - # define xfs_rt_resv_init(mp) (0) 67 68 68 69 static inline int 69 70 xfs_growfs_check_rtgeom(const struct xfs_mount *mp,
+129 -36
fs/xfs/xfs_super.c
··· 46 46 #include "xfs_exchmaps_item.h" 47 47 #include "xfs_parent.h" 48 48 #include "xfs_rtalloc.h" 49 + #include "xfs_zone_alloc.h" 49 50 #include "scrub/stats.h" 50 51 #include "scrub/rcbag_btree.h" 51 52 ··· 110 109 Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, 111 110 Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, 112 111 Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, 113 - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, 112 + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, 113 + Opt_lifetime, Opt_nolifetime, 114 114 }; 115 115 116 116 static const struct fs_parameter_spec xfs_fs_parameters[] = { ··· 156 154 fsparam_flag("nodiscard", Opt_nodiscard), 157 155 fsparam_flag("dax", Opt_dax), 158 156 fsparam_enum("dax", Opt_dax_enum, dax_param_enums), 157 + fsparam_u32("max_open_zones", Opt_max_open_zones), 158 + fsparam_flag("lifetime", Opt_lifetime), 159 + fsparam_flag("nolifetime", Opt_nolifetime), 159 160 {} 160 161 }; 161 162 ··· 187 182 { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, 188 183 { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, 189 184 { XFS_FEAT_DAX_NEVER, ",dax=never" }, 185 + { XFS_FEAT_NOLIFETIME, ",nolifetime" }, 190 186 { 0, NULL } 191 187 }; 192 188 struct xfs_mount *mp = XFS_M(root->d_sb); ··· 238 232 239 233 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) 240 234 seq_puts(m, ",noquota"); 235 + 236 + if (mp->m_max_open_zones) 237 + seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); 241 238 242 239 return 0; 243 240 } ··· 542 533 if (error) 543 534 return error; 544 535 } 545 - if (mp->m_rtdev_targp) { 536 + 537 + if (mp->m_sb.sb_rtstart) { 538 + if (mp->m_rtdev_targp) { 539 + xfs_warn(mp, 540 + "can't use internal and external rtdev at the same time"); 541 + return -EINVAL; 542 + } 543 + mp->m_rtdev_targp = mp->m_ddev_targp; 544 + } else if (mp->m_rtname) { 546 545 error = xfs_setsize_buftarg(mp->m_rtdev_targp, 547 546 mp->m_sb.sb_sectsize); 548 547 if (error) ··· 774 757 { 775 758 if 
(mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) 776 759 xfs_free_buftarg(mp->m_logdev_targp); 777 - if (mp->m_rtdev_targp) 760 + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) 778 761 xfs_free_buftarg(mp->m_rtdev_targp); 779 762 if (mp->m_ddev_targp) 780 763 xfs_free_buftarg(mp->m_ddev_targp); ··· 831 814 if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { 832 815 xfs_inodegc_stop(mp); 833 816 xfs_blockgc_stop(mp); 817 + xfs_zone_gc_stop(mp); 834 818 } 835 819 836 820 return 0; ··· 852 834 struct kstatfs *st) 853 835 { 854 836 int64_t fdblocks = 855 - percpu_counter_sum(&mp->m_fdblocks); 837 + xfs_sum_freecounter(mp, XC_FREE_BLOCKS); 856 838 857 839 /* make sure st->f_bfree does not underflow */ 858 - st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp)); 840 + st->f_bfree = max(0LL, 841 + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); 842 + 859 843 /* 860 844 * sb_dblocks can change during growfs, but nothing cares about reporting 861 845 * the old or new value during growfs. 
··· 876 856 struct kstatfs *st) 877 857 { 878 858 st->f_bfree = xfs_rtbxlen_to_blen(mp, 879 - percpu_counter_sum_positive(&mp->m_frextents)); 880 - st->f_blocks = mp->m_sb.sb_rblocks; 859 + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); 860 + st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp, 861 + mp->m_free[XC_FREE_RTEXTENTS].res_total); 881 862 } 882 863 883 864 static void ··· 943 922 } 944 923 945 924 STATIC void 946 - xfs_save_resvblks(struct xfs_mount *mp) 925 + xfs_save_resvblks( 926 + struct xfs_mount *mp) 947 927 { 948 - mp->m_resblks_save = mp->m_resblks; 949 - xfs_reserve_blocks(mp, 0); 928 + enum xfs_free_counter i; 929 + 930 + for (i = 0; i < XC_FREE_NR; i++) { 931 + mp->m_free[i].res_saved = mp->m_free[i].res_total; 932 + xfs_reserve_blocks(mp, i, 0); 933 + } 950 934 } 951 935 952 936 STATIC void 953 - xfs_restore_resvblks(struct xfs_mount *mp) 937 + xfs_restore_resvblks( 938 + struct xfs_mount *mp) 954 939 { 955 - uint64_t resblks; 940 + uint64_t resblks; 941 + enum xfs_free_counter i; 956 942 957 - if (mp->m_resblks_save) { 958 - resblks = mp->m_resblks_save; 959 - mp->m_resblks_save = 0; 960 - } else 961 - resblks = xfs_default_resblks(mp); 962 - 963 - xfs_reserve_blocks(mp, resblks); 943 + for (i = 0; i < XC_FREE_NR; i++) { 944 + if (mp->m_free[i].res_saved) { 945 + resblks = mp->m_free[i].res_saved; 946 + mp->m_free[i].res_saved = 0; 947 + } else 948 + resblks = xfs_default_resblks(mp, i); 949 + xfs_reserve_blocks(mp, i, resblks); 950 + } 964 951 } 965 952 966 953 /* ··· 1005 976 if (ret && !xfs_is_readonly(mp)) { 1006 977 xfs_blockgc_start(mp); 1007 978 xfs_inodegc_start(mp); 979 + xfs_zone_gc_start(mp); 1008 980 } 1009 981 1010 982 return ret; ··· 1027 997 * filesystem. 
1028 998 */ 1029 999 if (!xfs_is_readonly(mp)) { 1000 + xfs_zone_gc_start(mp); 1030 1001 xfs_blockgc_start(mp); 1031 1002 xfs_inodegc_start(mp); 1032 1003 } ··· 1089 1058 return -EINVAL; 1090 1059 } 1091 1060 1061 + if (!xfs_has_zoned(mp)) { 1062 + if (mp->m_max_open_zones) { 1063 + xfs_warn(mp, 1064 + "max_open_zones mount option only supported on zoned file systems."); 1065 + return -EINVAL; 1066 + } 1067 + if (mp->m_features & XFS_FEAT_NOLIFETIME) { 1068 + xfs_warn(mp, 1069 + "nolifetime mount option only supported on zoned file systems."); 1070 + return -EINVAL; 1071 + } 1072 + } 1073 + 1092 1074 return 0; 1093 1075 } 1094 1076 ··· 1109 1065 xfs_init_percpu_counters( 1110 1066 struct xfs_mount *mp) 1111 1067 { 1112 - int error; 1068 + int error; 1069 + int i; 1113 1070 1114 1071 error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); 1115 1072 if (error) ··· 1120 1075 if (error) 1121 1076 goto free_icount; 1122 1077 1123 - error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); 1124 - if (error) 1125 - goto free_ifree; 1126 - 1127 1078 error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); 1128 1079 if (error) 1129 - goto free_fdblocks; 1080 + goto free_ifree; 1130 1081 1131 1082 error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); 1132 1083 if (error) 1133 1084 goto free_delalloc; 1134 1085 1135 - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); 1136 - if (error) 1137 - goto free_delalloc_rt; 1086 + for (i = 0; i < XC_FREE_NR; i++) { 1087 + error = percpu_counter_init(&mp->m_free[i].count, 0, 1088 + GFP_KERNEL); 1089 + if (error) 1090 + goto free_freecounters; 1091 + } 1138 1092 1139 1093 return 0; 1140 1094 1141 - free_delalloc_rt: 1095 + free_freecounters: 1096 + while (--i > 0) 1097 + percpu_counter_destroy(&mp->m_free[i].count); 1142 1098 percpu_counter_destroy(&mp->m_delalloc_rtextents); 1143 1099 free_delalloc: 1144 1100 percpu_counter_destroy(&mp->m_delalloc_blks); 1145 - free_fdblocks: 1146 - 
percpu_counter_destroy(&mp->m_fdblocks); 1147 1101 free_ifree: 1148 1102 percpu_counter_destroy(&mp->m_ifree); 1149 1103 free_icount: ··· 1156 1112 { 1157 1113 percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); 1158 1114 percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); 1159 - percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); 1160 - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); 1115 + xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); 1116 + if (!xfs_has_zoned(mp)) 1117 + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, 1118 + mp->m_sb.sb_frextents); 1161 1119 } 1162 1120 1163 1121 static void 1164 1122 xfs_destroy_percpu_counters( 1165 1123 struct xfs_mount *mp) 1166 1124 { 1125 + enum xfs_free_counter i; 1126 + 1127 + for (i = 0; i < XC_FREE_NR; i++) 1128 + percpu_counter_destroy(&mp->m_free[i].count); 1167 1129 percpu_counter_destroy(&mp->m_icount); 1168 1130 percpu_counter_destroy(&mp->m_ifree); 1169 - percpu_counter_destroy(&mp->m_fdblocks); 1170 1131 ASSERT(xfs_is_shutdown(mp) || 1171 1132 percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); 1172 1133 percpu_counter_destroy(&mp->m_delalloc_rtextents); 1173 1134 ASSERT(xfs_is_shutdown(mp) || 1174 1135 percpu_counter_sum(&mp->m_delalloc_blks) == 0); 1175 1136 percpu_counter_destroy(&mp->m_delalloc_blks); 1176 - percpu_counter_destroy(&mp->m_frextents); 1177 1137 } 1178 1138 1179 1139 static int ··· 1258 1210 xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); 1259 1211 } 1260 1212 1213 + static int 1214 + xfs_fs_show_stats( 1215 + struct seq_file *m, 1216 + struct dentry *root) 1217 + { 1218 + struct xfs_mount *mp = XFS_M(root->d_sb); 1219 + 1220 + if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT)) 1221 + xfs_zoned_show_stats(m, mp); 1222 + return 0; 1223 + } 1224 + 1261 1225 static const struct super_operations xfs_super_operations = { 1262 1226 .alloc_inode = xfs_fs_alloc_inode, 1263 1227 .destroy_inode = xfs_fs_destroy_inode, ··· 1284 1224 .nr_cached_objects = 
xfs_fs_nr_cached_objects, 1285 1225 .free_cached_objects = xfs_fs_free_cached_objects, 1286 1226 .shutdown = xfs_fs_shutdown, 1227 + .show_stats = xfs_fs_show_stats, 1287 1228 }; 1288 1229 1289 1230 static int ··· 1496 1435 case Opt_noattr2: 1497 1436 xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); 1498 1437 parsing_mp->m_features |= XFS_FEAT_NOATTR2; 1438 + return 0; 1439 + case Opt_max_open_zones: 1440 + parsing_mp->m_max_open_zones = result.uint_32; 1441 + return 0; 1442 + case Opt_lifetime: 1443 + parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; 1444 + return 0; 1445 + case Opt_nolifetime: 1446 + parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; 1499 1447 return 0; 1500 1448 default: 1501 1449 xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); ··· 1850 1780 mp->m_features &= ~XFS_FEAT_DISCARD; 1851 1781 } 1852 1782 1853 - if (xfs_has_metadir(mp)) 1783 + if (xfs_has_zoned(mp)) { 1784 + if (!xfs_has_metadir(mp)) { 1785 + xfs_alert(mp, 1786 + "metadir feature required for zoned realtime devices."); 1787 + error = -EINVAL; 1788 + goto out_filestream_unmount; 1789 + } 1790 + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED); 1791 + } else if (xfs_has_metadir(mp)) { 1854 1792 xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); 1793 + } 1855 1794 1856 1795 if (xfs_has_reflink(mp)) { 1857 1796 if (xfs_has_realtime(mp) && ··· 1868 1789 xfs_alert(mp, 1869 1790 "reflink not compatible with realtime extent size %u!", 1870 1791 mp->m_sb.sb_rextsize); 1792 + error = -EINVAL; 1793 + goto out_filestream_unmount; 1794 + } 1795 + 1796 + if (xfs_has_zoned(mp)) { 1797 + xfs_alert(mp, 1798 + "reflink not compatible with zoned RT device!"); 1871 1799 error = -EINVAL; 1872 1800 goto out_filestream_unmount; 1873 1801 } ··· 2003 1917 /* Re-enable the background inode inactivation worker. 
*/ 2004 1918 xfs_inodegc_start(mp); 2005 1919 1920 + /* Restart zone reclaim */ 1921 + xfs_zone_gc_start(mp); 1922 + 2006 1923 return 0; 2007 1924 } 2008 1925 ··· 2049 1960 * we send inodes straight to reclaim, so no inodes will be queued. 2050 1961 */ 2051 1962 xfs_inodegc_stop(mp); 1963 + 1964 + /* Stop zone reclaim */ 1965 + xfs_zone_gc_stop(mp); 2052 1966 2053 1967 /* Free the per-AG metadata reservation pool. */ 2054 1968 xfs_fs_unreserve_ag_blocks(mp); ··· 2174 2082 for (i = 0; i < XG_TYPE_MAX; i++) 2175 2083 xa_init(&mp->m_groups[i].xa); 2176 2084 mutex_init(&mp->m_growlock); 2085 + mutex_init(&mp->m_metafile_resv_lock); 2177 2086 INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); 2178 2087 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); 2179 2088 mp->m_kobj.kobject.kset = xfs_kset;
+68 -7
fs/xfs/xfs_sysfs.c
··· 13 13 #include "xfs_log.h" 14 14 #include "xfs_log_priv.h" 15 15 #include "xfs_mount.h" 16 + #include "xfs_zones.h" 16 17 17 18 struct xfs_sysfs_attr { 18 19 struct attribute attr; ··· 70 69 }; 71 70 ATTRIBUTE_GROUPS(xfs_mp); 72 71 73 - const struct kobj_type xfs_mp_ktype = { 72 + static const struct kobj_type xfs_mp_ktype = { 74 73 .release = xfs_sysfs_release, 75 74 .sysfs_ops = &xfs_sysfs_ops, 76 75 .default_groups = xfs_mp_groups, ··· 702 701 return error; 703 702 } 704 703 704 + static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj) 705 + { 706 + return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj); 707 + } 708 + 709 + static ssize_t 710 + max_open_zones_show( 711 + struct kobject *kobj, 712 + char *buf) 713 + { 714 + /* only report the open zones available for user data */ 715 + return sysfs_emit(buf, "%u\n", 716 + zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES); 717 + } 718 + XFS_SYSFS_ATTR_RO(max_open_zones); 719 + 720 + static struct attribute *xfs_zoned_attrs[] = { 721 + ATTR_LIST(max_open_zones), 722 + NULL, 723 + }; 724 + ATTRIBUTE_GROUPS(xfs_zoned); 725 + 726 + static const struct kobj_type xfs_zoned_ktype = { 727 + .release = xfs_sysfs_release, 728 + .sysfs_ops = &xfs_sysfs_ops, 729 + .default_groups = xfs_zoned_groups, 730 + }; 731 + 705 732 int 706 - xfs_error_sysfs_init( 733 + xfs_mount_sysfs_init( 707 734 struct xfs_mount *mp) 708 735 { 709 736 int error; 737 + 738 + super_set_sysfs_name_id(mp->m_super); 739 + 740 + /* .../xfs/<dev>/ */ 741 + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, 742 + NULL, mp->m_super->s_id); 743 + if (error) 744 + return error; 745 + 746 + /* .../xfs/<dev>/stats/ */ 747 + error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, 748 + &mp->m_kobj, "stats"); 749 + if (error) 750 + goto out_remove_fsdir; 710 751 711 752 /* .../xfs/<dev>/error/ */ 712 753 error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, 713 754 &mp->m_kobj, "error"); 714 755 if (error) 715 - return 
error; 756 + goto out_remove_stats_dir; 716 757 758 + /* .../xfs/<dev>/error/fail_at_unmount */ 717 759 error = sysfs_create_file(&mp->m_error_kobj.kobject, 718 760 ATTR_LIST(fail_at_unmount)); 719 761 720 762 if (error) 721 - goto out_error; 763 + goto out_remove_error_dir; 722 764 723 765 /* .../xfs/<dev>/error/metadata/ */ 724 766 error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, 725 767 "metadata", &mp->m_error_meta_kobj, 726 768 xfs_error_meta_init); 727 769 if (error) 728 - goto out_error; 770 + goto out_remove_error_dir; 771 + 772 + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) { 773 + /* .../xfs/<dev>/zoned/ */ 774 + error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype, 775 + &mp->m_kobj, "zoned"); 776 + if (error) 777 + goto out_remove_error_dir; 778 + } 729 779 730 780 return 0; 731 781 732 - out_error: 782 + out_remove_error_dir: 733 783 xfs_sysfs_del(&mp->m_error_kobj); 784 + out_remove_stats_dir: 785 + xfs_sysfs_del(&mp->m_stats.xs_kobj); 786 + out_remove_fsdir: 787 + xfs_sysfs_del(&mp->m_kobj); 734 788 return error; 735 789 } 736 790 737 791 void 738 - xfs_error_sysfs_del( 792 + xfs_mount_sysfs_del( 739 793 struct xfs_mount *mp) 740 794 { 741 795 struct xfs_error_cfg *cfg; 742 796 int i, j; 797 + 798 + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) 799 + xfs_sysfs_del(&mp->m_zoned_kobj); 743 800 744 801 for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { 745 802 for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { ··· 808 749 } 809 750 xfs_sysfs_del(&mp->m_error_meta_kobj); 810 751 xfs_sysfs_del(&mp->m_error_kobj); 752 + xfs_sysfs_del(&mp->m_stats.xs_kobj); 753 + xfs_sysfs_del(&mp->m_kobj); 811 754 } 812 755 813 756 struct xfs_error_cfg *
+2 -3
fs/xfs/xfs_sysfs.h
··· 7 7 #ifndef __XFS_SYSFS_H__ 8 8 #define __XFS_SYSFS_H__ 9 9 10 - extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ 11 10 extern const struct kobj_type xfs_dbg_ktype; /* debug */ 12 11 extern const struct kobj_type xfs_log_ktype; /* xlog */ 13 12 extern const struct kobj_type xfs_stats_ktype; /* stats */ ··· 52 53 wait_for_completion(&kobj->complete); 53 54 } 54 55 55 - int xfs_error_sysfs_init(struct xfs_mount *mp); 56 - void xfs_error_sysfs_del(struct xfs_mount *mp); 56 + int xfs_mount_sysfs_init(struct xfs_mount *mp); 57 + void xfs_mount_sysfs_del(struct xfs_mount *mp); 57 58 58 59 #endif /* __XFS_SYSFS_H__ */
+2
fs/xfs/xfs_trace.c
··· 49 49 #include "xfs_metafile.h" 50 50 #include "xfs_metadir.h" 51 51 #include "xfs_rtgroup.h" 52 + #include "xfs_zone_alloc.h" 53 + #include "xfs_zone_priv.h" 52 54 53 55 /* 54 56 * We include this last to have the helpers above available for the trace
+203 -15
fs/xfs/xfs_trace.h
··· 102 102 struct xfs_refcount_intent; 103 103 struct xfs_metadir_update; 104 104 struct xfs_rtgroup; 105 + struct xfs_open_zone; 105 106 106 107 #define XFS_ATTR_FILTER_FLAGS \ 107 108 { XFS_ATTR_ROOT, "ROOT" }, \ ··· 265 264 DEFINE_GROUP_REF_EVENT(xfs_group_grab); 266 265 DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); 267 266 DEFINE_GROUP_REF_EVENT(xfs_group_rele); 267 + 268 + #ifdef CONFIG_XFS_RT 269 + DECLARE_EVENT_CLASS(xfs_zone_class, 270 + TP_PROTO(struct xfs_rtgroup *rtg), 271 + TP_ARGS(rtg), 272 + TP_STRUCT__entry( 273 + __field(dev_t, dev) 274 + __field(xfs_rgnumber_t, rgno) 275 + __field(xfs_rgblock_t, used) 276 + __field(unsigned int, nr_open) 277 + ), 278 + TP_fast_assign( 279 + struct xfs_mount *mp = rtg_mount(rtg); 280 + 281 + __entry->dev = mp->m_super->s_dev; 282 + __entry->rgno = rtg_rgno(rtg); 283 + __entry->used = rtg_rmap(rtg)->i_used_blocks; 284 + __entry->nr_open = mp->m_zone_info->zi_nr_open_zones; 285 + ), 286 + TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u", 287 + MAJOR(__entry->dev), MINOR(__entry->dev), 288 + __entry->rgno, 289 + __entry->used, 290 + __entry->nr_open) 291 + ); 292 + 293 + #define DEFINE_ZONE_EVENT(name) \ 294 + DEFINE_EVENT(xfs_zone_class, name, \ 295 + TP_PROTO(struct xfs_rtgroup *rtg), \ 296 + TP_ARGS(rtg)) 297 + DEFINE_ZONE_EVENT(xfs_zone_emptied); 298 + DEFINE_ZONE_EVENT(xfs_zone_full); 299 + DEFINE_ZONE_EVENT(xfs_zone_opened); 300 + DEFINE_ZONE_EVENT(xfs_zone_reset); 301 + DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); 302 + 303 + TRACE_EVENT(xfs_zone_free_blocks, 304 + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, 305 + xfs_extlen_t len), 306 + TP_ARGS(rtg, rgbno, len), 307 + TP_STRUCT__entry( 308 + __field(dev_t, dev) 309 + __field(xfs_rgnumber_t, rgno) 310 + __field(xfs_rgblock_t, used) 311 + __field(xfs_rgblock_t, rgbno) 312 + __field(xfs_extlen_t, len) 313 + ), 314 + TP_fast_assign( 315 + __entry->dev = rtg_mount(rtg)->m_super->s_dev; 316 + __entry->rgno = rtg_rgno(rtg); 317 + __entry->used = 
rtg_rmap(rtg)->i_used_blocks; 318 + __entry->rgbno = rgbno; 319 + __entry->len = len; 320 + ), 321 + TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x", 322 + MAJOR(__entry->dev), MINOR(__entry->dev), 323 + __entry->rgno, 324 + __entry->used, 325 + __entry->rgbno, 326 + __entry->len) 327 + ); 328 + 329 + DECLARE_EVENT_CLASS(xfs_zone_alloc_class, 330 + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, 331 + xfs_extlen_t len), 332 + TP_ARGS(oz, rgbno, len), 333 + TP_STRUCT__entry( 334 + __field(dev_t, dev) 335 + __field(xfs_rgnumber_t, rgno) 336 + __field(xfs_rgblock_t, used) 337 + __field(xfs_rgblock_t, written) 338 + __field(xfs_rgblock_t, write_pointer) 339 + __field(xfs_rgblock_t, rgbno) 340 + __field(xfs_extlen_t, len) 341 + ), 342 + TP_fast_assign( 343 + __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; 344 + __entry->rgno = rtg_rgno(oz->oz_rtg); 345 + __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; 346 + __entry->written = oz->oz_written; 347 + __entry->write_pointer = oz->oz_write_pointer; 348 + __entry->rgbno = rgbno; 349 + __entry->len = len; 350 + ), 351 + TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", 352 + MAJOR(__entry->dev), MINOR(__entry->dev), 353 + __entry->rgno, 354 + __entry->used, 355 + __entry->written, 356 + __entry->write_pointer, 357 + __entry->rgbno, 358 + __entry->len) 359 + ); 360 + 361 + #define DEFINE_ZONE_ALLOC_EVENT(name) \ 362 + DEFINE_EVENT(xfs_zone_alloc_class, name, \ 363 + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \ 364 + xfs_extlen_t len), \ 365 + TP_ARGS(oz, rgbno, len)) 366 + DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); 367 + DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); 368 + 369 + TRACE_EVENT(xfs_zone_gc_select_victim, 370 + TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), 371 + TP_ARGS(rtg, bucket), 372 + TP_STRUCT__entry( 373 + __field(dev_t, dev) 374 + __field(xfs_rgnumber_t, rgno) 375 + __field(xfs_rgblock_t, used) 376 + 
__field(unsigned int, bucket) 377 + ), 378 + TP_fast_assign( 379 + __entry->dev = rtg_mount(rtg)->m_super->s_dev; 380 + __entry->rgno = rtg_rgno(rtg); 381 + __entry->used = rtg_rmap(rtg)->i_used_blocks; 382 + __entry->bucket = bucket; 383 + ), 384 + TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u", 385 + MAJOR(__entry->dev), MINOR(__entry->dev), 386 + __entry->rgno, 387 + __entry->used, 388 + __entry->bucket) 389 + ); 390 + 391 + TRACE_EVENT(xfs_zones_mount, 392 + TP_PROTO(struct xfs_mount *mp), 393 + TP_ARGS(mp), 394 + TP_STRUCT__entry( 395 + __field(dev_t, dev) 396 + __field(xfs_rgnumber_t, rgcount) 397 + __field(uint32_t, blocks) 398 + __field(unsigned int, max_open_zones) 399 + ), 400 + TP_fast_assign( 401 + __entry->dev = mp->m_super->s_dev; 402 + __entry->rgcount = mp->m_sb.sb_rgcount; 403 + __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks; 404 + __entry->max_open_zones = mp->m_max_open_zones; 405 + ), 406 + TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u", 407 + MAJOR(__entry->dev), MINOR(__entry->dev), 408 + __entry->rgcount, 409 + __entry->blocks, 410 + __entry->max_open_zones) 411 + ); 412 + #endif /* CONFIG_XFS_RT */ 268 413 269 414 TRACE_EVENT(xfs_inodegc_worker, 270 415 TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), ··· 692 545 DEFINE_BUF_EVENT(xfs_buf_error_relse); 693 546 DEFINE_BUF_EVENT(xfs_buf_drain_buftarg); 694 547 DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); 548 + DEFINE_BUF_EVENT(xfs_buf_backing_folio); 549 + DEFINE_BUF_EVENT(xfs_buf_backing_kmem); 550 + DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc); 551 + DEFINE_BUF_EVENT(xfs_buf_backing_fallback); 695 552 696 553 /* not really buffer traces, but the buf provides useful information */ 697 554 DEFINE_BUF_EVENT(xfs_btree_corrupt); ··· 1747 1596 DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); 1748 1597 DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); 1749 1598 DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); 1599 + 
DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks); 1750 1600 1751 1601 DECLARE_EVENT_CLASS(xfs_itrunc_class, 1752 1602 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), ··· 4135 3983 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); 4136 3984 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); 4137 3985 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); 3986 + DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip); 4138 3987 4139 3988 DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); 4140 3989 DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); ··· 5759 5606 /* metadata inode space reservations */ 5760 5607 5761 5608 DECLARE_EVENT_CLASS(xfs_metafile_resv_class, 5762 - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), 5763 - TP_ARGS(ip, len), 5609 + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), 5610 + TP_ARGS(mp, len), 5764 5611 TP_STRUCT__entry( 5765 5612 __field(dev_t, dev) 5766 - __field(xfs_ino_t, ino) 5767 5613 __field(unsigned long long, freeblks) 5768 5614 __field(unsigned long long, reserved) 5769 5615 __field(unsigned long long, asked) ··· 5770 5618 __field(unsigned long long, len) 5771 5619 ), 5772 5620 TP_fast_assign( 5773 - struct xfs_mount *mp = ip->i_mount; 5774 - 5775 5621 __entry->dev = mp->m_super->s_dev; 5776 - __entry->ino = ip->i_ino; 5777 - __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks); 5778 - __entry->reserved = ip->i_delayed_blks; 5779 - __entry->asked = ip->i_meta_resv_asked; 5780 - __entry->used = ip->i_nblocks; 5622 + __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); 5623 + __entry->reserved = mp->m_metafile_resv_avail; 5624 + __entry->asked = mp->m_metafile_resv_target; 5625 + __entry->used = mp->m_metafile_resv_used; 5781 5626 __entry->len = len; 5782 5627 ), 5783 - TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu", 5628 + TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu", 5784 5629 MAJOR(__entry->dev), MINOR(__entry->dev), 5785 - __entry->ino, 
5786 5630 __entry->freeblks, 5787 5631 __entry->reserved, 5788 5632 __entry->asked, ··· 5787 5639 ) 5788 5640 #define DEFINE_METAFILE_RESV_EVENT(name) \ 5789 5641 DEFINE_EVENT(xfs_metafile_resv_class, name, \ 5790 - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \ 5791 - TP_ARGS(ip, len)) 5642 + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \ 5643 + TP_ARGS(mp, len)) 5792 5644 DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); 5793 5645 DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); 5794 5646 DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); 5795 5647 DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); 5796 5648 DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); 5797 - DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error); 5649 + DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error); 5798 5650 5799 5651 #ifdef CONFIG_XFS_RT 5800 5652 TRACE_EVENT(xfs_growfs_check_rtgeom, ··· 5816 5668 __entry->min_logfsbs) 5817 5669 ); 5818 5670 #endif /* CONFIG_XFS_RT */ 5671 + 5672 + TRACE_DEFINE_ENUM(XC_FREE_BLOCKS); 5673 + TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS); 5674 + TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE); 5675 + 5676 + DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class, 5677 + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, 5678 + uint64_t delta, unsigned long caller_ip), 5679 + TP_ARGS(mp, ctr, delta, caller_ip), 5680 + TP_STRUCT__entry( 5681 + __field(dev_t, dev) 5682 + __field(enum xfs_free_counter, ctr) 5683 + __field(uint64_t, delta) 5684 + __field(uint64_t, avail) 5685 + __field(uint64_t, total) 5686 + __field(unsigned long, caller_ip) 5687 + ), 5688 + TP_fast_assign( 5689 + __entry->dev = mp->m_super->s_dev; 5690 + __entry->ctr = ctr; 5691 + __entry->delta = delta; 5692 + __entry->avail = mp->m_free[ctr].res_avail; 5693 + __entry->total = mp->m_free[ctr].res_total; 5694 + __entry->caller_ip = caller_ip; 5695 + ), 5696 + TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS", 5697 + MAJOR(__entry->dev), 
MINOR(__entry->dev), 5698 + __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR), 5699 + __entry->delta, 5700 + __entry->avail, 5701 + __entry->total, 5702 + (char *)__entry->caller_ip) 5703 + ) 5704 + #define DEFINE_FREEBLOCKS_RESV_EVENT(name) \ 5705 + DEFINE_EVENT(xfs_freeblocks_resv_class, name, \ 5706 + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \ 5707 + uint64_t delta, unsigned long caller_ip), \ 5708 + TP_ARGS(mp, ctr, delta, caller_ip)) 5709 + DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); 5710 + DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); 5819 5711 5820 5712 #endif /* _TRACE_XFS_H */ 5821 5713
+1220
fs/xfs/xfs_zone_alloc.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2023-2025 Christoph Hellwig. 4 + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_shared.h" 8 + #include "xfs_format.h" 9 + #include "xfs_log_format.h" 10 + #include "xfs_error.h" 11 + #include "xfs_trans_resv.h" 12 + #include "xfs_mount.h" 13 + #include "xfs_inode.h" 14 + #include "xfs_iomap.h" 15 + #include "xfs_trans.h" 16 + #include "xfs_alloc.h" 17 + #include "xfs_bmap.h" 18 + #include "xfs_bmap_btree.h" 19 + #include "xfs_trans_space.h" 20 + #include "xfs_refcount.h" 21 + #include "xfs_rtbitmap.h" 22 + #include "xfs_rtrmap_btree.h" 23 + #include "xfs_zone_alloc.h" 24 + #include "xfs_zone_priv.h" 25 + #include "xfs_zones.h" 26 + #include "xfs_trace.h" 27 + 28 + void 29 + xfs_open_zone_put( 30 + struct xfs_open_zone *oz) 31 + { 32 + if (atomic_dec_and_test(&oz->oz_ref)) { 33 + xfs_rtgroup_rele(oz->oz_rtg); 34 + kfree(oz); 35 + } 36 + } 37 + 38 + static inline uint32_t 39 + xfs_zone_bucket( 40 + struct xfs_mount *mp, 41 + uint32_t used_blocks) 42 + { 43 + return XFS_ZONE_USED_BUCKETS * used_blocks / 44 + mp->m_groups[XG_TYPE_RTG].blocks; 45 + } 46 + 47 + static inline void 48 + xfs_zone_add_to_bucket( 49 + struct xfs_zone_info *zi, 50 + xfs_rgnumber_t rgno, 51 + uint32_t to_bucket) 52 + { 53 + __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]); 54 + zi->zi_used_bucket_entries[to_bucket]++; 55 + } 56 + 57 + static inline void 58 + xfs_zone_remove_from_bucket( 59 + struct xfs_zone_info *zi, 60 + xfs_rgnumber_t rgno, 61 + uint32_t from_bucket) 62 + { 63 + __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]); 64 + zi->zi_used_bucket_entries[from_bucket]--; 65 + } 66 + 67 + static void 68 + xfs_zone_account_reclaimable( 69 + struct xfs_rtgroup *rtg, 70 + uint32_t freed) 71 + { 72 + struct xfs_group *xg = &rtg->rtg_group; 73 + struct xfs_mount *mp = rtg_mount(rtg); 74 + struct xfs_zone_info *zi = mp->m_zone_info; 75 + 
uint32_t used = rtg_rmap(rtg)->i_used_blocks; 76 + xfs_rgnumber_t rgno = rtg_rgno(rtg); 77 + uint32_t from_bucket = xfs_zone_bucket(mp, used + freed); 78 + uint32_t to_bucket = xfs_zone_bucket(mp, used); 79 + bool was_full = (used + freed == rtg_blocks(rtg)); 80 + 81 + /* 82 + * This can be called from log recovery, where the zone_info structure 83 + * hasn't been allocated yet. Skip all work as xfs_mount_zones will 84 + * add the zones to the right buckets before the file systems becomes 85 + * active. 86 + */ 87 + if (!zi) 88 + return; 89 + 90 + if (!used) { 91 + /* 92 + * The zone is now empty, remove it from the bottom bucket and 93 + * trigger a reset. 94 + */ 95 + trace_xfs_zone_emptied(rtg); 96 + 97 + if (!was_full) 98 + xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); 99 + 100 + spin_lock(&zi->zi_used_buckets_lock); 101 + if (!was_full) 102 + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); 103 + spin_unlock(&zi->zi_used_buckets_lock); 104 + 105 + spin_lock(&zi->zi_reset_list_lock); 106 + xg->xg_next_reset = zi->zi_reset_list; 107 + zi->zi_reset_list = xg; 108 + spin_unlock(&zi->zi_reset_list_lock); 109 + 110 + if (zi->zi_gc_thread) 111 + wake_up_process(zi->zi_gc_thread); 112 + } else if (was_full) { 113 + /* 114 + * The zone transitioned from full, mark it up as reclaimable 115 + * and wake up GC which might be waiting for zones to reclaim. 116 + */ 117 + spin_lock(&zi->zi_used_buckets_lock); 118 + xfs_zone_add_to_bucket(zi, rgno, to_bucket); 119 + spin_unlock(&zi->zi_used_buckets_lock); 120 + 121 + xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); 122 + if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) 123 + wake_up_process(zi->zi_gc_thread); 124 + } else if (to_bucket != from_bucket) { 125 + /* 126 + * Move the zone to a new bucket if it dropped below the 127 + * threshold. 
128 + */ 129 + spin_lock(&zi->zi_used_buckets_lock); 130 + xfs_zone_add_to_bucket(zi, rgno, to_bucket); 131 + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); 132 + spin_unlock(&zi->zi_used_buckets_lock); 133 + } 134 + } 135 + 136 + static void 137 + xfs_open_zone_mark_full( 138 + struct xfs_open_zone *oz) 139 + { 140 + struct xfs_rtgroup *rtg = oz->oz_rtg; 141 + struct xfs_mount *mp = rtg_mount(rtg); 142 + struct xfs_zone_info *zi = mp->m_zone_info; 143 + uint32_t used = rtg_rmap(rtg)->i_used_blocks; 144 + 145 + trace_xfs_zone_full(rtg); 146 + 147 + WRITE_ONCE(rtg->rtg_open_zone, NULL); 148 + 149 + spin_lock(&zi->zi_open_zones_lock); 150 + if (oz->oz_is_gc) { 151 + ASSERT(current == zi->zi_gc_thread); 152 + zi->zi_open_gc_zone = NULL; 153 + } else { 154 + zi->zi_nr_open_zones--; 155 + list_del_init(&oz->oz_entry); 156 + } 157 + spin_unlock(&zi->zi_open_zones_lock); 158 + xfs_open_zone_put(oz); 159 + 160 + wake_up_all(&zi->zi_zone_wait); 161 + if (used < rtg_blocks(rtg)) 162 + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); 163 + } 164 + 165 + static void 166 + xfs_zone_record_blocks( 167 + struct xfs_trans *tp, 168 + xfs_fsblock_t fsbno, 169 + xfs_filblks_t len, 170 + struct xfs_open_zone *oz, 171 + bool used) 172 + { 173 + struct xfs_mount *mp = tp->t_mountp; 174 + struct xfs_rtgroup *rtg = oz->oz_rtg; 175 + struct xfs_inode *rmapip = rtg_rmap(rtg); 176 + 177 + trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); 178 + 179 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 180 + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); 181 + if (used) { 182 + rmapip->i_used_blocks += len; 183 + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); 184 + } else { 185 + xfs_add_frextents(mp, len); 186 + } 187 + oz->oz_written += len; 188 + if (oz->oz_written == rtg_blocks(rtg)) 189 + xfs_open_zone_mark_full(oz); 190 + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); 191 + } 192 + 193 + static int 194 + xfs_zoned_map_extent( 195 + struct xfs_trans *tp, 196 
+ struct xfs_inode *ip, 197 + struct xfs_bmbt_irec *new, 198 + struct xfs_open_zone *oz, 199 + xfs_fsblock_t old_startblock) 200 + { 201 + struct xfs_bmbt_irec data; 202 + int nmaps = 1; 203 + int error; 204 + 205 + /* Grab the corresponding mapping in the data fork. */ 206 + error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data, 207 + &nmaps, 0); 208 + if (error) 209 + return error; 210 + 211 + /* 212 + * Cap the update to the existing extent in the data fork because we can 213 + * only overwrite one extent at a time. 214 + */ 215 + ASSERT(new->br_blockcount >= data.br_blockcount); 216 + new->br_blockcount = data.br_blockcount; 217 + 218 + /* 219 + * If a data write raced with this GC write, keep the existing data in 220 + * the data fork, mark our newly written GC extent as reclaimable, then 221 + * move on to the next extent. 222 + */ 223 + if (old_startblock != NULLFSBLOCK && 224 + old_startblock != data.br_startblock) 225 + goto skip; 226 + 227 + trace_xfs_reflink_cow_remap_from(ip, new); 228 + trace_xfs_reflink_cow_remap_to(ip, &data); 229 + 230 + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, 231 + XFS_IEXT_REFLINK_END_COW_CNT); 232 + if (error) 233 + return error; 234 + 235 + if (data.br_startblock != HOLESTARTBLOCK) { 236 + ASSERT(data.br_startblock != DELAYSTARTBLOCK); 237 + ASSERT(!isnullstartblock(data.br_startblock)); 238 + 239 + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); 240 + if (xfs_is_reflink_inode(ip)) { 241 + xfs_refcount_decrease_extent(tp, true, &data); 242 + } else { 243 + error = xfs_free_extent_later(tp, data.br_startblock, 244 + data.br_blockcount, NULL, 245 + XFS_AG_RESV_NONE, 246 + XFS_FREE_EXTENT_REALTIME); 247 + if (error) 248 + return error; 249 + } 250 + } 251 + 252 + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, 253 + true); 254 + 255 + /* Map the new blocks into the data fork. 
*/ 256 + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); 257 + return 0; 258 + 259 + skip: 260 + trace_xfs_reflink_cow_remap_skip(ip, new); 261 + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, 262 + false); 263 + return 0; 264 + } 265 + 266 + int 267 + xfs_zoned_end_io( 268 + struct xfs_inode *ip, 269 + xfs_off_t offset, 270 + xfs_off_t count, 271 + xfs_daddr_t daddr, 272 + struct xfs_open_zone *oz, 273 + xfs_fsblock_t old_startblock) 274 + { 275 + struct xfs_mount *mp = ip->i_mount; 276 + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); 277 + struct xfs_bmbt_irec new = { 278 + .br_startoff = XFS_B_TO_FSBT(mp, offset), 279 + .br_startblock = xfs_daddr_to_rtb(mp, daddr), 280 + .br_state = XFS_EXT_NORM, 281 + }; 282 + unsigned int resblks = 283 + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); 284 + struct xfs_trans *tp; 285 + int error; 286 + 287 + if (xfs_is_shutdown(mp)) 288 + return -EIO; 289 + 290 + while (new.br_startoff < end_fsb) { 291 + new.br_blockcount = end_fsb - new.br_startoff; 292 + 293 + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 294 + XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp); 295 + if (error) 296 + return error; 297 + xfs_ilock(ip, XFS_ILOCK_EXCL); 298 + xfs_trans_ijoin(tp, ip, 0); 299 + 300 + error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock); 301 + if (error) 302 + xfs_trans_cancel(tp); 303 + else 304 + error = xfs_trans_commit(tp); 305 + xfs_iunlock(ip, XFS_ILOCK_EXCL); 306 + if (error) 307 + return error; 308 + 309 + new.br_startoff += new.br_blockcount; 310 + new.br_startblock += new.br_blockcount; 311 + if (old_startblock != NULLFSBLOCK) 312 + old_startblock += new.br_blockcount; 313 + } 314 + 315 + return 0; 316 + } 317 + 318 + /* 319 + * "Free" blocks allocated in a zone. 320 + * 321 + * Just decrement the used blocks counter and report the space as freed. 
322 + */ 323 + int 324 + xfs_zone_free_blocks( 325 + struct xfs_trans *tp, 326 + struct xfs_rtgroup *rtg, 327 + xfs_fsblock_t fsbno, 328 + xfs_filblks_t len) 329 + { 330 + struct xfs_mount *mp = tp->t_mountp; 331 + struct xfs_inode *rmapip = rtg_rmap(rtg); 332 + 333 + xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL); 334 + 335 + if (len > rmapip->i_used_blocks) { 336 + xfs_err(mp, 337 + "trying to free more blocks (%lld) than used counter (%u).", 338 + len, rmapip->i_used_blocks); 339 + ASSERT(len <= rmapip->i_used_blocks); 340 + xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP); 341 + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 342 + return -EFSCORRUPTED; 343 + } 344 + 345 + trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len); 346 + 347 + rmapip->i_used_blocks -= len; 348 + /* 349 + * Don't add open zones to the reclaimable buckets. The I/O completion 350 + * for writing the last block will take care of accounting for already 351 + * unused blocks instead. 352 + */ 353 + if (!READ_ONCE(rtg->rtg_open_zone)) 354 + xfs_zone_account_reclaimable(rtg, len); 355 + xfs_add_frextents(mp, len); 356 + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); 357 + return 0; 358 + } 359 + 360 + /* 361 + * Check if the zone containing the data just before the offset we are 362 + * writing to is still open and has space. 
363 + */ 364 + static struct xfs_open_zone * 365 + xfs_last_used_zone( 366 + struct iomap_ioend *ioend) 367 + { 368 + struct xfs_inode *ip = XFS_I(ioend->io_inode); 369 + struct xfs_mount *mp = ip->i_mount; 370 + xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); 371 + struct xfs_rtgroup *rtg = NULL; 372 + struct xfs_open_zone *oz = NULL; 373 + struct xfs_iext_cursor icur; 374 + struct xfs_bmbt_irec got; 375 + 376 + xfs_ilock(ip, XFS_ILOCK_SHARED); 377 + if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, 378 + &icur, &got)) { 379 + xfs_iunlock(ip, XFS_ILOCK_SHARED); 380 + return NULL; 381 + } 382 + xfs_iunlock(ip, XFS_ILOCK_SHARED); 383 + 384 + rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); 385 + if (!rtg) 386 + return NULL; 387 + 388 + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); 389 + oz = READ_ONCE(rtg->rtg_open_zone); 390 + if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) 391 + oz = NULL; 392 + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); 393 + 394 + xfs_rtgroup_rele(rtg); 395 + return oz; 396 + } 397 + 398 + static struct xfs_group * 399 + xfs_find_free_zone( 400 + struct xfs_mount *mp, 401 + unsigned long start, 402 + unsigned long end) 403 + { 404 + struct xfs_zone_info *zi = mp->m_zone_info; 405 + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start); 406 + struct xfs_group *xg; 407 + 408 + xas_lock(&xas); 409 + xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE) 410 + if (atomic_inc_not_zero(&xg->xg_active_ref)) 411 + goto found; 412 + xas_unlock(&xas); 413 + return NULL; 414 + 415 + found: 416 + xas_clear_mark(&xas, XFS_RTG_FREE); 417 + atomic_dec(&zi->zi_nr_free_zones); 418 + zi->zi_free_zone_cursor = xg->xg_gno; 419 + xas_unlock(&xas); 420 + return xg; 421 + } 422 + 423 + static struct xfs_open_zone * 424 + xfs_init_open_zone( 425 + struct xfs_rtgroup *rtg, 426 + xfs_rgblock_t write_pointer, 427 + enum rw_hint write_hint, 428 + bool is_gc) 429 + { 430 + struct xfs_open_zone *oz; 431 + 432 + oz = 
kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL); 433 + spin_lock_init(&oz->oz_alloc_lock); 434 + atomic_set(&oz->oz_ref, 1); 435 + oz->oz_rtg = rtg; 436 + oz->oz_write_pointer = write_pointer; 437 + oz->oz_written = write_pointer; 438 + oz->oz_write_hint = write_hint; 439 + oz->oz_is_gc = is_gc; 440 + 441 + /* 442 + * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap 443 + * inode, but we don't really want to take that here because we are 444 + * under the zone_list_lock. Ensure the pointer is only set for a fully 445 + * initialized open zone structure so that a racy lookup finding it is 446 + * fine. 447 + */ 448 + WRITE_ONCE(rtg->rtg_open_zone, oz); 449 + return oz; 450 + } 451 + 452 + /* 453 + * Find a completely free zone, open it, and return a reference. 454 + */ 455 + struct xfs_open_zone * 456 + xfs_open_zone( 457 + struct xfs_mount *mp, 458 + enum rw_hint write_hint, 459 + bool is_gc) 460 + { 461 + struct xfs_zone_info *zi = mp->m_zone_info; 462 + struct xfs_group *xg; 463 + 464 + xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX); 465 + if (!xg) 466 + xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor); 467 + if (!xg) 468 + return NULL; 469 + 470 + set_current_state(TASK_RUNNING); 471 + return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc); 472 + } 473 + 474 + static struct xfs_open_zone * 475 + xfs_try_open_zone( 476 + struct xfs_mount *mp, 477 + enum rw_hint write_hint) 478 + { 479 + struct xfs_zone_info *zi = mp->m_zone_info; 480 + struct xfs_open_zone *oz; 481 + 482 + if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES) 483 + return NULL; 484 + if (atomic_read(&zi->zi_nr_free_zones) < 485 + XFS_GC_ZONES - XFS_OPEN_GC_ZONES) 486 + return NULL; 487 + 488 + /* 489 + * Increment the open zone count to reserve our slot before dropping 490 + * zi_open_zones_lock. 
491 + */ 492 + zi->zi_nr_open_zones++; 493 + spin_unlock(&zi->zi_open_zones_lock); 494 + oz = xfs_open_zone(mp, write_hint, false); 495 + spin_lock(&zi->zi_open_zones_lock); 496 + if (!oz) { 497 + zi->zi_nr_open_zones--; 498 + return NULL; 499 + } 500 + 501 + atomic_inc(&oz->oz_ref); 502 + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); 503 + 504 + /* 505 + * If this was the last free zone, other waiters might be waiting 506 + * on us to write to it as well. 507 + */ 508 + wake_up_all(&zi->zi_zone_wait); 509 + 510 + if (xfs_zoned_need_gc(mp)) 511 + wake_up_process(zi->zi_gc_thread); 512 + 513 + trace_xfs_zone_opened(oz->oz_rtg); 514 + return oz; 515 + } 516 + 517 + /* 518 + * For data with short or medium lifetime, try to colocated it into an 519 + * already open zone with a matching temperature. 520 + */ 521 + static bool 522 + xfs_colocate_eagerly( 523 + enum rw_hint file_hint) 524 + { 525 + switch (file_hint) { 526 + case WRITE_LIFE_MEDIUM: 527 + case WRITE_LIFE_SHORT: 528 + case WRITE_LIFE_NONE: 529 + return true; 530 + default: 531 + return false; 532 + } 533 + } 534 + 535 + static bool 536 + xfs_good_hint_match( 537 + struct xfs_open_zone *oz, 538 + enum rw_hint file_hint) 539 + { 540 + switch (oz->oz_write_hint) { 541 + case WRITE_LIFE_LONG: 542 + case WRITE_LIFE_EXTREME: 543 + /* colocate long and extreme */ 544 + if (file_hint == WRITE_LIFE_LONG || 545 + file_hint == WRITE_LIFE_EXTREME) 546 + return true; 547 + break; 548 + case WRITE_LIFE_MEDIUM: 549 + /* colocate medium with medium */ 550 + if (file_hint == WRITE_LIFE_MEDIUM) 551 + return true; 552 + break; 553 + case WRITE_LIFE_SHORT: 554 + case WRITE_LIFE_NONE: 555 + case WRITE_LIFE_NOT_SET: 556 + /* colocate short and none */ 557 + if (file_hint <= WRITE_LIFE_SHORT) 558 + return true; 559 + break; 560 + } 561 + return false; 562 + } 563 + 564 + static bool 565 + xfs_try_use_zone( 566 + struct xfs_zone_info *zi, 567 + enum rw_hint file_hint, 568 + struct xfs_open_zone *oz, 569 + bool lowspace) 570 + 
{ 571 + if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) 572 + return false; 573 + if (!lowspace && !xfs_good_hint_match(oz, file_hint)) 574 + return false; 575 + if (!atomic_inc_not_zero(&oz->oz_ref)) 576 + return false; 577 + 578 + /* 579 + * If we have a hint set for the data, use that for the zone even if 580 + * some data was written already without any hint set, but don't change 581 + * the temperature after that as that would make little sense without 582 + * tracking per-temperature class written block counts, which is 583 + * probably overkill anyway. 584 + */ 585 + if (file_hint != WRITE_LIFE_NOT_SET && 586 + oz->oz_write_hint == WRITE_LIFE_NOT_SET) 587 + oz->oz_write_hint = file_hint; 588 + 589 + /* 590 + * If we couldn't match by inode or life time we just pick the first 591 + * zone with enough space above. For that we want the least busy zone 592 + * for some definition of "least" busy. For now this simple LRU 593 + * algorithm that rotates every zone to the end of the list will do it, 594 + * even if it isn't exactly cache friendly. 
595 + */ 596 + if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones)) 597 + list_move_tail(&oz->oz_entry, &zi->zi_open_zones); 598 + return true; 599 + } 600 + 601 + static struct xfs_open_zone * 602 + xfs_select_open_zone_lru( 603 + struct xfs_zone_info *zi, 604 + enum rw_hint file_hint, 605 + bool lowspace) 606 + { 607 + struct xfs_open_zone *oz; 608 + 609 + lockdep_assert_held(&zi->zi_open_zones_lock); 610 + 611 + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) 612 + if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) 613 + return oz; 614 + 615 + cond_resched_lock(&zi->zi_open_zones_lock); 616 + return NULL; 617 + } 618 + 619 + static struct xfs_open_zone * 620 + xfs_select_open_zone_mru( 621 + struct xfs_zone_info *zi, 622 + enum rw_hint file_hint) 623 + { 624 + struct xfs_open_zone *oz; 625 + 626 + lockdep_assert_held(&zi->zi_open_zones_lock); 627 + 628 + list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) 629 + if (xfs_try_use_zone(zi, file_hint, oz, false)) 630 + return oz; 631 + 632 + cond_resched_lock(&zi->zi_open_zones_lock); 633 + return NULL; 634 + } 635 + 636 + static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) 637 + { 638 + if (xfs_has_nolifetime(ip->i_mount)) 639 + return WRITE_LIFE_NOT_SET; 640 + return VFS_I(ip)->i_write_hint; 641 + } 642 + 643 + /* 644 + * Try to pack inodes that are written back after they were closed tight instead 645 + * of trying to open new zones for them or spread them to the least recently 646 + * used zone. This optimizes the data layout for workloads that untar or copy 647 + * a lot of small files. Right now this does not separate multiple such 648 + * streams. 649 + */ 650 + static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) 651 + { 652 + return !inode_is_open_for_write(VFS_I(ip)) && 653 + !(ip->i_diflags & XFS_DIFLAG_APPEND); 654 + } 655 + 656 + /* 657 + * Pick a new zone for writes. 
658 + * 659 + * If we aren't using up our budget of open zones just open a new one from the 660 + * freelist. Else try to find one that matches the expected data lifetime. If 661 + * we don't find one that is good pick any zone that is available. 662 + */ 663 + static struct xfs_open_zone * 664 + xfs_select_zone_nowait( 665 + struct xfs_mount *mp, 666 + enum rw_hint write_hint, 667 + bool pack_tight) 668 + { 669 + struct xfs_zone_info *zi = mp->m_zone_info; 670 + struct xfs_open_zone *oz = NULL; 671 + 672 + if (xfs_is_shutdown(mp)) 673 + return NULL; 674 + 675 + /* 676 + * Try to fill up open zones with matching temperature if available. It 677 + * is better to try to co-locate data when this is favorable, so we can 678 + * activate empty zones when it is statistically better to separate 679 + * data. 680 + */ 681 + spin_lock(&zi->zi_open_zones_lock); 682 + if (xfs_colocate_eagerly(write_hint)) 683 + oz = xfs_select_open_zone_lru(zi, write_hint, false); 684 + else if (pack_tight) 685 + oz = xfs_select_open_zone_mru(zi, write_hint); 686 + if (oz) 687 + goto out_unlock; 688 + 689 + /* 690 + * See if we can open a new zone and use that. 691 + */ 692 + oz = xfs_try_open_zone(mp, write_hint); 693 + if (oz) 694 + goto out_unlock; 695 + 696 + /* 697 + * Try to colocate cold data with other cold data if we failed to open a 698 + * new zone for it. 
699 + */ 700 + if (write_hint != WRITE_LIFE_NOT_SET && 701 + !xfs_colocate_eagerly(write_hint)) 702 + oz = xfs_select_open_zone_lru(zi, write_hint, false); 703 + if (!oz) 704 + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); 705 + if (!oz) 706 + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); 707 + out_unlock: 708 + spin_unlock(&zi->zi_open_zones_lock); 709 + return oz; 710 + } 711 + 712 + static struct xfs_open_zone * 713 + xfs_select_zone( 714 + struct xfs_mount *mp, 715 + enum rw_hint write_hint, 716 + bool pack_tight) 717 + { 718 + struct xfs_zone_info *zi = mp->m_zone_info; 719 + DEFINE_WAIT (wait); 720 + struct xfs_open_zone *oz; 721 + 722 + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); 723 + if (oz) 724 + return oz; 725 + 726 + for (;;) { 727 + prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); 728 + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); 729 + if (oz) 730 + break; 731 + schedule(); 732 + } 733 + finish_wait(&zi->zi_zone_wait, &wait); 734 + return oz; 735 + } 736 + 737 + static unsigned int 738 + xfs_zone_alloc_blocks( 739 + struct xfs_open_zone *oz, 740 + xfs_filblks_t count_fsb, 741 + sector_t *sector, 742 + bool *is_seq) 743 + { 744 + struct xfs_rtgroup *rtg = oz->oz_rtg; 745 + struct xfs_mount *mp = rtg_mount(rtg); 746 + xfs_rgblock_t rgbno; 747 + 748 + spin_lock(&oz->oz_alloc_lock); 749 + count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, 750 + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); 751 + if (!count_fsb) { 752 + spin_unlock(&oz->oz_alloc_lock); 753 + return 0; 754 + } 755 + rgbno = oz->oz_write_pointer; 756 + oz->oz_write_pointer += count_fsb; 757 + spin_unlock(&oz->oz_alloc_lock); 758 + 759 + trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); 760 + 761 + *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); 762 + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); 763 + if (!*is_seq) 764 + *sector += XFS_FSB_TO_BB(mp, rgbno); 765 + return XFS_FSB_TO_B(mp, 
count_fsb); 766 + } 767 + 768 + void 769 + xfs_mark_rtg_boundary( 770 + struct iomap_ioend *ioend) 771 + { 772 + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; 773 + sector_t sector = ioend->io_bio.bi_iter.bi_sector; 774 + 775 + if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0) 776 + ioend->io_flags |= IOMAP_IOEND_BOUNDARY; 777 + } 778 + 779 + static void 780 + xfs_submit_zoned_bio( 781 + struct iomap_ioend *ioend, 782 + struct xfs_open_zone *oz, 783 + bool is_seq) 784 + { 785 + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; 786 + ioend->io_private = oz; 787 + atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ 788 + 789 + if (is_seq) { 790 + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; 791 + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; 792 + } else { 793 + xfs_mark_rtg_boundary(ioend); 794 + } 795 + 796 + submit_bio(&ioend->io_bio); 797 + } 798 + 799 + void 800 + xfs_zone_alloc_and_submit( 801 + struct iomap_ioend *ioend, 802 + struct xfs_open_zone **oz) 803 + { 804 + struct xfs_inode *ip = XFS_I(ioend->io_inode); 805 + struct xfs_mount *mp = ip->i_mount; 806 + enum rw_hint write_hint = xfs_inode_write_hint(ip); 807 + bool pack_tight = xfs_zoned_pack_tight(ip); 808 + unsigned int alloc_len; 809 + struct iomap_ioend *split; 810 + bool is_seq; 811 + 812 + if (xfs_is_shutdown(mp)) 813 + goto out_error; 814 + 815 + /* 816 + * If we don't have a cached zone in this write context, see if the 817 + * last extent before the one we are writing to points to an active 818 + * zone. If so, just continue writing to it. 
819 + */ 820 + if (!*oz && ioend->io_offset) 821 + *oz = xfs_last_used_zone(ioend); 822 + if (!*oz) { 823 + select_zone: 824 + *oz = xfs_select_zone(mp, write_hint, pack_tight); 825 + if (!*oz) 826 + goto out_error; 827 + } 828 + 829 + alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), 830 + &ioend->io_sector, &is_seq); 831 + if (!alloc_len) { 832 + xfs_open_zone_put(*oz); 833 + goto select_zone; 834 + } 835 + 836 + while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) { 837 + if (IS_ERR(split)) 838 + goto out_split_error; 839 + alloc_len -= split->io_bio.bi_iter.bi_size; 840 + xfs_submit_zoned_bio(split, *oz, is_seq); 841 + if (!alloc_len) { 842 + xfs_open_zone_put(*oz); 843 + goto select_zone; 844 + } 845 + } 846 + 847 + xfs_submit_zoned_bio(ioend, *oz, is_seq); 848 + return; 849 + 850 + out_split_error: 851 + ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split)); 852 + out_error: 853 + bio_io_error(&ioend->io_bio); 854 + } 855 + 856 + /* 857 + * Wake up all threads waiting for a zoned space allocation when the file system 858 + * is shut down. 859 + */ 860 + void 861 + xfs_zoned_wake_all( 862 + struct xfs_mount *mp) 863 + { 864 + /* 865 + * Don't wake up if there is no m_zone_info. This is complicated by the 866 + * fact that unmount can't atomically clear m_zone_info and thus we need 867 + * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE 868 + * during log recovery so we can't entirely rely on that either. 869 + */ 870 + if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info) 871 + wake_up_all(&mp->m_zone_info->zi_zone_wait); 872 + } 873 + 874 + /* 875 + * Check if @rgbno in @rgb is a potentially valid block. It might still be 876 + * unused, but that information is only found in the rmap. 
877 + */ 878 + bool 879 + xfs_zone_rgbno_is_valid( 880 + struct xfs_rtgroup *rtg, 881 + xfs_rgnumber_t rgbno) 882 + { 883 + lockdep_assert_held(&rtg_rmap(rtg)->i_lock); 884 + 885 + if (rtg->rtg_open_zone) 886 + return rgbno < rtg->rtg_open_zone->oz_write_pointer; 887 + return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, 888 + rtg_rgno(rtg), XFS_RTG_FREE); 889 + } 890 + 891 + static void 892 + xfs_free_open_zones( 893 + struct xfs_zone_info *zi) 894 + { 895 + struct xfs_open_zone *oz; 896 + 897 + spin_lock(&zi->zi_open_zones_lock); 898 + while ((oz = list_first_entry_or_null(&zi->zi_open_zones, 899 + struct xfs_open_zone, oz_entry))) { 900 + list_del(&oz->oz_entry); 901 + xfs_open_zone_put(oz); 902 + } 903 + spin_unlock(&zi->zi_open_zones_lock); 904 + } 905 + 906 + struct xfs_init_zones { 907 + struct xfs_mount *mp; 908 + uint64_t available; 909 + uint64_t reclaimable; 910 + }; 911 + 912 + static int 913 + xfs_init_zone( 914 + struct xfs_init_zones *iz, 915 + struct xfs_rtgroup *rtg, 916 + struct blk_zone *zone) 917 + { 918 + struct xfs_mount *mp = rtg_mount(rtg); 919 + struct xfs_zone_info *zi = mp->m_zone_info; 920 + uint64_t used = rtg_rmap(rtg)->i_used_blocks; 921 + xfs_rgblock_t write_pointer, highest_rgbno; 922 + int error; 923 + 924 + if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) 925 + return -EFSCORRUPTED; 926 + 927 + /* 928 + * For sequential write required zones we retrieved the hardware write 929 + * pointer above. 930 + * 931 + * For conventional zones or conventional devices we don't have that 932 + * luxury. Instead query the rmap to find the highest recorded block 933 + * and set the write pointer to the block after that. In case of a 934 + * power loss this misses blocks where the data I/O has completed but 935 + * not recorded in the rmap yet, and it also rewrites blocks if the most 936 + * recently written ones got deleted again before unmount, but this is 937 + * the best we can do without hardware support. 
938 + */ 939 + if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) { 940 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 941 + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); 942 + if (highest_rgbno == NULLRGBLOCK) 943 + write_pointer = 0; 944 + else 945 + write_pointer = highest_rgbno + 1; 946 + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 947 + } 948 + 949 + /* 950 + * If there are no used blocks, but the zone is not in empty state yet 951 + * we lost power before the zoned reset. In that case finish the work 952 + * here. 953 + */ 954 + if (write_pointer == rtg_blocks(rtg) && used == 0) { 955 + error = xfs_zone_gc_reset_sync(rtg); 956 + if (error) 957 + return error; 958 + write_pointer = 0; 959 + } 960 + 961 + if (write_pointer == 0) { 962 + /* zone is empty */ 963 + atomic_inc(&zi->zi_nr_free_zones); 964 + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); 965 + iz->available += rtg_blocks(rtg); 966 + } else if (write_pointer < rtg_blocks(rtg)) { 967 + /* zone is open */ 968 + struct xfs_open_zone *oz; 969 + 970 + atomic_inc(&rtg_group(rtg)->xg_active_ref); 971 + oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET, 972 + false); 973 + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); 974 + zi->zi_nr_open_zones++; 975 + 976 + iz->available += (rtg_blocks(rtg) - write_pointer); 977 + iz->reclaimable += write_pointer - used; 978 + } else if (used < rtg_blocks(rtg)) { 979 + /* zone fully written, but has freed blocks */ 980 + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); 981 + iz->reclaimable += (rtg_blocks(rtg) - used); 982 + } 983 + 984 + return 0; 985 + } 986 + 987 + static int 988 + xfs_get_zone_info_cb( 989 + struct blk_zone *zone, 990 + unsigned int idx, 991 + void *data) 992 + { 993 + struct xfs_init_zones *iz = data; 994 + struct xfs_mount *mp = iz->mp; 995 + xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start); 996 + xfs_rgnumber_t rgno; 997 + struct xfs_rtgroup *rtg; 998 + int error; 999 + 1000 + if (xfs_rtb_to_rgbno(mp, zsbno) != 0) { 
1001 + xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno); 1002 + return -EFSCORRUPTED; 1003 + } 1004 + 1005 + rgno = xfs_rtb_to_rgno(mp, zsbno); 1006 + rtg = xfs_rtgroup_grab(mp, rgno); 1007 + if (!rtg) { 1008 + xfs_warn(mp, "realtime group not found for zone %u.", rgno); 1009 + return -EFSCORRUPTED; 1010 + } 1011 + error = xfs_init_zone(iz, rtg, zone); 1012 + xfs_rtgroup_rele(rtg); 1013 + return error; 1014 + } 1015 + 1016 + /* 1017 + * Calculate the max open zone limit based on the of number of 1018 + * backing zones available 1019 + */ 1020 + static inline uint32_t 1021 + xfs_max_open_zones( 1022 + struct xfs_mount *mp) 1023 + { 1024 + unsigned int max_open, max_open_data_zones; 1025 + /* 1026 + * We need two zones for every open data zone, 1027 + * one in reserve as we don't reclaim open zones. One data zone 1028 + * and its spare is included in XFS_MIN_ZONES. 1029 + */ 1030 + max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; 1031 + max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; 1032 + 1033 + /* 1034 + * Cap the max open limit to 1/4 of available space 1035 + */ 1036 + max_open = min(max_open, mp->m_sb.sb_rgcount / 4); 1037 + 1038 + return max(XFS_MIN_OPEN_ZONES, max_open); 1039 + } 1040 + 1041 + /* 1042 + * Normally we use the open zone limit that the device reports. If there is 1043 + * none let the user pick one from the command line. 1044 + * 1045 + * If the device doesn't report an open zone limit and there is no override, 1046 + * allow to hold about a quarter of the zones open. In theory we could allow 1047 + * all to be open, but at that point we run into GC deadlocks because we can't 1048 + * reclaim open zones. 1049 + * 1050 + * When used on conventional SSDs a lower open limit is advisable as we'll 1051 + * otherwise overwhelm the FTL just as much as a conventional block allocator. 1052 + * 1053 + * Note: To debug the open zone management code, force max_open to 1 here. 
1054 + */ 1055 + static int 1056 + xfs_calc_open_zones( 1057 + struct xfs_mount *mp) 1058 + { 1059 + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; 1060 + unsigned int bdev_open_zones = bdev_max_open_zones(bdev); 1061 + 1062 + if (!mp->m_max_open_zones) { 1063 + if (bdev_open_zones) 1064 + mp->m_max_open_zones = bdev_open_zones; 1065 + else 1066 + mp->m_max_open_zones = xfs_max_open_zones(mp); 1067 + } 1068 + 1069 + if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { 1070 + xfs_notice(mp, "need at least %u open zones.", 1071 + XFS_MIN_OPEN_ZONES); 1072 + return -EIO; 1073 + } 1074 + 1075 + if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { 1076 + mp->m_max_open_zones = bdev_open_zones; 1077 + xfs_info(mp, "limiting open zones to %u due to hardware limit.\n", 1078 + bdev_open_zones); 1079 + } 1080 + 1081 + if (mp->m_max_open_zones > xfs_max_open_zones(mp)) { 1082 + mp->m_max_open_zones = xfs_max_open_zones(mp); 1083 + xfs_info(mp, 1084 + "limiting open zones to %u due to total zone count (%u)", 1085 + mp->m_max_open_zones, mp->m_sb.sb_rgcount); 1086 + } 1087 + 1088 + return 0; 1089 + } 1090 + 1091 + static unsigned long * 1092 + xfs_alloc_bucket_bitmap( 1093 + struct xfs_mount *mp) 1094 + { 1095 + return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount), 1096 + sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO); 1097 + } 1098 + 1099 + static struct xfs_zone_info * 1100 + xfs_alloc_zone_info( 1101 + struct xfs_mount *mp) 1102 + { 1103 + struct xfs_zone_info *zi; 1104 + int i; 1105 + 1106 + zi = kzalloc(sizeof(*zi), GFP_KERNEL); 1107 + if (!zi) 1108 + return NULL; 1109 + INIT_LIST_HEAD(&zi->zi_open_zones); 1110 + INIT_LIST_HEAD(&zi->zi_reclaim_reservations); 1111 + spin_lock_init(&zi->zi_reset_list_lock); 1112 + spin_lock_init(&zi->zi_open_zones_lock); 1113 + spin_lock_init(&zi->zi_reservation_lock); 1114 + init_waitqueue_head(&zi->zi_zone_wait); 1115 + spin_lock_init(&zi->zi_used_buckets_lock); 1116 + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { 
1117 + zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp); 1118 + if (!zi->zi_used_bucket_bitmap[i]) 1119 + goto out_free_bitmaps; 1120 + } 1121 + return zi; 1122 + 1123 + out_free_bitmaps: 1124 + while (--i > 0) 1125 + kvfree(zi->zi_used_bucket_bitmap[i]); 1126 + kfree(zi); 1127 + return NULL; 1128 + } 1129 + 1130 + static void 1131 + xfs_free_zone_info( 1132 + struct xfs_zone_info *zi) 1133 + { 1134 + int i; 1135 + 1136 + xfs_free_open_zones(zi); 1137 + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) 1138 + kvfree(zi->zi_used_bucket_bitmap[i]); 1139 + kfree(zi); 1140 + } 1141 + 1142 + int 1143 + xfs_mount_zones( 1144 + struct xfs_mount *mp) 1145 + { 1146 + struct xfs_init_zones iz = { 1147 + .mp = mp, 1148 + }; 1149 + struct xfs_buftarg *bt = mp->m_rtdev_targp; 1150 + int error; 1151 + 1152 + if (!bt) { 1153 + xfs_notice(mp, "RT device missing."); 1154 + return -EINVAL; 1155 + } 1156 + 1157 + if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) { 1158 + xfs_notice(mp, "invalid flag combination."); 1159 + return -EFSCORRUPTED; 1160 + } 1161 + if (mp->m_sb.sb_rextsize != 1) { 1162 + xfs_notice(mp, "zoned file systems do not support rextsize."); 1163 + return -EFSCORRUPTED; 1164 + } 1165 + if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) { 1166 + xfs_notice(mp, 1167 + "zoned file systems need to have at least %u zones.", XFS_MIN_ZONES); 1168 + return -EFSCORRUPTED; 1169 + } 1170 + 1171 + error = xfs_calc_open_zones(mp); 1172 + if (error) 1173 + return error; 1174 + 1175 + mp->m_zone_info = xfs_alloc_zone_info(mp); 1176 + if (!mp->m_zone_info) 1177 + return -ENOMEM; 1178 + 1179 + xfs_info(mp, "%u zones of %u blocks size (%u max open)", 1180 + mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, 1181 + mp->m_max_open_zones); 1182 + trace_xfs_zones_mount(mp); 1183 + 1184 + if (bdev_is_zoned(bt->bt_bdev)) { 1185 + error = blkdev_report_zones(bt->bt_bdev, 1186 + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), 1187 + mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); 1188 + if (error < 
0) 1189 + goto out_free_zone_info; 1190 + } else { 1191 + struct xfs_rtgroup *rtg = NULL; 1192 + 1193 + while ((rtg = xfs_rtgroup_next(mp, rtg))) { 1194 + error = xfs_init_zone(&iz, rtg, NULL); 1195 + if (error) 1196 + goto out_free_zone_info; 1197 + } 1198 + } 1199 + 1200 + xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available); 1201 + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, 1202 + iz.available + iz.reclaimable); 1203 + 1204 + error = xfs_zone_gc_mount(mp); 1205 + if (error) 1206 + goto out_free_zone_info; 1207 + return 0; 1208 + 1209 + out_free_zone_info: 1210 + xfs_free_zone_info(mp->m_zone_info); 1211 + return error; 1212 + } 1213 + 1214 + void 1215 + xfs_unmount_zones( 1216 + struct xfs_mount *mp) 1217 + { 1218 + xfs_zone_gc_unmount(mp); 1219 + xfs_free_zone_info(mp->m_zone_info); 1220 + }
+70
fs/xfs/xfs_zone_alloc.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _XFS_ZONE_ALLOC_H 3 + #define _XFS_ZONE_ALLOC_H 4 + 5 + struct iomap_ioend; 6 + struct xfs_open_zone; 7 + 8 + struct xfs_zone_alloc_ctx { 9 + struct xfs_open_zone *open_zone; 10 + xfs_filblks_t reserved_blocks; 11 + }; 12 + 13 + /* 14 + * Grab any available space, even if it is less than what the caller asked for. 15 + */ 16 + #define XFS_ZR_GREEDY (1U << 0) 17 + /* 18 + * Only grab instantly available space, don't wait or GC. 19 + */ 20 + #define XFS_ZR_NOWAIT (1U << 1) 21 + /* 22 + * Dip into the reserved pool. 23 + */ 24 + #define XFS_ZR_RESERVED (1U << 2) 25 + 26 + int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb, 27 + unsigned int flags, struct xfs_zone_alloc_ctx *ac); 28 + void xfs_zoned_space_unreserve(struct xfs_inode *ip, 29 + struct xfs_zone_alloc_ctx *ac); 30 + void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb); 31 + 32 + void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend, 33 + struct xfs_open_zone **oz); 34 + int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, 35 + xfs_fsblock_t fsbno, xfs_filblks_t len); 36 + int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, 37 + xfs_daddr_t daddr, struct xfs_open_zone *oz, 38 + xfs_fsblock_t old_startblock); 39 + void xfs_open_zone_put(struct xfs_open_zone *oz); 40 + 41 + void xfs_zoned_wake_all(struct xfs_mount *mp); 42 + bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno); 43 + void xfs_mark_rtg_boundary(struct iomap_ioend *ioend); 44 + 45 + uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, 46 + enum xfs_free_counter ctr); 47 + void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp); 48 + 49 + #ifdef CONFIG_XFS_RT 50 + int xfs_mount_zones(struct xfs_mount *mp); 51 + void xfs_unmount_zones(struct xfs_mount *mp); 52 + void xfs_zone_gc_start(struct xfs_mount *mp); 53 + void xfs_zone_gc_stop(struct xfs_mount *mp); 54 
+ #else 55 + static inline int xfs_mount_zones(struct xfs_mount *mp) 56 + { 57 + return -EIO; 58 + } 59 + static inline void xfs_unmount_zones(struct xfs_mount *mp) 60 + { 61 + } 62 + static inline void xfs_zone_gc_start(struct xfs_mount *mp) 63 + { 64 + } 65 + static inline void xfs_zone_gc_stop(struct xfs_mount *mp) 66 + { 67 + } 68 + #endif /* CONFIG_XFS_RT */ 69 + 70 + #endif /* _XFS_ZONE_ALLOC_H */
+1165
fs/xfs/xfs_zone_gc.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2023-2025 Christoph Hellwig. 4 + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_shared.h" 8 + #include "xfs_format.h" 9 + #include "xfs_log_format.h" 10 + #include "xfs_trans_resv.h" 11 + #include "xfs_mount.h" 12 + #include "xfs_inode.h" 13 + #include "xfs_btree.h" 14 + #include "xfs_trans.h" 15 + #include "xfs_icache.h" 16 + #include "xfs_rmap.h" 17 + #include "xfs_rtbitmap.h" 18 + #include "xfs_rtrmap_btree.h" 19 + #include "xfs_zone_alloc.h" 20 + #include "xfs_zone_priv.h" 21 + #include "xfs_zones.h" 22 + #include "xfs_trace.h" 23 + 24 + /* 25 + * Implement Garbage Collection (GC) of partially used zoned. 26 + * 27 + * To support the purely sequential writes in each zone, zoned XFS needs to be 28 + * able to move data remaining in a zone out of it to reset the zone to prepare 29 + * for writing to it again. 30 + * 31 + * This is done by the GC thread implemented in this file. To support that a 32 + * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to 33 + * write the garbage collected data into. 34 + * 35 + * Whenever the available space is below the chosen threshold, the GC thread 36 + * looks for potential non-empty but not fully used zones that are worth 37 + * reclaiming. Once found the rmap for the victim zone is queried, and after 38 + * a bit of sorting to reduce fragmentation, the still live extents are read 39 + * into memory and written to the GC target zone, and the bmap btree of the 40 + * files is updated to point to the new location. To avoid taking the IOLOCK 41 + * and MMAPLOCK for the entire GC process and thus affecting the latency of 42 + * user reads and writes to the files, the GC writes are speculative and the 43 + * I/O completion checks that no other writes happened for the affected regions 44 + * before remapping. 
45 + * 46 + * Once a zone does not contain any valid data, be that through GC or user 47 + * block removal, it is queued for for a zone reset. The reset operation 48 + * carefully ensures that the RT device cache is flushed and all transactions 49 + * referencing the rmap have been committed to disk. 50 + */ 51 + 52 + /* 53 + * Size of each GC scratch pad. This is also the upper bound for each 54 + * GC I/O, which helps to keep latency down. 55 + */ 56 + #define XFS_GC_CHUNK_SIZE SZ_1M 57 + 58 + /* 59 + * Scratchpad data to read GCed data into. 60 + * 61 + * The offset member tracks where the next allocation starts, and freed tracks 62 + * the amount of space that is not used anymore. 63 + */ 64 + #define XFS_ZONE_GC_NR_SCRATCH 2 65 + struct xfs_zone_scratch { 66 + struct folio *folio; 67 + unsigned int offset; 68 + unsigned int freed; 69 + }; 70 + 71 + /* 72 + * Chunk that is read and written for each GC operation. 73 + * 74 + * Note that for writes to actual zoned devices, the chunk can be split when 75 + * reaching the hardware limit. 76 + */ 77 + struct xfs_gc_bio { 78 + struct xfs_zone_gc_data *data; 79 + 80 + /* 81 + * Entry into the reading/writing/resetting list. Only accessed from 82 + * the GC thread, so no locking needed. 83 + */ 84 + struct list_head entry; 85 + 86 + /* 87 + * State of this gc_bio. Done means the current I/O completed. 88 + * Set from the bio end I/O handler, read from the GC thread. 89 + */ 90 + enum { 91 + XFS_GC_BIO_NEW, 92 + XFS_GC_BIO_DONE, 93 + } state; 94 + 95 + /* 96 + * Pointer to the inode and byte range in the inode that this 97 + * GC chunk is operating on. 98 + */ 99 + struct xfs_inode *ip; 100 + loff_t offset; 101 + unsigned int len; 102 + 103 + /* 104 + * Existing startblock (in the zone to be freed) and newly assigned 105 + * daddr in the zone GCed into. 
106 + */ 107 + xfs_fsblock_t old_startblock; 108 + xfs_daddr_t new_daddr; 109 + struct xfs_zone_scratch *scratch; 110 + 111 + /* Are we writing to a sequential write required zone? */ 112 + bool is_seq; 113 + 114 + /* Open Zone being written to */ 115 + struct xfs_open_zone *oz; 116 + 117 + /* Bio used for reads and writes, including the bvec used by it */ 118 + struct bio_vec bv; 119 + struct bio bio; /* must be last */ 120 + }; 121 + 122 + #define XFS_ZONE_GC_RECS 1024 123 + 124 + /* iterator, needs to be reinitialized for each victim zone */ 125 + struct xfs_zone_gc_iter { 126 + struct xfs_rtgroup *victim_rtg; 127 + unsigned int rec_count; 128 + unsigned int rec_idx; 129 + xfs_agblock_t next_startblock; 130 + struct xfs_rmap_irec *recs; 131 + }; 132 + 133 + /* 134 + * Per-mount GC state. 135 + */ 136 + struct xfs_zone_gc_data { 137 + struct xfs_mount *mp; 138 + 139 + /* bioset used to allocate the gc_bios */ 140 + struct bio_set bio_set; 141 + 142 + /* 143 + * Scratchpad used, and index to indicated which one is used. 144 + */ 145 + struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH]; 146 + unsigned int scratch_idx; 147 + 148 + /* 149 + * List of bios currently being read, written and reset. 150 + * These lists are only accessed by the GC thread itself, and must only 151 + * be processed in order. 152 + */ 153 + struct list_head reading; 154 + struct list_head writing; 155 + struct list_head resetting; 156 + 157 + /* 158 + * Iterator for the victim zone. 159 + */ 160 + struct xfs_zone_gc_iter iter; 161 + }; 162 + 163 + /* 164 + * We aim to keep enough zones free in stock to fully use the open zone limit 165 + * for data placement purposes. 
166 + */ 167 + bool 168 + xfs_zoned_need_gc( 169 + struct xfs_mount *mp) 170 + { 171 + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) 172 + return false; 173 + if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < 174 + mp->m_groups[XG_TYPE_RTG].blocks * 175 + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) 176 + return true; 177 + return false; 178 + } 179 + 180 + static struct xfs_zone_gc_data * 181 + xfs_zone_gc_data_alloc( 182 + struct xfs_mount *mp) 183 + { 184 + struct xfs_zone_gc_data *data; 185 + int i; 186 + 187 + data = kzalloc(sizeof(*data), GFP_KERNEL); 188 + if (!data) 189 + return NULL; 190 + data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs), 191 + GFP_KERNEL); 192 + if (!data->iter.recs) 193 + goto out_free_data; 194 + 195 + /* 196 + * We actually only need a single bio_vec. It would be nice to have 197 + * a flag that only allocates the inline bvecs and not the separate 198 + * bvec pool. 199 + */ 200 + if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), 201 + BIOSET_NEED_BVECS)) 202 + goto out_free_recs; 203 + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) { 204 + data->scratch[i].folio = 205 + folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE)); 206 + if (!data->scratch[i].folio) 207 + goto out_free_scratch; 208 + } 209 + INIT_LIST_HEAD(&data->reading); 210 + INIT_LIST_HEAD(&data->writing); 211 + INIT_LIST_HEAD(&data->resetting); 212 + data->mp = mp; 213 + return data; 214 + 215 + out_free_scratch: 216 + while (--i >= 0) 217 + folio_put(data->scratch[i].folio); 218 + bioset_exit(&data->bio_set); 219 + out_free_recs: 220 + kfree(data->iter.recs); 221 + out_free_data: 222 + kfree(data); 223 + return NULL; 224 + } 225 + 226 + static void 227 + xfs_zone_gc_data_free( 228 + struct xfs_zone_gc_data *data) 229 + { 230 + int i; 231 + 232 + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) 233 + folio_put(data->scratch[i].folio); 234 + bioset_exit(&data->bio_set); 235 + kfree(data->iter.recs); 236 + kfree(data); 
237 + } 238 + 239 + static void 240 + xfs_zone_gc_iter_init( 241 + struct xfs_zone_gc_iter *iter, 242 + struct xfs_rtgroup *victim_rtg) 243 + 244 + { 245 + iter->next_startblock = 0; 246 + iter->rec_count = 0; 247 + iter->rec_idx = 0; 248 + iter->victim_rtg = victim_rtg; 249 + } 250 + 251 + /* 252 + * Query the rmap of the victim zone to gather the records to evacuate. 253 + */ 254 + static int 255 + xfs_zone_gc_query_cb( 256 + struct xfs_btree_cur *cur, 257 + const struct xfs_rmap_irec *irec, 258 + void *private) 259 + { 260 + struct xfs_zone_gc_iter *iter = private; 261 + 262 + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); 263 + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); 264 + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); 265 + 266 + iter->recs[iter->rec_count] = *irec; 267 + if (++iter->rec_count == XFS_ZONE_GC_RECS) { 268 + iter->next_startblock = 269 + irec->rm_startblock + irec->rm_blockcount; 270 + return 1; 271 + } 272 + return 0; 273 + } 274 + 275 + #define cmp_int(l, r) ((l > r) - (l < r)) 276 + 277 + static int 278 + xfs_zone_gc_rmap_rec_cmp( 279 + const void *a, 280 + const void *b) 281 + { 282 + const struct xfs_rmap_irec *reca = a; 283 + const struct xfs_rmap_irec *recb = b; 284 + int diff; 285 + 286 + diff = cmp_int(reca->rm_owner, recb->rm_owner); 287 + if (diff) 288 + return diff; 289 + return cmp_int(reca->rm_offset, recb->rm_offset); 290 + } 291 + 292 + static int 293 + xfs_zone_gc_query( 294 + struct xfs_mount *mp, 295 + struct xfs_zone_gc_iter *iter) 296 + { 297 + struct xfs_rtgroup *rtg = iter->victim_rtg; 298 + struct xfs_rmap_irec ri_low = { }; 299 + struct xfs_rmap_irec ri_high; 300 + struct xfs_btree_cur *cur; 301 + struct xfs_trans *tp; 302 + int error; 303 + 304 + ASSERT(iter->next_startblock <= rtg_blocks(rtg)); 305 + if (iter->next_startblock == rtg_blocks(rtg)) 306 + goto done; 307 + 308 + ASSERT(iter->next_startblock < rtg_blocks(rtg)); 309 + ri_low.rm_startblock = iter->next_startblock; 310 + 
memset(&ri_high, 0xFF, sizeof(ri_high)); 311 + 312 + iter->rec_idx = 0; 313 + iter->rec_count = 0; 314 + 315 + error = xfs_trans_alloc_empty(mp, &tp); 316 + if (error) 317 + return error; 318 + 319 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 320 + cur = xfs_rtrmapbt_init_cursor(tp, rtg); 321 + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, 322 + xfs_zone_gc_query_cb, iter); 323 + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 324 + xfs_btree_del_cursor(cur, error < 0 ? error : 0); 325 + xfs_trans_cancel(tp); 326 + 327 + if (error < 0) 328 + return error; 329 + 330 + /* 331 + * Sort the rmap records by inode number and increasing offset to 332 + * defragment the mappings. 333 + * 334 + * This could be further enhanced by an even bigger look ahead window, 335 + * but that's better left until we have better detection of changes to 336 + * inode mapping to avoid the potential of GCing already dead data. 337 + */ 338 + sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), 339 + xfs_zone_gc_rmap_rec_cmp, NULL); 340 + 341 + if (error == 0) { 342 + /* 343 + * We finished iterating through the zone. 
344 + */ 345 + iter->next_startblock = rtg_blocks(rtg); 346 + if (iter->rec_count == 0) 347 + goto done; 348 + } 349 + 350 + return 0; 351 + done: 352 + xfs_rtgroup_rele(iter->victim_rtg); 353 + iter->victim_rtg = NULL; 354 + return 0; 355 + } 356 + 357 + static bool 358 + xfs_zone_gc_iter_next( 359 + struct xfs_mount *mp, 360 + struct xfs_zone_gc_iter *iter, 361 + struct xfs_rmap_irec *chunk_rec, 362 + struct xfs_inode **ipp) 363 + { 364 + struct xfs_rmap_irec *irec; 365 + int error; 366 + 367 + if (!iter->victim_rtg) 368 + return false; 369 + 370 + retry: 371 + if (iter->rec_idx == iter->rec_count) { 372 + error = xfs_zone_gc_query(mp, iter); 373 + if (error) 374 + goto fail; 375 + if (!iter->victim_rtg) 376 + return false; 377 + } 378 + 379 + irec = &iter->recs[iter->rec_idx]; 380 + error = xfs_iget(mp, NULL, irec->rm_owner, 381 + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); 382 + if (error) { 383 + /* 384 + * If the inode was already deleted, skip over it. 385 + */ 386 + if (error == -ENOENT) { 387 + iter->rec_idx++; 388 + goto retry; 389 + } 390 + goto fail; 391 + } 392 + 393 + if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { 394 + iter->rec_idx++; 395 + xfs_irele(*ipp); 396 + goto retry; 397 + } 398 + 399 + *chunk_rec = *irec; 400 + return true; 401 + 402 + fail: 403 + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 404 + return false; 405 + } 406 + 407 + static void 408 + xfs_zone_gc_iter_advance( 409 + struct xfs_zone_gc_iter *iter, 410 + xfs_extlen_t count_fsb) 411 + { 412 + struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; 413 + 414 + irec->rm_offset += count_fsb; 415 + irec->rm_startblock += count_fsb; 416 + irec->rm_blockcount -= count_fsb; 417 + if (!irec->rm_blockcount) 418 + iter->rec_idx++; 419 + } 420 + 421 + static struct xfs_rtgroup * 422 + xfs_zone_gc_pick_victim_from( 423 + struct xfs_mount *mp, 424 + uint32_t bucket) 425 + { 426 + struct xfs_zone_info *zi = mp->m_zone_info; 427 + uint32_t victim_used = U32_MAX; 
428 + struct xfs_rtgroup *victim_rtg = NULL; 429 + uint32_t bit; 430 + 431 + if (!zi->zi_used_bucket_entries[bucket]) 432 + return NULL; 433 + 434 + for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], 435 + mp->m_sb.sb_rgcount) { 436 + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); 437 + 438 + if (!rtg) 439 + continue; 440 + 441 + /* skip zones that are just waiting for a reset */ 442 + if (rtg_rmap(rtg)->i_used_blocks == 0 || 443 + rtg_rmap(rtg)->i_used_blocks >= victim_used) { 444 + xfs_rtgroup_rele(rtg); 445 + continue; 446 + } 447 + 448 + if (victim_rtg) 449 + xfs_rtgroup_rele(victim_rtg); 450 + victim_rtg = rtg; 451 + victim_used = rtg_rmap(rtg)->i_used_blocks; 452 + 453 + /* 454 + * Any zone that is less than 1 percent used is fair game for 455 + * instant reclaim. All of these zones are in the last 456 + * bucket, so avoid the expensive division for the zones 457 + * in the other buckets. 458 + */ 459 + if (bucket == 0 && 460 + rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) 461 + break; 462 + } 463 + 464 + return victim_rtg; 465 + } 466 + 467 + /* 468 + * Iterate through all zones marked as reclaimable and find a candidate to 469 + * reclaim. 470 + */ 471 + static bool 472 + xfs_zone_gc_select_victim( 473 + struct xfs_zone_gc_data *data) 474 + { 475 + struct xfs_zone_gc_iter *iter = &data->iter; 476 + struct xfs_mount *mp = data->mp; 477 + struct xfs_zone_info *zi = mp->m_zone_info; 478 + struct xfs_rtgroup *victim_rtg = NULL; 479 + unsigned int bucket; 480 + 481 + if (xfs_is_shutdown(mp)) 482 + return false; 483 + 484 + if (iter->victim_rtg) 485 + return true; 486 + 487 + /* 488 + * Don't start new work if we are asked to stop or park. 
489 + */ 490 + if (kthread_should_stop() || kthread_should_park()) 491 + return false; 492 + 493 + if (!xfs_zoned_need_gc(mp)) 494 + return false; 495 + 496 + spin_lock(&zi->zi_used_buckets_lock); 497 + for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { 498 + victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); 499 + if (victim_rtg) 500 + break; 501 + } 502 + spin_unlock(&zi->zi_used_buckets_lock); 503 + 504 + if (!victim_rtg) 505 + return false; 506 + 507 + trace_xfs_zone_gc_select_victim(victim_rtg, bucket); 508 + xfs_zone_gc_iter_init(iter, victim_rtg); 509 + return true; 510 + } 511 + 512 + static struct xfs_open_zone * 513 + xfs_zone_gc_steal_open( 514 + struct xfs_zone_info *zi) 515 + { 516 + struct xfs_open_zone *oz, *found = NULL; 517 + 518 + spin_lock(&zi->zi_open_zones_lock); 519 + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { 520 + if (!found || 521 + oz->oz_write_pointer < found->oz_write_pointer) 522 + found = oz; 523 + } 524 + 525 + if (found) { 526 + found->oz_is_gc = true; 527 + list_del_init(&found->oz_entry); 528 + zi->zi_nr_open_zones--; 529 + } 530 + 531 + spin_unlock(&zi->zi_open_zones_lock); 532 + return found; 533 + } 534 + 535 + static struct xfs_open_zone * 536 + xfs_zone_gc_select_target( 537 + struct xfs_mount *mp) 538 + { 539 + struct xfs_zone_info *zi = mp->m_zone_info; 540 + struct xfs_open_zone *oz = zi->zi_open_gc_zone; 541 + 542 + /* 543 + * We need to wait for pending writes to finish. 544 + */ 545 + if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) 546 + return NULL; 547 + 548 + ASSERT(zi->zi_nr_open_zones <= 549 + mp->m_max_open_zones - XFS_OPEN_GC_ZONES); 550 + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); 551 + if (oz) 552 + trace_xfs_zone_gc_target_opened(oz->oz_rtg); 553 + spin_lock(&zi->zi_open_zones_lock); 554 + zi->zi_open_gc_zone = oz; 555 + spin_unlock(&zi->zi_open_zones_lock); 556 + return oz; 557 + } 558 + 559 + /* 560 + * Ensure we have a valid open zone to write the GC data to. 
561 + * 562 + * If the current target zone has space keep writing to it, else first wait for 563 + * all pending writes and then pick a new one. 564 + */ 565 + static struct xfs_open_zone * 566 + xfs_zone_gc_ensure_target( 567 + struct xfs_mount *mp) 568 + { 569 + struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; 570 + 571 + if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) 572 + return xfs_zone_gc_select_target(mp); 573 + return oz; 574 + } 575 + 576 + static unsigned int 577 + xfs_zone_gc_scratch_available( 578 + struct xfs_zone_gc_data *data) 579 + { 580 + return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset; 581 + } 582 + 583 + static bool 584 + xfs_zone_gc_space_available( 585 + struct xfs_zone_gc_data *data) 586 + { 587 + struct xfs_open_zone *oz; 588 + 589 + oz = xfs_zone_gc_ensure_target(data->mp); 590 + if (!oz) 591 + return false; 592 + return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) && 593 + xfs_zone_gc_scratch_available(data); 594 + } 595 + 596 + static void 597 + xfs_zone_gc_end_io( 598 + struct bio *bio) 599 + { 600 + struct xfs_gc_bio *chunk = 601 + container_of(bio, struct xfs_gc_bio, bio); 602 + struct xfs_zone_gc_data *data = chunk->data; 603 + 604 + WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); 605 + wake_up_process(data->mp->m_zone_info->zi_gc_thread); 606 + } 607 + 608 + static struct xfs_open_zone * 609 + xfs_zone_gc_alloc_blocks( 610 + struct xfs_zone_gc_data *data, 611 + xfs_extlen_t *count_fsb, 612 + xfs_daddr_t *daddr, 613 + bool *is_seq) 614 + { 615 + struct xfs_mount *mp = data->mp; 616 + struct xfs_open_zone *oz; 617 + 618 + oz = xfs_zone_gc_ensure_target(mp); 619 + if (!oz) 620 + return NULL; 621 + 622 + *count_fsb = min(*count_fsb, 623 + XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data))); 624 + 625 + /* 626 + * Directly allocate GC blocks from the reserved pool. 
627 + * 628 + * If we'd take them from the normal pool we could be stealing blocks 629 + * from a regular writer, which would then have to wait for GC and 630 + * deadlock. 631 + */ 632 + spin_lock(&mp->m_sb_lock); 633 + *count_fsb = min(*count_fsb, 634 + rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer); 635 + *count_fsb = min3(*count_fsb, 636 + mp->m_free[XC_FREE_RTEXTENTS].res_avail, 637 + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); 638 + mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; 639 + mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; 640 + spin_unlock(&mp->m_sb_lock); 641 + 642 + if (!*count_fsb) 643 + return NULL; 644 + 645 + *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); 646 + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); 647 + if (!*is_seq) 648 + *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer); 649 + oz->oz_write_pointer += *count_fsb; 650 + atomic_inc(&oz->oz_ref); 651 + return oz; 652 + } 653 + 654 + static bool 655 + xfs_zone_gc_start_chunk( 656 + struct xfs_zone_gc_data *data) 657 + { 658 + struct xfs_zone_gc_iter *iter = &data->iter; 659 + struct xfs_mount *mp = data->mp; 660 + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; 661 + struct xfs_open_zone *oz; 662 + struct xfs_rmap_irec irec; 663 + struct xfs_gc_bio *chunk; 664 + struct xfs_inode *ip; 665 + struct bio *bio; 666 + xfs_daddr_t daddr; 667 + bool is_seq; 668 + 669 + if (xfs_is_shutdown(mp)) 670 + return false; 671 + 672 + if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) 673 + return false; 674 + oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, 675 + &is_seq); 676 + if (!oz) { 677 + xfs_irele(ip); 678 + return false; 679 + } 680 + 681 + bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set); 682 + 683 + chunk = container_of(bio, struct xfs_gc_bio, bio); 684 + chunk->ip = ip; 685 + chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); 686 + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); 687 + 
chunk->old_startblock = 688 + xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); 689 + chunk->new_daddr = daddr; 690 + chunk->is_seq = is_seq; 691 + chunk->scratch = &data->scratch[data->scratch_idx]; 692 + chunk->data = data; 693 + chunk->oz = oz; 694 + 695 + bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); 696 + bio->bi_end_io = xfs_zone_gc_end_io; 697 + bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len, 698 + chunk->scratch->offset); 699 + chunk->scratch->offset += chunk->len; 700 + if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) { 701 + data->scratch_idx = 702 + (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH; 703 + } 704 + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); 705 + list_add_tail(&chunk->entry, &data->reading); 706 + xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); 707 + 708 + submit_bio(bio); 709 + return true; 710 + } 711 + 712 + static void 713 + xfs_zone_gc_free_chunk( 714 + struct xfs_gc_bio *chunk) 715 + { 716 + list_del(&chunk->entry); 717 + xfs_open_zone_put(chunk->oz); 718 + xfs_irele(chunk->ip); 719 + bio_put(&chunk->bio); 720 + } 721 + 722 + static void 723 + xfs_zone_gc_submit_write( 724 + struct xfs_zone_gc_data *data, 725 + struct xfs_gc_bio *chunk) 726 + { 727 + if (chunk->is_seq) { 728 + chunk->bio.bi_opf &= ~REQ_OP_WRITE; 729 + chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; 730 + } 731 + chunk->bio.bi_iter.bi_sector = chunk->new_daddr; 732 + chunk->bio.bi_end_io = xfs_zone_gc_end_io; 733 + submit_bio(&chunk->bio); 734 + } 735 + 736 + static struct xfs_gc_bio * 737 + xfs_zone_gc_split_write( 738 + struct xfs_zone_gc_data *data, 739 + struct xfs_gc_bio *chunk) 740 + { 741 + struct queue_limits *lim = 742 + &bdev_get_queue(chunk->bio.bi_bdev)->limits; 743 + struct xfs_gc_bio *split_chunk; 744 + int split_sectors; 745 + unsigned int split_len; 746 + struct bio *split; 747 + unsigned int nsegs; 748 + 749 + if (!chunk->is_seq) 750 + return NULL; 751 + 752 + split_sectors = bio_split_rw_at(&chunk->bio, lim, 
&nsegs, 753 + lim->max_zone_append_sectors << SECTOR_SHIFT); 754 + if (!split_sectors) 755 + return NULL; 756 + 757 + /* ensure the split chunk is still block size aligned */ 758 + split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, 759 + data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT; 760 + split_len = split_sectors << SECTOR_SHIFT; 761 + 762 + split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); 763 + split_chunk = container_of(split, struct xfs_gc_bio, bio); 764 + split_chunk->data = data; 765 + ihold(VFS_I(chunk->ip)); 766 + split_chunk->ip = chunk->ip; 767 + split_chunk->is_seq = chunk->is_seq; 768 + split_chunk->scratch = chunk->scratch; 769 + split_chunk->offset = chunk->offset; 770 + split_chunk->len = split_len; 771 + split_chunk->old_startblock = chunk->old_startblock; 772 + split_chunk->new_daddr = chunk->new_daddr; 773 + split_chunk->oz = chunk->oz; 774 + atomic_inc(&chunk->oz->oz_ref); 775 + 776 + chunk->offset += split_len; 777 + chunk->len -= split_len; 778 + chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); 779 + 780 + /* add right before the original chunk */ 781 + WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); 782 + list_add_tail(&split_chunk->entry, &chunk->entry); 783 + return split_chunk; 784 + } 785 + 786 + static void 787 + xfs_zone_gc_write_chunk( 788 + struct xfs_gc_bio *chunk) 789 + { 790 + struct xfs_zone_gc_data *data = chunk->data; 791 + struct xfs_mount *mp = chunk->ip->i_mount; 792 + unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset; 793 + struct xfs_gc_bio *split_chunk; 794 + 795 + if (chunk->bio.bi_status) 796 + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 797 + if (xfs_is_shutdown(mp)) { 798 + xfs_zone_gc_free_chunk(chunk); 799 + return; 800 + } 801 + 802 + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); 803 + list_move_tail(&chunk->entry, &data->writing); 804 + 805 + bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE); 806 + bio_add_folio_nofail(&chunk->bio, 
chunk->scratch->folio, chunk->len, 807 + folio_offset); 808 + 809 + while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) 810 + xfs_zone_gc_submit_write(data, split_chunk); 811 + xfs_zone_gc_submit_write(data, chunk); 812 + } 813 + 814 + static void 815 + xfs_zone_gc_finish_chunk( 816 + struct xfs_gc_bio *chunk) 817 + { 818 + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 819 + struct xfs_inode *ip = chunk->ip; 820 + struct xfs_mount *mp = ip->i_mount; 821 + int error; 822 + 823 + if (chunk->bio.bi_status) 824 + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 825 + if (xfs_is_shutdown(mp)) { 826 + xfs_zone_gc_free_chunk(chunk); 827 + return; 828 + } 829 + 830 + chunk->scratch->freed += chunk->len; 831 + if (chunk->scratch->freed == chunk->scratch->offset) { 832 + chunk->scratch->offset = 0; 833 + chunk->scratch->freed = 0; 834 + } 835 + 836 + /* 837 + * Cycle through the iolock and wait for direct I/O and layouts to 838 + * ensure no one is reading from the old mapping before it goes away. 839 + * 840 + * Note that xfs_zoned_end_io() below checks that no other writer raced 841 + * with us to update the mapping by checking that the old startblock 842 + * didn't change. 
843 + */ 844 + xfs_ilock(ip, iolock); 845 + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); 846 + if (!error) 847 + inode_dio_wait(VFS_I(ip)); 848 + xfs_iunlock(ip, iolock); 849 + if (error) 850 + goto free; 851 + 852 + if (chunk->is_seq) 853 + chunk->new_daddr = chunk->bio.bi_iter.bi_sector; 854 + error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, 855 + chunk->new_daddr, chunk->oz, chunk->old_startblock); 856 + free: 857 + if (error) 858 + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 859 + xfs_zone_gc_free_chunk(chunk); 860 + } 861 + 862 + static void 863 + xfs_zone_gc_finish_reset( 864 + struct xfs_gc_bio *chunk) 865 + { 866 + struct xfs_rtgroup *rtg = chunk->bio.bi_private; 867 + struct xfs_mount *mp = rtg_mount(rtg); 868 + struct xfs_zone_info *zi = mp->m_zone_info; 869 + 870 + if (chunk->bio.bi_status) { 871 + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 872 + goto out; 873 + } 874 + 875 + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); 876 + atomic_inc(&zi->zi_nr_free_zones); 877 + 878 + xfs_zoned_add_available(mp, rtg_blocks(rtg)); 879 + 880 + wake_up_all(&zi->zi_zone_wait); 881 + out: 882 + list_del(&chunk->entry); 883 + bio_put(&chunk->bio); 884 + } 885 + 886 + static bool 887 + xfs_zone_gc_prepare_reset( 888 + struct bio *bio, 889 + struct xfs_rtgroup *rtg) 890 + { 891 + trace_xfs_zone_reset(rtg); 892 + 893 + ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); 894 + bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); 895 + if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { 896 + if (!bdev_max_discard_sectors(bio->bi_bdev)) 897 + return false; 898 + bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC; 899 + bio->bi_iter.bi_size = 900 + XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg)); 901 + } 902 + 903 + return true; 904 + } 905 + 906 + int 907 + xfs_zone_gc_reset_sync( 908 + struct xfs_rtgroup *rtg) 909 + { 910 + int error = 0; 911 + struct bio bio; 912 + 913 + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, 914 + 
REQ_OP_ZONE_RESET); 915 + if (xfs_zone_gc_prepare_reset(&bio, rtg)) 916 + error = submit_bio_wait(&bio); 917 + bio_uninit(&bio); 918 + 919 + return error; 920 + } 921 + 922 + static void 923 + xfs_zone_gc_reset_zones( 924 + struct xfs_zone_gc_data *data, 925 + struct xfs_group *reset_list) 926 + { 927 + struct xfs_group *next = reset_list; 928 + 929 + if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { 930 + xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); 931 + return; 932 + } 933 + 934 + do { 935 + struct xfs_rtgroup *rtg = to_rtg(next); 936 + struct xfs_gc_bio *chunk; 937 + struct bio *bio; 938 + 939 + xfs_log_force_inode(rtg_rmap(rtg)); 940 + 941 + next = rtg_group(rtg)->xg_next_reset; 942 + rtg_group(rtg)->xg_next_reset = NULL; 943 + 944 + bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, 945 + 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); 946 + bio->bi_private = rtg; 947 + bio->bi_end_io = xfs_zone_gc_end_io; 948 + 949 + chunk = container_of(bio, struct xfs_gc_bio, bio); 950 + chunk->data = data; 951 + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); 952 + list_add_tail(&chunk->entry, &data->resetting); 953 + 954 + /* 955 + * Also use the bio to drive the state machine when neither 956 + * zone reset nor discard is supported to keep things simple. 957 + */ 958 + if (xfs_zone_gc_prepare_reset(bio, rtg)) 959 + submit_bio(bio); 960 + else 961 + bio_endio(bio); 962 + } while (next); 963 + } 964 + 965 + /* 966 + * Handle the work to read and write data for GC and to reset the zones, 967 + * including handling all completions. 968 + * 969 + * Note that the order of the chunks is preserved so that we don't undo the 970 + * optimal order established by xfs_zone_gc_query(). 
971 + */ 972 + static bool 973 + xfs_zone_gc_handle_work( 974 + struct xfs_zone_gc_data *data) 975 + { 976 + struct xfs_zone_info *zi = data->mp->m_zone_info; 977 + struct xfs_gc_bio *chunk, *next; 978 + struct xfs_group *reset_list; 979 + struct blk_plug plug; 980 + 981 + spin_lock(&zi->zi_reset_list_lock); 982 + reset_list = zi->zi_reset_list; 983 + zi->zi_reset_list = NULL; 984 + spin_unlock(&zi->zi_reset_list_lock); 985 + 986 + if (!xfs_zone_gc_select_victim(data) || 987 + !xfs_zone_gc_space_available(data)) { 988 + if (list_empty(&data->reading) && 989 + list_empty(&data->writing) && 990 + list_empty(&data->resetting) && 991 + !reset_list) 992 + return false; 993 + } 994 + 995 + __set_current_state(TASK_RUNNING); 996 + try_to_freeze(); 997 + 998 + if (reset_list) 999 + xfs_zone_gc_reset_zones(data, reset_list); 1000 + 1001 + list_for_each_entry_safe(chunk, next, &data->resetting, entry) { 1002 + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1003 + break; 1004 + xfs_zone_gc_finish_reset(chunk); 1005 + } 1006 + 1007 + list_for_each_entry_safe(chunk, next, &data->writing, entry) { 1008 + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1009 + break; 1010 + xfs_zone_gc_finish_chunk(chunk); 1011 + } 1012 + 1013 + blk_start_plug(&plug); 1014 + list_for_each_entry_safe(chunk, next, &data->reading, entry) { 1015 + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1016 + break; 1017 + xfs_zone_gc_write_chunk(chunk); 1018 + } 1019 + blk_finish_plug(&plug); 1020 + 1021 + blk_start_plug(&plug); 1022 + while (xfs_zone_gc_start_chunk(data)) 1023 + ; 1024 + blk_finish_plug(&plug); 1025 + return true; 1026 + } 1027 + 1028 + /* 1029 + * Note that the current GC algorithm would break reflinks and thus duplicate 1030 + * data that was shared by multiple owners before. Because of that reflinks 1031 + * are currently not supported on zoned file systems and can't be created or 1032 + * mounted. 
1033 + */ 1034 + static int 1035 + xfs_zoned_gcd( 1036 + void *private) 1037 + { 1038 + struct xfs_zone_gc_data *data = private; 1039 + struct xfs_mount *mp = data->mp; 1040 + struct xfs_zone_info *zi = mp->m_zone_info; 1041 + unsigned int nofs_flag; 1042 + 1043 + nofs_flag = memalloc_nofs_save(); 1044 + set_freezable(); 1045 + 1046 + for (;;) { 1047 + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); 1048 + xfs_set_zonegc_running(mp); 1049 + if (xfs_zone_gc_handle_work(data)) 1050 + continue; 1051 + 1052 + if (list_empty(&data->reading) && 1053 + list_empty(&data->writing) && 1054 + list_empty(&data->resetting) && 1055 + !zi->zi_reset_list) { 1056 + xfs_clear_zonegc_running(mp); 1057 + xfs_zoned_resv_wake_all(mp); 1058 + 1059 + if (kthread_should_stop()) { 1060 + __set_current_state(TASK_RUNNING); 1061 + break; 1062 + } 1063 + 1064 + if (kthread_should_park()) { 1065 + __set_current_state(TASK_RUNNING); 1066 + kthread_parkme(); 1067 + continue; 1068 + } 1069 + } 1070 + 1071 + schedule(); 1072 + } 1073 + xfs_clear_zonegc_running(mp); 1074 + 1075 + if (data->iter.victim_rtg) 1076 + xfs_rtgroup_rele(data->iter.victim_rtg); 1077 + 1078 + memalloc_nofs_restore(nofs_flag); 1079 + xfs_zone_gc_data_free(data); 1080 + return 0; 1081 + } 1082 + 1083 + void 1084 + xfs_zone_gc_start( 1085 + struct xfs_mount *mp) 1086 + { 1087 + if (xfs_has_zoned(mp)) 1088 + kthread_unpark(mp->m_zone_info->zi_gc_thread); 1089 + } 1090 + 1091 + void 1092 + xfs_zone_gc_stop( 1093 + struct xfs_mount *mp) 1094 + { 1095 + if (xfs_has_zoned(mp)) 1096 + kthread_park(mp->m_zone_info->zi_gc_thread); 1097 + } 1098 + 1099 + int 1100 + xfs_zone_gc_mount( 1101 + struct xfs_mount *mp) 1102 + { 1103 + struct xfs_zone_info *zi = mp->m_zone_info; 1104 + struct xfs_zone_gc_data *data; 1105 + struct xfs_open_zone *oz; 1106 + int error; 1107 + 1108 + /* 1109 + * If there are no free zones available for GC, pick the open zone with 1110 + * the least used space to GC into. 
This should only happen after an 1111 + * unclean shutdown near ENOSPC while GC was ongoing. 1112 + * 1113 + * We also need to do this for the first gc zone allocation if we 1114 + * unmounted while at the open limit. 1115 + */ 1116 + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || 1117 + zi->zi_nr_open_zones == mp->m_max_open_zones) 1118 + oz = xfs_zone_gc_steal_open(zi); 1119 + else 1120 + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); 1121 + if (!oz) { 1122 + xfs_warn(mp, "unable to allocate a zone for gc"); 1123 + error = -EIO; 1124 + goto out; 1125 + } 1126 + 1127 + trace_xfs_zone_gc_target_opened(oz->oz_rtg); 1128 + zi->zi_open_gc_zone = oz; 1129 + 1130 + data = xfs_zone_gc_data_alloc(mp); 1131 + if (!data) { 1132 + error = -ENOMEM; 1133 + goto out_put_gc_zone; 1134 + } 1135 + 1136 + mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, 1137 + "xfs-zone-gc/%s", mp->m_super->s_id); 1138 + if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { 1139 + xfs_warn(mp, "unable to create zone gc thread"); 1140 + error = PTR_ERR(mp->m_zone_info->zi_gc_thread); 1141 + goto out_free_gc_data; 1142 + } 1143 + 1144 + /* xfs_zone_gc_start will unpark for rw mounts */ 1145 + kthread_park(mp->m_zone_info->zi_gc_thread); 1146 + return 0; 1147 + 1148 + out_free_gc_data: 1149 + kfree(data); 1150 + out_put_gc_zone: 1151 + xfs_open_zone_put(zi->zi_open_gc_zone); 1152 + out: 1153 + return error; 1154 + } 1155 + 1156 + void 1157 + xfs_zone_gc_unmount( 1158 + struct xfs_mount *mp) 1159 + { 1160 + struct xfs_zone_info *zi = mp->m_zone_info; 1161 + 1162 + kthread_stop(zi->zi_gc_thread); 1163 + if (zi->zi_open_gc_zone) 1164 + xfs_open_zone_put(zi->zi_open_gc_zone); 1165 + }
+105
fs/xfs/xfs_zone_info.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2023-2025 Christoph Hellwig. 4 + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_shared.h" 8 + #include "xfs_format.h" 9 + #include "xfs_trans_resv.h" 10 + #include "xfs_mount.h" 11 + #include "xfs_inode.h" 12 + #include "xfs_rtgroup.h" 13 + #include "xfs_zone_alloc.h" 14 + #include "xfs_zone_priv.h" 15 + 16 + static const char xfs_write_hint_shorthand[6][16] = { 17 + "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"}; 18 + 19 + static inline const char * 20 + xfs_write_hint_to_str( 21 + uint8_t write_hint) 22 + { 23 + if (write_hint > WRITE_LIFE_EXTREME) 24 + return "UNKNOWN"; 25 + return xfs_write_hint_shorthand[write_hint]; 26 + } 27 + 28 + static void 29 + xfs_show_open_zone( 30 + struct seq_file *m, 31 + struct xfs_open_zone *oz) 32 + { 33 + seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n", 34 + rtg_rgno(oz->oz_rtg), 35 + oz->oz_write_pointer, oz->oz_written, 36 + rtg_rmap(oz->oz_rtg)->i_used_blocks, 37 + xfs_write_hint_to_str(oz->oz_write_hint)); 38 + } 39 + 40 + static void 41 + xfs_show_full_zone_used_distribution( 42 + struct seq_file *m, 43 + struct xfs_mount *mp) 44 + { 45 + struct xfs_zone_info *zi = mp->m_zone_info; 46 + unsigned int reclaimable = 0, full, i; 47 + 48 + spin_lock(&zi->zi_used_buckets_lock); 49 + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { 50 + unsigned int entries = zi->zi_used_bucket_entries[i]; 51 + 52 + seq_printf(m, "\t %2u..%2u%%: %u\n", 53 + i * (100 / XFS_ZONE_USED_BUCKETS), 54 + (i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1, 55 + entries); 56 + reclaimable += entries; 57 + } 58 + spin_unlock(&zi->zi_used_buckets_lock); 59 + 60 + full = mp->m_sb.sb_rgcount; 61 + if (zi->zi_open_gc_zone) 62 + full--; 63 + full -= zi->zi_nr_open_zones; 64 + full -= atomic_read(&zi->zi_nr_free_zones); 65 + full -= reclaimable; 66 + 67 + seq_printf(m, "\t 100%%: %u\n", full); 68 + } 69 + 
70 + void 71 + xfs_zoned_show_stats( 72 + struct seq_file *m, 73 + struct xfs_mount *mp) 74 + { 75 + struct xfs_zone_info *zi = mp->m_zone_info; 76 + struct xfs_open_zone *oz; 77 + 78 + seq_puts(m, "\n"); 79 + 80 + seq_printf(m, "\tuser free RT blocks: %lld\n", 81 + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); 82 + seq_printf(m, "\treserved free RT blocks: %lld\n", 83 + mp->m_free[XC_FREE_RTEXTENTS].res_avail); 84 + seq_printf(m, "\tuser available RT blocks: %lld\n", 85 + xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE)); 86 + seq_printf(m, "\treserved available RT blocks: %lld\n", 87 + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); 88 + seq_printf(m, "\tRT reservations required: %d\n", 89 + !list_empty_careful(&zi->zi_reclaim_reservations)); 90 + seq_printf(m, "\tRT GC required: %d\n", 91 + xfs_zoned_need_gc(mp)); 92 + 93 + seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones)); 94 + seq_puts(m, "\topen zones:\n"); 95 + spin_lock(&zi->zi_open_zones_lock); 96 + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) 97 + xfs_show_open_zone(m, oz); 98 + if (zi->zi_open_gc_zone) { 99 + seq_puts(m, "\topen gc zone:\n"); 100 + xfs_show_open_zone(m, zi->zi_open_gc_zone); 101 + } 102 + spin_unlock(&zi->zi_open_zones_lock); 103 + seq_puts(m, "\tused blocks distribution (fully written zones):\n"); 104 + xfs_show_full_zone_used_distribution(m, mp); 105 + }
+119
fs/xfs/xfs_zone_priv.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _XFS_ZONE_PRIV_H 3 + #define _XFS_ZONE_PRIV_H 4 + 5 + struct xfs_open_zone { 6 + /* 7 + * Entry in the open zone list and refcount. Protected by 8 + * zi_open_zones_lock in struct xfs_zone_info. 9 + */ 10 + struct list_head oz_entry; 11 + atomic_t oz_ref; 12 + 13 + /* 14 + * oz_write_pointer is the write pointer at which space is handed out 15 + * for conventional zones, or simply the count of blocks handed out 16 + * so far for sequential write required zones and is protected by 17 + * oz_alloc_lock. 18 + */ 19 + spinlock_t oz_alloc_lock; 20 + xfs_rgblock_t oz_write_pointer; 21 + 22 + /* 23 + * oz_written is the number of blocks for which we've received a 24 + * write completion. oz_written must always be <= oz_write_pointer 25 + * and is protected by the ILOCK of the rmap inode. 26 + */ 27 + xfs_rgblock_t oz_written; 28 + 29 + /* 30 + * Write hint (data temperature) assigned to this zone, or 31 + * WRITE_LIFE_NOT_SET if none was set. 32 + */ 33 + enum rw_hint oz_write_hint; 34 + 35 + /* 36 + * Is this open zone used for garbage collection? There can only be a 37 + * single open GC zone, which is pointed to by zi_open_gc_zone in 38 + * struct xfs_zone_info. Constant over the lifetime of an open zone. 39 + */ 40 + bool oz_is_gc; 41 + 42 + /* 43 + * Pointer to the RT groups structure for this open zone. Constant over 44 + * the lifetime of an open zone. 45 + */ 46 + struct xfs_rtgroup *oz_rtg; 47 + }; 48 + 49 + /* 50 + * Number of bitmap buckets to track reclaimable zones. There are 10 buckets 51 + * so that each 10% of the usable capacity gets its own bucket and GC 52 + + only has to walk the bitmaps of the lesser used zones if there are any. 
53 + */ 54 + #define XFS_ZONE_USED_BUCKETS 10u 55 + 56 + struct xfs_zone_info { 57 + /* 58 + * List of pending space reservations: 59 + */ 60 + spinlock_t zi_reservation_lock; 61 + struct list_head zi_reclaim_reservations; 62 + 63 + /* 64 + * List and number of open zones: 65 + */ 66 + spinlock_t zi_open_zones_lock; 67 + struct list_head zi_open_zones; 68 + unsigned int zi_nr_open_zones; 69 + 70 + /* 71 + * Free zone search cursor and number of free zones: 72 + */ 73 + unsigned long zi_free_zone_cursor; 74 + atomic_t zi_nr_free_zones; 75 + 76 + /* 77 + * Wait queue to wait for free zones or open zone resources to become 78 + * available: 79 + */ 80 + wait_queue_head_t zi_zone_wait; 81 + 82 + /* 83 + * Pointer to the GC thread, and the current open zone used by GC 84 + * (if any). 85 + * 86 + * zi_open_gc_zone is mostly private to the GC thread, but can be read 87 + * for debugging from other threads, in which case zi_open_zones_lock 88 + * must be taken to access it. 89 + */ 90 + struct task_struct *zi_gc_thread; 91 + struct xfs_open_zone *zi_open_gc_zone; 92 + 93 + /* 94 + * List of zones that need a reset: 95 + */ 96 + spinlock_t zi_reset_list_lock; 97 + struct xfs_group *zi_reset_list; 98 + 99 + /* 100 + * A set of bitmaps to bucket-sort reclaimable zones by used blocks to help 101 + * garbage collection to quickly find the best candidate for reclaim. 
102 + */ 103 + spinlock_t zi_used_buckets_lock; 104 + unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS]; 105 + unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS]; 106 + 107 + }; 108 + 109 + struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, 110 + enum rw_hint write_hint, bool is_gc); 111 + 112 + int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg); 113 + bool xfs_zoned_need_gc(struct xfs_mount *mp); 114 + int xfs_zone_gc_mount(struct xfs_mount *mp); 115 + void xfs_zone_gc_unmount(struct xfs_mount *mp); 116 + 117 + void xfs_zoned_resv_wake_all(struct xfs_mount *mp); 118 + 119 + #endif /* _XFS_ZONE_PRIV_H */
+263
fs/xfs/xfs_zone_space_resv.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2023-2025 Christoph Hellwig. 4 + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_shared.h" 8 + #include "xfs_format.h" 9 + #include "xfs_trans_resv.h" 10 + #include "xfs_mount.h" 11 + #include "xfs_inode.h" 12 + #include "xfs_rtbitmap.h" 13 + #include "xfs_zone_alloc.h" 14 + #include "xfs_zone_priv.h" 15 + #include "xfs_zones.h" 16 + 17 + /* 18 + * Note: the zoned allocator does not support a rtextsize > 1, so this code and 19 + * the allocator itself uses file system blocks interchangeably with realtime 20 + * extents without doing the otherwise required conversions. 21 + */ 22 + 23 + /* 24 + * Per-task space reservation. 25 + * 26 + * Tasks that need to wait for GC to free up space allocate one of these 27 + * on-stack and add it to the per-mount zi_reclaim_reservations list. 28 + * The GC thread will then wake the tasks in order when space becomes available. 29 + */ 30 + struct xfs_zone_reservation { 31 + struct list_head entry; 32 + struct task_struct *task; 33 + xfs_filblks_t count_fsb; 34 + }; 35 + 36 + /* 37 + * Calculate the number of reserved blocks. 38 + * 39 + * XC_FREE_RTEXTENTS counts the user available capacity, up to which the file 40 + * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly 41 + * available for writes without waiting for GC. 42 + * 43 + * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and 44 + * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS 45 + * is further restricted by at least one zone as well as the optional 46 + * persistently reserved blocks. This allows the allocator to run more 47 + * smoothly by not always triggering GC. 
 */
uint64_t
xfs_zoned_default_resblks(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	switch (ctr) {
	case XC_FREE_RTEXTENTS:
		/*
		 * Reserved zones worth of blocks plus the superblock's
		 * static rt reservation.
		 */
		return (uint64_t)XFS_RESERVED_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks +
			mp->m_sb.sb_rtreserved;
	case XC_FREE_RTAVAILABLE:
		/* Blocks held back for garbage collection zones. */
		return (uint64_t)XFS_GC_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks;
	default:
		ASSERT(0);
		return 0;
	}
}

/*
 * Wake up all tasks currently waiting on the zoned space reservation list
 * (e.g. so that they re-check for shutdown or newly freed space in
 * xfs_zoned_reserve_available).
 */
void
xfs_zoned_resv_wake_all(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_reservation *reservation;

	spin_lock(&zi->zi_reservation_lock);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
		wake_up_process(reservation->task);
	spin_unlock(&zi->zi_reservation_lock);
}

/*
 * Add count_fsb blocks to the XC_FREE_RTAVAILABLE counter and wake waiters,
 * oldest first, as long as the summed counter can cover each waiter's
 * requested reservation.
 */
void
xfs_zoned_add_available(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_reservation *reservation;

	/* Fast path: nobody is waiting, no need to take the lock. */
	if (list_empty_careful(&zi->zi_reclaim_reservations)) {
		xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
		return;
	}

	spin_lock(&zi->zi_reservation_lock);
	xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
	/* Reuse count_fsb to track how much of the total is still uncommitted. */
	count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
		/* Stop at the first waiter the remainder can't satisfy (FIFO). */
		if (reservation->count_fsb > count_fsb)
			break;
		wake_up_process(reservation->task);
		count_fsb -= reservation->count_fsb;

	}
	spin_unlock(&zi->zi_reservation_lock);
}

/*
 * Conditions that abort a space reservation wait: filesystem shutdown
 * (-EIO) or a fatal signal pending on the current task (-EINTR).
 */
static int
xfs_zoned_space_wait_error(
	struct xfs_mount	*mp)
{
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (fatal_signal_pending(current))
		return -EINTR;
	return 0;
}

/*
 * Take count_fsb blocks out of the XC_FREE_RTAVAILABLE counter, sleeping on
 * the reservation list until enough space becomes available unless
 * XFS_ZR_NOWAIT is set.  XFS_ZR_RESERVED allows dipping into the reserved
 * pool and bypasses the wait list.
 */
static int
xfs_zoned_reserve_available(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_reservation reservation = {
		.task		= current,
		.count_fsb	= count_fsb,
	};
	int			error;

	/*
	 * If there are no waiters, try to directly grab the available blocks
	 * from the percpu counter.
	 *
	 * If the caller wants to dip into the reserved pool also bypass the
	 * wait list. This relies on the fact that we have a very graciously
	 * sized reserved pool that always has enough space. If the reserved
	 * allocations fail we're in trouble.
	 */
	if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
	    (flags & XFS_ZR_RESERVED))) {
		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			return error;
	}

	if (flags & XFS_ZR_NOWAIT)
		return -EAGAIN;

	spin_lock(&zi->zi_reservation_lock);
	list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
	while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
		/*
		 * Mark ourselves sleepable before retrying the counter so a
		 * concurrent wake_up_process() after the failed retry is not
		 * lost across the schedule() below.
		 */
		set_current_state(TASK_KILLABLE);

		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			break;

		/*
		 * Make sure to start GC if it is not running already. As we
		 * check the rtavailable count when filling up zones, GC is
		 * normally already running at this point, but in some setups
		 * with very few zones we may completely run out of non-
		 * reserved blocks in between filling zones.
		 */
		if (!xfs_is_zonegc_running(mp))
			wake_up_process(zi->zi_gc_thread);

		/*
		 * If there is no reclaimable group left and we aren't still
		 * processing a pending GC request give up as we're fully out
		 * of space.
		 */
		if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
		    !xfs_is_zonegc_running(mp))
			break;

		/* Drop the lock while sleeping so wakers can walk the list. */
		spin_unlock(&zi->zi_reservation_lock);
		schedule();
		spin_lock(&zi->zi_reservation_lock);
	}
	list_del(&reservation.entry);
	spin_unlock(&zi->zi_reservation_lock);

	__set_current_state(TASK_RUNNING);
	return error;
}

/*
 * Implement greedy space allocation for short writes by trying to grab all
 * that is left after locking out other threads from trying to do the same.
 *
 * This isn't exactly optimal and can hopefully be replaced by a proper
 * percpu_counter primitive one day.
 */
static int
xfs_zoned_reserve_extents_greedy(
	struct xfs_inode	*ip,
	xfs_filblks_t		*count_fsb,
	unsigned int		flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	s64			len = *count_fsb;
	int			error = -ENOSPC;

	spin_lock(&zi->zi_reservation_lock);
	/* Clamp the request to whatever is left; shrink *count_fsb to match. */
	len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
	if (len > 0) {
		*count_fsb = len;
		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
				flags & XFS_ZR_RESERVED);
	}
	spin_unlock(&zi->zi_reservation_lock);
	return error;
}

/*
 * Reserve count_fsb blocks for a zoned write: first from the free rt extent
 * counter, then from the available (writable) pool.  With XFS_ZR_GREEDY a
 * multi-block request may be shrunk to whatever is left rather than failing
 * outright.  On success the reservation is recorded in ac->reserved_blocks;
 * undo via xfs_zoned_space_unreserve.  Returns 0 or a negative errno
 * (-ENOSPC, -EAGAIN for XFS_ZR_NOWAIT, -EIO/-EINTR from an aborted wait).
 */
int
xfs_zoned_space_reserve(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/* The caller must hand in a freshly initialized context. */
	ASSERT(ac->reserved_blocks == 0);
	ASSERT(ac->open_zone == NULL);

	error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
			flags & XFS_ZR_RESERVED);
	if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
		error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
	if (error)
		return error;

	error = xfs_zoned_reserve_available(ip, count_fsb, flags);
	if (error) {
		/* Undo the XC_FREE_RTEXTENTS decrement taken above. */
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
		return error;
	}
	ac->reserved_blocks = count_fsb;
	return 0;
}

/*
 * Release whatever is left of a reservation taken by
 * xfs_zoned_space_reserve: return unused blocks to both counters (waking
 * available-space waiters) and drop the open zone reference, if any.
 */
void
xfs_zoned_space_unreserve(
	struct xfs_inode	*ip,
	struct xfs_zone_alloc_ctx *ac)
{
	if (ac->reserved_blocks > 0) {
		struct xfs_mount	*mp = ip->i_mount;

		xfs_zoned_add_available(mp, ac->reserved_blocks);
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
	}
	if (ac->open_zone)
		xfs_open_zone_put(ac->open_zone);
}