// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"

/*
 * Note: the zoned allocator does not support a rtextsize > 1, so this code
 * and the allocator itself use file system blocks interchangeably with
 * realtime extents without doing the otherwise required conversions.
 */

/*
 * Per-task space reservation.
 *
 * Tasks that need to wait for GC to free up space allocate one of these
 * on-stack and add it to the per-mount zi_reclaim_reservations list.
 * The GC thread will then wake the tasks in order when space becomes
 * available.
 */
struct xfs_zone_reservation {
	struct list_head	entry;
	struct task_struct	*task;
	xfs_filblks_t		count_fsb;
};
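
/*
 * Illustrative sketch (not part of the original source): the handshake
 * between a waiting task and the GC thread follows the usual
 * set_current_state()/schedule() pattern.  A waiter queues its on-stack
 * reservation on zi_reclaim_reservations and then loops roughly like this:
 *
 *	set_current_state(TASK_KILLABLE);
 *	if (xfs_dec_freecounter(...) != -ENOSPC)
 *		break;
 *	schedule();
 *
 * while xfs_zoned_add_available() wakes the queued tasks in list (FIFO)
 * order as blocks become available.  See xfs_zoned_reserve_available()
 * below for the real loop.
 */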

/*
 * Calculate the number of reserved blocks.
 *
 * XC_FREE_RTEXTENTS counts the user-available capacity, up to which the
 * file system can be filled, while XC_FREE_RTAVAILABLE counts the blocks
 * instantly available for writes without waiting for GC.
 *
 * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
 * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
 * is further restricted by at least one zone as well as the optional
 * persistently reserved blocks.  This allows the allocator to run more
 * smoothly by not always triggering GC.
 */
uint64_t
xfs_zoned_default_resblks(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	switch (ctr) {
	case XC_FREE_RTEXTENTS:
		return xfs_rtgs_to_rfsbs(mp, XFS_RESERVED_ZONES) +
			mp->m_sb.sb_rtreserved;
	case XC_FREE_RTAVAILABLE:
		return xfs_rtgs_to_rfsbs(mp, XFS_GC_ZONES);
	default:
		ASSERT(0);
		return 0;
	}
}
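
/*
 * Worked example (hypothetical numbers, not from the source): assume
 * XFS_GC_ZONES == 1, XFS_RESERVED_ZONES == 2, and a zone size of 65536
 * blocks.  Then XC_FREE_RTAVAILABLE excludes 1 * 65536 = 65536 blocks for
 * GC and block zeroing, while XC_FREE_RTEXTENTS excludes 2 * 65536 =
 * 131072 blocks plus whatever sb_rtreserved adds on top.
 */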

void
xfs_zoned_resv_wake_all(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	spin_lock(&zi->zi_reservation_lock);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
		wake_up_process(reservation->task);
	spin_unlock(&zi->zi_reservation_lock);
}

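/*
 * Return freed blocks to the available pool and wake waiting reservations.
 *
 * Waiters are woken in list (FIFO) order, and only for as long as the
 * freshly summed XC_FREE_RTAVAILABLE count still covers each waiter's
 * requested block count.
 */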
void
xfs_zoned_add_available(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	if (list_empty_careful(&zi->zi_reclaim_reservations)) {
		xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
		return;
	}

	spin_lock(&zi->zi_reservation_lock);
	xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
	count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
		if (reservation->count_fsb > count_fsb)
			break;
		wake_up_process(reservation->task);
		count_fsb -= reservation->count_fsb;
	}
	spin_unlock(&zi->zi_reservation_lock);
}

static int
xfs_zoned_space_wait_error(
	struct xfs_mount	*mp)
{
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (fatal_signal_pending(current))
		return -EINTR;
	return 0;
}

static int
xfs_zoned_reserve_available(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb,
	unsigned int		flags)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	reservation = {
		.task		= current,
		.count_fsb	= count_fsb,
	};
	int				error;

	/*
	 * If there are no waiters, try to directly grab the available blocks
	 * from the percpu counter.
	 *
	 * If the caller wants to dip into the reserved pool, also bypass the
	 * wait list.  This relies on the fact that we have a generously sized
	 * reserved pool that always has enough space.  If the reserved
	 * allocations fail, we're in trouble.
	 */
	if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
	    (flags & XFS_ZR_RESERVED))) {
		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			return error;
	}

	if (flags & XFS_ZR_NOWAIT)
		return -EAGAIN;

	spin_lock(&zi->zi_reservation_lock);
	list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
	while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
		set_current_state(TASK_KILLABLE);

		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			break;

		/*
		 * Make sure to start GC if it is not running already.  As we
		 * check the rtavailable count when filling up zones, GC is
		 * normally already running at this point, but in some setups
		 * with very few zones we may completely run out of non-
		 * reserved blocks in between filling zones.
		 */
		if (!xfs_is_zonegc_running(mp))
			wake_up_process(zi->zi_gc_thread);

		/*
		 * If there is no reclaimable group left and we aren't still
		 * processing a pending GC request, give up, as we're fully
		 * out of space.
		 */
		if (!xfs_zoned_have_reclaimable(mp->m_zone_info) &&
		    !xfs_is_zonegc_running(mp))
			break;

		spin_unlock(&zi->zi_reservation_lock);
		schedule();
		spin_lock(&zi->zi_reservation_lock);
	}
	list_del(&reservation.entry);
	spin_unlock(&zi->zi_reservation_lock);

	__set_current_state(TASK_RUNNING);
	return error;
}
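
/*
 * Summary of the reservation flags used above (illustrative, derived from
 * the code in this file):
 *
 *	XFS_ZR_NOWAIT	fail with -EAGAIN instead of sleeping on the
 *			reclaim reservation list
 *	XFS_ZR_RESERVED	may dip into the reserved pool and bypasses the
 *			wait list entirely
 *	XFS_ZR_GREEDY	on ENOSPC, shrink the request to whatever is left
 *			(see xfs_zoned_reserve_extents_greedy() below)
 */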

/*
 * Implement greedy space allocation for short writes by trying to grab all
 * that is left after locking out other threads from trying to do the same.
 *
 * This isn't exactly optimal and can hopefully be replaced by a proper
 * percpu_counter primitive one day.
 */
static int
xfs_zoned_reserve_extents_greedy(
	struct xfs_mount	*mp,
	xfs_filblks_t		*count_fsb,
	unsigned int		flags)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	s64			len = *count_fsb;
	int			error = -ENOSPC;

	spin_lock(&zi->zi_reservation_lock);
	len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
	if (len > 0) {
		*count_fsb = len;
		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
				flags & XFS_ZR_RESERVED);
	}
	spin_unlock(&zi->zi_reservation_lock);
	return error;
}
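
/*
 * Example of the greedy path (hypothetical numbers): a caller asking for 16
 * blocks while only 7 are free in XC_FREE_RTEXTENTS has *count_fsb shrunk
 * to 7 and the reservation succeeds, allowing a short write instead of a
 * hard -ENOSPC failure.
 */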

int
xfs_zoned_space_reserve(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx *ac)
{
	int			error;

	ASSERT(ac->reserved_blocks == 0);
	ASSERT(ac->open_zone == NULL);

	error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
			flags & XFS_ZR_RESERVED);
	if (error == -ENOSPC && !(flags & XFS_ZR_NOWAIT)) {
		xfs_inodegc_flush(mp);
		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
				flags & XFS_ZR_RESERVED);
	}
	if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
		error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags);
	if (error)
		return error;

	error = xfs_zoned_reserve_available(mp, count_fsb, flags);
	if (error) {
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
		return error;
	}
	ac->reserved_blocks = count_fsb;
	return 0;
}

void
xfs_zoned_space_unreserve(
	struct xfs_mount	*mp,
	struct xfs_zone_alloc_ctx *ac)
{
	if (ac->reserved_blocks > 0) {
		xfs_zoned_add_available(mp, ac->reserved_blocks);
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
	}
	if (ac->open_zone)
		xfs_open_zone_put(ac->open_zone);
}
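
/*
 * Usage sketch (illustrative only, not part of the original file): a
 * typical write path reserves space up front and always pairs the
 * reservation with an unreserve once the I/O has been mapped.  The
 * function name below is made up; mp, count_fsb, the flags, and the
 * allocation context come from this file's API.
 */
static inline int
xfs_zoned_example_write_reserve(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb)
{
	struct xfs_zone_alloc_ctx ac = { };
	int			error;

	/*
	 * With XFS_ZR_GREEDY the request may be shrunk on ENOSPC;
	 * ac.reserved_blocks holds the amount actually reserved.
	 */
	error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_GREEDY, &ac);
	if (error)
		return error;

	/* ... allocate and write up to ac.reserved_blocks blocks ... */

	xfs_zoned_space_unreserve(mp, &ac);
	return 0;
}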
262}