Merge tag 'xfs-5.12-merge-6' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull more xfs updates from Darrick Wong:
"The most notable fix here prevents premature reuse of freed metadata
blocks, and adding the ability to detect accidental nested
transactions, which are not allowed here.

- Restore a disused sysctl control knob that was inadvertently
dropped during the merge window, to avoid fstests regressions.

- Don't speculatively release freed blocks from the busy list until
we're actually allocating them, which fixes a rare log recovery
regression.

- Don't nest transactions when scanning for free space.

- Add an idiot^Wmaintainer light to detect nested transactions. ;)"

* tag 'xfs-5.12-merge-6' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
xfs: use current->journal_info for detecting transaction recursion
xfs: don't nest transactions when scanning for eofblocks
xfs: don't reuse busy extents on extent trim
xfs: restore speculative_cow_prealloc_lifetime sysctl
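
The nested-transaction detection in this series keys off current->journal_info:
XFS points it at the active transaction for that transaction's lifetime and
warns if writeback or another transaction starts while it is still set. Below
is a minimal userspace sketch of that pattern, assuming a thread-local pointer
stands in for current->journal_info; the demo_* names are illustrative only,
and the real helpers are the xfs_trans_set_context()/xfs_trans_clear_context()
inlines added to fs/xfs/xfs_trans.h further down.

/*
 * Minimal userspace sketch of the journal_info recursion check.  A
 * thread-local pointer stands in for current->journal_info; demo_trans,
 * demo_trans_set_context(), demo_trans_clear_context() and demo_writeback()
 * are made-up names for illustration, not kernel APIs.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

static __thread void *journal_info;     /* models current->journal_info */

struct demo_trans { int id; };

static void demo_trans_set_context(struct demo_trans *tp)
{
        assert(journal_info == NULL);   /* catches accidental nesting */
        journal_info = tp;
}

static void demo_trans_clear_context(struct demo_trans *tp)
{
        if (journal_info == tp)
                journal_info = NULL;
}

static int demo_writeback(void)
{
        /* Writing back data inside a transaction could recurse; bail out. */
        if (journal_info) {
                fprintf(stderr, "writeback entered in transaction context\n");
                return -1;
        }
        return 0;
}

int main(void)
{
        struct demo_trans tp = { .id = 1 };

        demo_trans_set_context(&tp);
        demo_writeback();               /* refused: a transaction is active */
        demo_trans_clear_context(&tp);
        return demo_writeback();        /* allowed: no transaction */
}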

8 files changed, 94 insertions(+), 70 deletions(-)
Documentation/admin-guide/xfs.rst | +10 -6

@@ -284,6 +284,9 @@
         removes unused preallocation from clean inodes and releases
         the unused space back to the free pool.
 
+  fs.xfs.speculative_cow_prealloc_lifetime
+        This is an alias for speculative_prealloc_lifetime.
+
   fs.xfs.error_level            (Min: 0  Default: 3  Max: 11)
         A volume knob for error reporting when internal errors occur.
         This will generate detailed messages & backtraces for filesystem
@@ -359,12 +356,13 @@
 Deprecated Sysctls
 ==================
 
-===========================                     ================
-  Name                                          Removal Schedule
-===========================                     ================
-fs.xfs.irix_sgid_inherit                        September 2025
-fs.xfs.irix_symlink_mode                        September 2025
-===========================                     ================
+===========================================     ================
+  Name                                          Removal Schedule
+===========================================     ================
+fs.xfs.irix_sgid_inherit                        September 2025
+fs.xfs.irix_symlink_mode                        September 2025
+fs.xfs.speculative_cow_prealloc_lifetime        September 2025
+===========================================     ================
 
 
 Removed Sysctls
fs/iomap/buffered-io.c | -7

@@ -1459,13 +1459,6 @@
                 goto redirty;
 
         /*
-         * Given that we do not allow direct reclaim to call us, we should
-         * never be called in a recursive filesystem reclaim context.
-         */
-        if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
-                goto redirty;
-
-        /*
          * Is this page beyond the end of the file?
          *
          * The page index is less than the end_index, adjust the end_offset
fs/xfs/libxfs/xfs_btree.c | +10 -2

@@ -2805,7 +2805,7 @@
         struct xfs_btree_split_args *args = container_of(work,
                                                 struct xfs_btree_split_args, work);
         unsigned long           pflags;
-        unsigned long           new_pflags = PF_MEMALLOC_NOFS;
+        unsigned long           new_pflags = 0;
 
         /*
          * we are in a transaction context here, but may also be doing work
@@ -2817,13 +2817,21 @@
                 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 
         current_set_flags_nested(&pflags, new_pflags);
+        xfs_trans_set_context(args->cur->bc_tp);
 
         args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
                                          args->key, args->curp, args->stat);
+
+        xfs_trans_clear_context(args->cur->bc_tp);
+        current_restore_flags_nested(&pflags, new_pflags);
+
+        /*
+         * Do not access args after complete() has run here. We don't own args
+         * and the owner may run and free args before we return here.
+         */
         complete(args->done);
 
-        current_restore_flags_nested(&pflags, new_pflags);
 
 }
 
 /*
fs/xfs/xfs_aops.c | +15 -2

@@ -62,7 +62,7 @@
          * We hand off the transaction to the completion thread now, so
          * clear the flag here.
          */
-        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+        xfs_trans_clear_context(tp);
         return 0;
 }
 
@@ -125,7 +125,7 @@
          * thus we need to mark ourselves as being in a transaction manually.
          * Similarly for freeze protection.
          */
-        current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+        xfs_trans_set_context(tp);
         __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
         /* we abort the update if there was an IO error */
@@ -568,6 +568,12 @@
 {
         struct xfs_writepage_ctx wpc = { };
 
+        if (WARN_ON_ONCE(current->journal_info)) {
+                redirty_page_for_writepage(wbc, page);
+                unlock_page(page);
+                return 0;
+        }
+
         return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
 
@@ -577,6 +583,13 @@
         struct writeback_control *wbc)
 {
         struct xfs_writepage_ctx wpc = { };
+
+        /*
+         * Writing back data in a transaction context can result in recursive
+         * transactions. This is bad, so issue a warning and get out of here.
+         */
+        if (WARN_ON_ONCE(current->journal_info))
+                return 0;
 
         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
         return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
fs/xfs/xfs_extent_busy.c | -14

@@ -344,7 +344,6 @@
         ASSERT(*len > 0);
 
         spin_lock(&args->pag->pagb_lock);
-restart:
         fbno = *bno;
         flen = *len;
         rbp = args->pag->pagb_tree.rb_node;
@@ -360,19 +359,6 @@
                         continue;
                 } else if (fbno >= bend) {
                         rbp = rbp->rb_right;
-                        continue;
-                }
-
-                /*
-                 * If this is a metadata allocation, try to reuse the busy
-                 * extent instead of trimming the allocation.
-                 */
-                if (!(args->datatype & XFS_ALLOC_USERDATA) &&
-                    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
-                        if (!xfs_extent_busy_update_extent(args->mp, args->pag,
-                                                           busyp, fbno, flen,
-                                                           false))
-                                goto restart;
                         continue;
                 }
 
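
With the reuse path removed, xfs_extent_busy_trim() only ever shrinks the
candidate free range so that it no longer overlaps a busy extent, and freed
blocks stay off-limits until the busy list releases them. The standalone
sketch below shows the trimming arithmetic against a single busy extent under
stated assumptions: trim_around_busy() and the uint64_t stand-in for
xfs_agblock_t are made up for illustration, and the keep-the-larger-piece
choice for a middle overlap is this sketch's policy, not necessarily the
kernel's.

/*
 * Simplified sketch of trimming a candidate free range [*bno, *bno + *len)
 * around one busy extent [bbno, bend).  The kernel walks an rbtree of busy
 * extents; this version only shows the overlap arithmetic for one of them.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t xfs_agblock_t;         /* stand-in for the kernel typedef */

static void trim_around_busy(xfs_agblock_t *bno, xfs_agblock_t *len,
                             xfs_agblock_t bbno, xfs_agblock_t bend)
{
        xfs_agblock_t fbno = *bno, fend = *bno + *len;

        if (bend <= fbno || bbno >= fend)
                return;                 /* no overlap, nothing to trim */

        if (bbno <= fbno && bend >= fend) {
                *len = 0;               /* fully covered by the busy extent */
                return;
        }

        if (bbno <= fbno) {
                *bno = bend;            /* busy overlaps the front */
                *len = fend - bend;
        } else if (bend >= fend) {
                *len = bbno - fbno;     /* busy overlaps the tail */
        } else if (bbno - fbno >= fend - bend) {
                *len = bbno - fbno;     /* busy in the middle: keep front */
        } else {
                *bno = bend;            /* busy in the middle: keep tail */
                *len = fend - bend;
        }
}

int main(void)
{
        xfs_agblock_t bno = 100, len = 50;

        trim_around_busy(&bno, &len, 120, 130); /* busy in the middle */
        printf("trimmed to [%llu, +%llu)\n",
               (unsigned long long)bno, (unsigned long long)len);
        return 0;
}
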
fs/xfs/xfs_sysctl.c | +14 -21

@@ -51,7 +51,7 @@
 #endif /* CONFIG_PROC_FS */
 
 STATIC int
-xfs_deprecate_irix_sgid_inherit_proc_handler(
+xfs_deprecated_dointvec_minmax(
         struct ctl_table        *ctl,
         int                     write,
         void                    *buffer,
@@ -59,24 +59,8 @@
         loff_t                  *ppos)
 {
         if (write) {
-                printk_once(KERN_WARNING
-                                "XFS: " "%s sysctl option is deprecated.\n",
-                                ctl->procname);
-        }
-        return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
-}
-
-STATIC int
-xfs_deprecate_irix_symlink_mode_proc_handler(
-        struct ctl_table        *ctl,
-        int                     write,
-        void                    *buffer,
-        size_t                  *lenp,
-        loff_t                  *ppos)
-{
-        if (write) {
-                printk_once(KERN_WARNING
-                                "XFS: " "%s sysctl option is deprecated.\n",
+                printk_ratelimited(KERN_WARNING
+                                "XFS: %s sysctl option is deprecated.\n",
                                 ctl->procname);
         }
         return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
@@ -88,7 +72,7 @@
                 .data           = &xfs_params.sgid_inherit.val,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-                .proc_handler   = xfs_deprecate_irix_sgid_inherit_proc_handler,
+                .proc_handler   = xfs_deprecated_dointvec_minmax,
                 .extra1         = &xfs_params.sgid_inherit.min,
                 .extra2         = &xfs_params.sgid_inherit.max
         },
@@ -97,7 +81,7 @@
                 .data           = &xfs_params.symlink_mode.val,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-                .proc_handler   = xfs_deprecate_irix_symlink_mode_proc_handler,
+                .proc_handler   = xfs_deprecated_dointvec_minmax,
                 .extra1         = &xfs_params.symlink_mode.min,
                 .extra2         = &xfs_params.symlink_mode.max
         },
@@ -198,6 +182,15 @@
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &xfs_params.blockgc_timer.min,
+                .extra2         = &xfs_params.blockgc_timer.max,
+        },
+        {
+                .procname       = "speculative_cow_prealloc_lifetime",
+                .data           = &xfs_params.blockgc_timer.val,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = xfs_deprecated_dointvec_minmax,
                 .extra1         = &xfs_params.blockgc_timer.min,
                 .extra2         = &xfs_params.blockgc_timer.max,
         },
fs/xfs/xfs_trans.c | +15 -18

@@ -72,6 +72,7 @@
         xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
 
         trace_xfs_trans_free(tp, _RET_IP_);
+        xfs_trans_clear_context(tp);
         if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
                 sb_end_intwrite(tp->t_mountp->m_super);
         xfs_trans_free_dqinfo(tp);
@@ -123,7 +124,8 @@
 
         ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
         tp->t_rtx_res = tp->t_rtx_res_used;
-        ntp->t_pflags = tp->t_pflags;
+
+        xfs_trans_switch_context(tp, ntp);
 
         /* move deferred ops over to the new tp */
         xfs_defer_move(ntp, tp);
@@ -157,9 +159,6 @@
         int                     error = 0;
         bool                    rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
-        /* Mark this thread as being in a transaction */
-        current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
         /*
          * Attempt to reserve the needed disk blocks by decrementing
          * the number needed from the number available. This will
@@ -167,9 +166,7 @@
          */
         if (blocks > 0) {
                 error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
-                if (error != 0) {
-                        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+                if (error != 0)
                         return -ENOSPC;
-                }
                 tp->t_blk_res += blocks;
         }
@@ -244,8 +241,5 @@
                 xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
                 tp->t_blk_res = 0;
         }
-
-        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
         return error;
 }
@@ -260,6 +254,7 @@
         struct xfs_trans        **tpp)
 {
         struct xfs_trans        *tp;
+        bool                    want_retry = true;
         int                     error;
 
         /*
@@ -267,9 +262,11 @@
          * GFP_NOFS allocation context so that we avoid lockdep false positives
          * by doing GFP_KERNEL allocations inside sb_start_intwrite().
          */
+retry:
         tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
         if (!(flags & XFS_TRANS_NO_WRITECOUNT))
                 sb_start_intwrite(mp->m_super);
+        xfs_trans_set_context(tp);
 
         /*
          * Zero-reservation ("empty") transactions can't modify anything, so
@@ -289,7 +286,9 @@
         tp->t_firstblock = NULLFSBLOCK;
 
         error = xfs_trans_reserve(tp, resp, blocks, rtextents);
-        if (error == -ENOSPC) {
+        if (error == -ENOSPC && want_retry) {
+                xfs_trans_cancel(tp);
+
                 /*
                  * We weren't able to reserve enough space for the transaction.
                  * Flush the other speculative space allocations to free space.
@@ -297,8 +296,11 @@
                  * other locks.
                  */
                 error = xfs_blockgc_free_space(mp, NULL);
-                if (!error)
-                        error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+                if (error)
+                        return error;
+
+                want_retry = false;
+                goto retry;
         }
         if (error) {
                 xfs_trans_cancel(tp);
@@ -893,7 +895,6 @@
 
         xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
 
-        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
         xfs_trans_free(tp);
 
         /*
@@ -925,6 +926,5 @@
                 xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
                 tp->t_ticket = NULL;
         }
-        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
         xfs_trans_free_items(tp, !!error);
         xfs_trans_free(tp);
@@ -984,9 +984,6 @@
                 xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
                 tp->t_ticket = NULL;
         }
-
-        /* mark this thread as no longer being in a transaction */
-        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 
         xfs_trans_free_items(tp, dirty);
         xfs_trans_free(tp);
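
The xfs_trans_alloc() hunks above replace the old "reserve again in place"
fallback with cancel, flush, and retry exactly once, so the blockgc scan never
runs with a transaction still attached to the task. Below is a compact
userspace sketch of that control flow, with hypothetical demo_* helpers
standing in for the reservation and blockgc calls; it only models the retry
logic, not the real locking or accounting.

/*
 * Sketch of the "cancel, reclaim, retry once" pattern: the first ENOSPC
 * failure tears the transaction down, frees speculative preallocations,
 * and retries exactly once.  demo_* names are illustrative, not kernel APIs.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int demo_reserve(int attempt)
{
        /* pretend space only becomes available after garbage collection */
        return attempt == 0 ? -ENOSPC : 0;
}

static int demo_gc_free_space(void)
{
        puts("flushing speculative preallocations");
        return 0;
}

static int demo_trans_alloc(void)
{
        bool want_retry = true;
        int attempt = 0;
        int error;

retry:
        error = demo_reserve(attempt++);
        if (error == -ENOSPC && want_retry) {
                /* drop everything before blocking on reclaim, then retry */
                error = demo_gc_free_space();
                if (error)
                        return error;
                want_retry = false;
                goto retry;
        }
        return error;
}

int main(void)
{
        return demo_trans_alloc();
}
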
fs/xfs/xfs_trans.h | +30

@@ -281,4 +281,34 @@
                 struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
                 struct xfs_trans **tpp);
 
+static inline void
+xfs_trans_set_context(
+        struct xfs_trans        *tp)
+{
+        ASSERT(current->journal_info == NULL);
+        tp->t_pflags = memalloc_nofs_save();
+        current->journal_info = tp;
+}
+
+static inline void
+xfs_trans_clear_context(
+        struct xfs_trans        *tp)
+{
+        if (current->journal_info == tp) {
+                memalloc_nofs_restore(tp->t_pflags);
+                current->journal_info = NULL;
+        }
+}
+
+static inline void
+xfs_trans_switch_context(
+        struct xfs_trans        *old_tp,
+        struct xfs_trans        *new_tp)
+{
+        ASSERT(current->journal_info == old_tp);
+        new_tp->t_pflags = old_tp->t_pflags;
+        old_tp->t_pflags = 0;
+        current->journal_info = new_tp;
+}
+
 #endif  /* __XFS_TRANS_H__ */