Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'xfs-6.6-fixes-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Chandan Babu:

- Prevent filesystem hang when executing fstrim operations on large and
slow storage

* tag 'xfs-6.6-fixes-3' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
xfs: abort fstrim if kernel is suspending
xfs: reduce AGF hold times during fstrim operations
xfs: move log discard work to xfs_discard.c

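For context, the fstrim path fixed here is driven from userspace through the FITRIM ioctl, which XFS implements in xfs_ioc_trim(). A minimal, illustrative caller is shown below; the mount point and range values are arbitrary examples and are not part of this commit:

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt/xfs", O_RDONLY);	/* example mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = ULLONG_MAX;		/* trim the whole filesystem */
	range.minlen = 0;		/* no minimum extent length */

	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else	/* on success the kernel writes back the bytes trimmed */
		printf("trimmed %llu bytes\n",
				(unsigned long long)range.len);

	close(fd);
	return 0;
}
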
6 files changed: +311 -117
+242 -24
fs/xfs/xfs_discard.c
···
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2010, 2023 Red Hat, Inc.
  * All Rights Reserved.
  */
 #include "xfs.h"
···
 #include "xfs_log.h"
 #include "xfs_ag.h"
 
-STATIC int
-xfs_trim_extents(
+/*
+ * Notes on an efficient, low latency fstrim algorithm
+ *
+ * We need to walk the filesystem free space and issue discards on the free
+ * space that meet the search criteria (size and location). We cannot issue
+ * discards on extents that might be in use, or are so recently in use they are
+ * still marked as busy. To serialise against extent state changes whilst we are
+ * gathering extents to trim, we must hold the AGF lock to lock out other
+ * allocations and extent free operations that might change extent state.
+ *
+ * However, we cannot just hold the AGF for the entire AG free space walk whilst
+ * we issue discards on each free space that is found. Storage devices can have
+ * extremely slow discard implementations (e.g. ceph RBD) and so walking a
+ * couple of million free extents and issuing synchronous discards on each
+ * extent can take a *long* time. Whilst we are doing this walk, nothing else
+ * can access the AGF, and we can stall transactions and hence the log whilst
+ * modifications wait for the AGF lock to be released. This can lead hung tasks
+ * kicking the hung task timer and rebooting the system. This is bad.
+ *
+ * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
+ * lock, gathers a range of inode cluster buffers that are allocated, drops the
+ * AGI lock and then reads all the inode cluster buffers and processes them. It
+ * loops doing this, using a cursor to keep track of where it is up to in the AG
+ * for each iteration to restart the INOBT lookup from.
+ *
+ * We can't do this exactly with free space - once we drop the AGF lock, the
+ * state of the free extent is out of our control and we cannot run a discard
+ * safely on it in this situation. Unless, of course, we've marked the free
+ * extent as busy and undergoing a discard operation whilst we held the AGF
+ * locked.
+ *
+ * This is exactly how online discard works - free extents are marked busy when
+ * they are freed, and once the extent free has been committed to the journal,
+ * the busy extent record is marked as "undergoing discard" and the discard is
+ * then issued on the free extent. Once the discard completes, the busy extent
+ * record is removed and the extent is able to be allocated again.
+ *
+ * In the context of fstrim, if we find a free extent we need to discard, we
+ * don't have to discard it immediately. All we need to do it record that free
+ * extent as being busy and under discard, and all the allocation routines will
+ * now avoid trying to allocate it. Hence if we mark the extent as busy under
+ * the AGF lock, we can safely discard it without holding the AGF lock because
+ * nothing will attempt to allocate that free space until the discard completes.
+ *
+ * This also allows us to issue discards asynchronously like we do with online
+ * discard, and so for fast devices fstrim will run much faster as we can have
+ * multiple discard operations in flight at once, as well as pipeline the free
+ * extent search so that it overlaps in flight discard IO.
+ */
+
+struct workqueue_struct *xfs_discard_wq;
+
+static void
+xfs_discard_endio_work(
+	struct work_struct *work)
+{
+	struct xfs_busy_extents *extents =
+		container_of(work, struct xfs_busy_extents, endio_work);
+
+	xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
+	kmem_free(extents->owner);
+}
+
+/*
+ * Queue up the actual completion to a thread to avoid IRQ-safe locking for
+ * pagb_lock.
+ */
+static void
+xfs_discard_endio(
+	struct bio *bio)
+{
+	struct xfs_busy_extents *extents = bio->bi_private;
+
+	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
+	queue_work(xfs_discard_wq, &extents->endio_work);
+	bio_put(bio);
+}
+
+/*
+ * Walk the discard list and issue discards on all the busy extents in the
+ * list. We plug and chain the bios so that we only need a single completion
+ * call to clear all the busy extents once the discards are complete.
+ */
+int
+xfs_discard_extents(
+	struct xfs_mount *mp,
+	struct xfs_busy_extents *extents)
+{
+	struct xfs_extent_busy *busyp;
+	struct bio *bio = NULL;
+	struct blk_plug plug;
+	int error = 0;
+
+	blk_start_plug(&plug);
+	list_for_each_entry(busyp, &extents->extent_list, list) {
+		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+				busyp->length);
+
+		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+				XFS_FSB_TO_BB(mp, busyp->length),
+				GFP_NOFS, &bio);
+		if (error && error != -EOPNOTSUPP) {
+			xfs_info(mp,
+	 "discard failed for extent [0x%llx,%u], error %d",
+				(unsigned long long)busyp->bno,
+				busyp->length,
+				error);
+			break;
+		}
+	}
+
+	if (bio) {
+		bio->bi_private = extents;
+		bio->bi_end_io = xfs_discard_endio;
+		submit_bio(bio);
+	} else {
+		xfs_discard_endio_work(&extents->endio_work);
+	}
+	blk_finish_plug(&plug);
+
+	return error;
+}
+
+
+static int
+xfs_trim_gather_extents(
 	struct xfs_perag *pag,
 	xfs_daddr_t start,
 	xfs_daddr_t end,
 	xfs_daddr_t minlen,
+	struct xfs_alloc_rec_incore *tcur,
+	struct xfs_busy_extents *extents,
 	uint64_t *blocks_trimmed)
 {
 	struct xfs_mount *mp = pag->pag_mount;
-	struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
 	struct xfs_btree_cur *cur;
 	struct xfs_buf *agbp;
-	struct xfs_agf *agf;
 	int error;
 	int i;
+	int batch = 100;
 
 	/*
 	 * Force out the log. This means any transactions that might have freed
···
 	error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
 	if (error)
 		return error;
-	agf = agbp->b_addr;
 
 	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
 
 	/*
-	 * Look up the longest btree in the AGF and start with it.
+	 * Look up the extent length requested in the AGF and start with it.
 	 */
-	error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
+	if (tcur->ar_startblock == NULLAGBLOCK)
+		error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i);
+	else
+		error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
+				tcur->ar_blockcount, &i);
 	if (error)
 		goto out_del_cursor;
+	if (i == 0) {
+		/* nothing of that length left in the AG, we are done */
+		tcur->ar_blockcount = 0;
+		goto out_del_cursor;
+	}
 
 	/*
 	 * Loop until we are done with all extents that are large
-	 * enough to be worth discarding.
+	 * enough to be worth discarding or we hit batch limits.
 	 */
 	while (i) {
 		xfs_agblock_t fbno;
···
 			error = -EFSCORRUPTED;
 			break;
 		}
-		ASSERT(flen <= be32_to_cpu(agf->agf_longest));
+
+		if (--batch <= 0) {
+			/*
+			 * Update the cursor to point at this extent so we
+			 * restart the next batch from this extent.
+			 */
+			tcur->ar_startblock = fbno;
+			tcur->ar_blockcount = flen;
+			break;
+		}
 
 		/*
 		 * use daddr format for all range/len calculations as that is
···
 		 */
 		if (dlen < minlen) {
 			trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
+			tcur->ar_blockcount = 0;
 			break;
 		}
 
···
 			goto next_extent;
 		}
 
-		trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen);
-		error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
-		if (error)
-			break;
+		xfs_extent_busy_insert_discard(pag, fbno, flen,
+				&extents->extent_list);
 		*blocks_trimmed += flen;
-
 next_extent:
 		error = xfs_btree_decrement(cur, 0, &i);
 		if (error)
 			break;
 
-		if (fatal_signal_pending(current)) {
-			error = -ERESTARTSYS;
-			break;
-		}
+		/*
+		 * If there's no more records in the tree, we are done. Set the
+		 * cursor block count to 0 to indicate to the caller that there
+		 * is no more extents to search.
+		 */
+		if (i == 0)
+			tcur->ar_blockcount = 0;
 	}
 
+	/*
+	 * If there was an error, release all the gathered busy extents because
+	 * we aren't going to issue a discard on them any more.
+	 */
+	if (error)
+		xfs_extent_busy_clear(mp, &extents->extent_list, false);
 out_del_cursor:
 	xfs_btree_del_cursor(cur, error);
 	xfs_buf_relse(agbp);
 	return error;
+}
+
+static bool
+xfs_trim_should_stop(void)
+{
+	return fatal_signal_pending(current) || freezing(current);
+}
+
+/*
+ * Iterate the free list gathering extents and discarding them. We need a cursor
+ * for the repeated iteration of gather/discard loop, so use the longest extent
+ * we found in the last batch as the key to start the next.
+ */
+static int
+xfs_trim_extents(
+	struct xfs_perag *pag,
+	xfs_daddr_t start,
+	xfs_daddr_t end,
+	xfs_daddr_t minlen,
+	uint64_t *blocks_trimmed)
+{
+	struct xfs_alloc_rec_incore tcur = {
+		.ar_blockcount = pag->pagf_longest,
+		.ar_startblock = NULLAGBLOCK,
+	};
+	int error = 0;
+
+	do {
+		struct xfs_busy_extents *extents;
+
+		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
+		if (!extents) {
+			error = -ENOMEM;
+			break;
+		}
+
+		extents->mount = pag->pag_mount;
+		extents->owner = extents;
+		INIT_LIST_HEAD(&extents->extent_list);
+
+		error = xfs_trim_gather_extents(pag, start, end, minlen,
+				&tcur, extents, blocks_trimmed);
+		if (error) {
+			kfree(extents);
+			break;
+		}
+
+		/*
+		 * We hand the extent list to the discard function here so the
+		 * discarded extents can be removed from the busy extent list.
+		 * This allows the discards to run asynchronously with gathering
+		 * the next round of extents to discard.
+		 *
+		 * However, we must ensure that we do not reference the extent
+		 * list after this function call, as it may have been freed by
+		 * the time control returns to us.
+		 */
+		error = xfs_discard_extents(pag->pag_mount, extents);
+		if (error)
+			break;
+
+		if (xfs_trim_should_stop())
+			break;
+
+	} while (tcur.ar_blockcount != 0);
+
+	return error;
+
 }
 
 /*
···
 	for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
 		error = xfs_trim_extents(pag, start, end, minlen,
 				&blocks_trimmed);
-		if (error) {
+		if (error)
 			last_error = error;
-			if (error == -ERESTARTSYS) {
-				xfs_perag_rele(pag);
-				break;
-			}
+
+		if (xfs_trim_should_stop()) {
+			xfs_perag_rele(pag);
+			break;
 		}
 	}
 
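The algorithm notes at the top of this file describe a gather-under-lock, process-unlocked loop driven by a cursor. For readers unfamiliar with the pattern, here is a self-contained userspace sketch of the same idea; everything in it (the mutex, items[], slow_process()) is a made-up stand-in, not kernel code:

#include <pthread.h>
#include <stdio.h>

#define NITEMS	1000
#define BATCH	100

/* the mutex stands in for the AGF lock, items[] for the free space records */
static pthread_mutex_t	lock = PTHREAD_MUTEX_INITIALIZER;
static int		items[NITEMS];
static long		processed;

static void
slow_process(const int *batch, int nr)
{
	/* stands in for issuing (slow) discard I/O, done without the lock */
	for (int i = 0; i < nr; i++)
		processed += batch[i];
}

int
main(void)
{
	int	cursor = 0;	/* remembers where the next gather resumes */

	for (int i = 0; i < NITEMS; i++)
		items[i] = 1;

	while (cursor < NITEMS) {
		int	batch[BATCH];
		int	nr = 0;

		/* gather a bounded batch while holding the lock */
		pthread_mutex_lock(&lock);
		while (cursor < NITEMS && nr < BATCH)
			batch[nr++] = items[cursor++];
		pthread_mutex_unlock(&lock);

		/* the slow work runs unlocked, so other lock users
		 * (allocations, in the XFS case) only ever wait for
		 * one bounded gather pass */
		slow_process(batch, nr);
	}

	printf("processed %ld items in batches of %d\n", processed, BATCH);
	return 0;
}

The point is that the lock is only held for one bounded gather pass at a time, so the latency seen by other lock users stays bounded no matter how slow the processing step is.
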
+4 -2
fs/xfs/xfs_discard.h
···
 #define XFS_DISCARD_H 1
 
 struct fstrim_range;
-struct list_head;
+struct xfs_mount;
+struct xfs_busy_extents;
 
-extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy);
+int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim);
 
 #endif /* XFS_DISCARD_H */
+28 -6
fs/xfs/xfs_extent_busy.c
···
 #include "xfs_log.h"
 #include "xfs_ag.h"
 
-void
-xfs_extent_busy_insert(
-	struct xfs_trans *tp,
+static void
+xfs_extent_busy_insert_list(
 	struct xfs_perag *pag,
 	xfs_agblock_t bno,
 	xfs_extlen_t len,
-	unsigned int flags)
+	unsigned int flags,
+	struct list_head *busy_list)
 {
 	struct xfs_extent_busy *new;
 	struct xfs_extent_busy *busyp;
···
 	new->flags = flags;
 
 	/* trace before insert to be able to see failed inserts */
-	trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len);
+	trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);
 
 	spin_lock(&pag->pagb_lock);
 	rbp = &pag->pagb_tree.rb_node;
···
 	rb_link_node(&new->rb_node, parent, rbp);
 	rb_insert_color(&new->rb_node, &pag->pagb_tree);
 
-	list_add(&new->list, &tp->t_busy);
+	list_add(&new->list, busy_list);
 	spin_unlock(&pag->pagb_lock);
+}
+
+void
+xfs_extent_busy_insert(
+	struct xfs_trans *tp,
+	struct xfs_perag *pag,
+	xfs_agblock_t bno,
+	xfs_extlen_t len,
+	unsigned int flags)
+{
+	xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
+}
+
+void
+xfs_extent_busy_insert_discard(
+	struct xfs_perag *pag,
+	xfs_agblock_t bno,
+	xfs_extlen_t len,
+	struct list_head *busy_list)
+{
+	xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
+			busy_list);
 }
 
 /*
+21 -3
fs/xfs/xfs_extent_busy.h
···
 /*
  * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
  * have been freed but whose transactions aren't committed to disk yet.
- *
- * Note that we use the transaction ID to record the transaction, not the
- * transaction structure itself. See xfs_extent_busy_insert() for details.
  */
 struct xfs_extent_busy {
 	struct rb_node rb_node;	/* ag by-bno indexed search tree */
···
 #define XFS_EXTENT_BUSY_SKIP_DISCARD	0x02	/* do not discard */
 };
 
+/*
+ * List used to track groups of related busy extents all the way through
+ * to discard completion.
+ */
+struct xfs_busy_extents {
+	struct xfs_mount	*mount;
+	struct list_head	extent_list;
+	struct work_struct	endio_work;
+
+	/*
+	 * Owner is the object containing the struct xfs_busy_extents to free
+	 * once the busy extents have been processed. If only the
+	 * xfs_busy_extents object needs freeing, then point this at itself.
+	 */
+	void			*owner;
+};
+
 void
 xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag,
 	xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
+
+void
+xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno,
+	xfs_extlen_t len, struct list_head *busy_list);
 
 void
 xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
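The owner field is used in two ways by this series. Condensed from the xfs_trim_extents() and xlog_cil_committed() hunks elsewhere in this commit (fragments only, not standalone code):

	/* fstrim: a standalone allocation, so the object owns itself and
	 * the discard completion frees just the xfs_busy_extents. */
	extents->mount = pag->pag_mount;
	extents->owner = extents;
	INIT_LIST_HEAD(&extents->extent_list);

	/* CIL checkpoint: the list is embedded in struct xfs_cil_ctx, so the
	 * whole context is freed once the discards complete. */
	ctx->busy_extents.mount = mp;
	ctx->busy_extents.owner = ctx;
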
+13 -80
fs/xfs/xfs_log_cil.c
···
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_trace.h"
-
-struct workqueue_struct *xfs_discard_wq;
+#include "xfs_discard.h"
 
 /*
  * Allocate a new ticket. Failing to get a new ticket makes it really hard to
···
 
 	ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
 	INIT_LIST_HEAD(&ctx->committing);
-	INIT_LIST_HEAD(&ctx->busy_extents);
+	INIT_LIST_HEAD(&ctx->busy_extents.extent_list);
 	INIT_LIST_HEAD(&ctx->log_items);
 	INIT_LIST_HEAD(&ctx->lv_chain);
 	INIT_WORK(&ctx->push_work, xlog_cil_push_work);
···
 
 		if (!list_empty(&cilpcp->busy_extents)) {
 			list_splice_init(&cilpcp->busy_extents,
-					&ctx->busy_extents);
+					&ctx->busy_extents.extent_list);
 		}
 		if (!list_empty(&cilpcp->log_items))
 			list_splice_init(&cilpcp->log_items, &ctx->log_items);
···
 	}
 }
 
-static void
-xlog_discard_endio_work(
-	struct work_struct *work)
-{
-	struct xfs_cil_ctx *ctx =
-		container_of(work, struct xfs_cil_ctx, discard_endio_work);
-	struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
-
-	xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
-	kmem_free(ctx);
-}
-
-/*
- * Queue up the actual completion to a thread to avoid IRQ-safe locking for
- * pagb_lock. Note that we need a unbounded workqueue, otherwise we might
- * get the execution delayed up to 30 seconds for weird reasons.
- */
-static void
-xlog_discard_endio(
-	struct bio *bio)
-{
-	struct xfs_cil_ctx *ctx = bio->bi_private;
-
-	INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
-	queue_work(xfs_discard_wq, &ctx->discard_endio_work);
-	bio_put(bio);
-}
-
-static void
-xlog_discard_busy_extents(
-	struct xfs_mount *mp,
-	struct xfs_cil_ctx *ctx)
-{
-	struct list_head *list = &ctx->busy_extents;
-	struct xfs_extent_busy *busyp;
-	struct bio *bio = NULL;
-	struct blk_plug plug;
-	int error = 0;
-
-	ASSERT(xfs_has_discard(mp));
-
-	blk_start_plug(&plug);
-	list_for_each_entry(busyp, list, list) {
-		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
-					 busyp->length);
-
-		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
-				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
-				XFS_FSB_TO_BB(mp, busyp->length),
-				GFP_NOFS, &bio);
-		if (error && error != -EOPNOTSUPP) {
-			xfs_info(mp,
-	 "discard failed for extent [0x%llx,%u], error %d",
-				(unsigned long long)busyp->bno,
-				busyp->length,
-				error);
-			break;
-		}
-	}
-
-	if (bio) {
-		bio->bi_private = ctx;
-		bio->bi_end_io = xlog_discard_endio;
-		submit_bio(bio);
-	} else {
-		xlog_discard_endio_work(&ctx->discard_endio_work);
-	}
-	blk_finish_plug(&plug);
-}
-
 /*
  * Mark all items committed and clear busy extents. We free the log vector
  * chains in a separate pass so that we unpin the log items as quickly as
···
 	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
 					ctx->start_lsn, abort);
 
-	xfs_extent_busy_sort(&ctx->busy_extents);
-	xfs_extent_busy_clear(mp, &ctx->busy_extents,
+	xfs_extent_busy_sort(&ctx->busy_extents.extent_list);
+	xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list,
 			      xfs_has_discard(mp) && !abort);
 
 	spin_lock(&ctx->cil->xc_push_lock);
···
 
 	xlog_cil_free_logvec(&ctx->lv_chain);
 
-	if (!list_empty(&ctx->busy_extents))
-		xlog_discard_busy_extents(mp, ctx);
-	else
-		kmem_free(ctx);
+	if (!list_empty(&ctx->busy_extents.extent_list)) {
+		ctx->busy_extents.mount = mp;
+		ctx->busy_extents.owner = ctx;
+		xfs_discard_extents(mp, &ctx->busy_extents);
+		return;
+	}
+
+	kmem_free(ctx);
 }
 
 void
+3 -2
fs/xfs/xfs_log_priv.h
···
 #ifndef __XFS_LOG_PRIV_H__
 #define __XFS_LOG_PRIV_H__
 
+#include "xfs_extent_busy.h"	/* for struct xfs_busy_extents */
+
 struct xfs_buf;
 struct xlog;
 struct xlog_ticket;
···
 	struct xlog_in_core	*commit_iclog;
 	struct xlog_ticket	*ticket;	/* chkpt ticket */
 	atomic_t		space_used;	/* aggregate size of regions */
-	struct list_head	busy_extents;	/* busy extents in chkpt */
+	struct xfs_busy_extents	busy_extents;
 	struct list_head	log_items;	/* log items in chkpt */
 	struct list_head	lv_chain;	/* logvecs being pushed */
 	struct list_head	iclog_entry;
 	struct list_head	committing;	/* ctx committing list */
-	struct work_struct	discard_endio_work;
 	struct work_struct	push_work;
 	atomic_t		order_id;
 