Merge tag 'xfs-fstrim-busy-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs into xfs-6.6-fixesC

xfs: reduce AGF hold times during fstrim operations

A recent log space overflow and recovery failure was root caused to
a long running truncate blocking on the AGF and ending up pinning
the tail of the log. The filesystem then hung, the machine was
rebooted, and log recovery then refused to run because there wasn't
enough space in the log for EFI transaction reservation.

The reason the long running truncate got blocked on the AGF for so
long was that an fstrim was being run. The underlying block device
was large and very slow (10TB ceph rbd volume) and so discarding all
the free space in the AG took a really long time.

The current fstrim implementation holds the AGF across the entire
operation - both the free space scan and the issuing of all the
discards. The discards are synchronous and single depth, so if there
are millions of free spaces, we hold the AGF lock across millions of
discard operations.
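
In outline, the current code does this (a simplified sketch based on
the code removed in the xfs_discard.c hunks below, with error
handling and the btree walk elided):

	error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);  /* takes the AGF */
	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
	while (i) {
		/* ... look up the next free extent in the by-count btree ... */

		/* synchronous, queue depth 1, with the AGF still held */
		error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
	}
	xfs_btree_del_cursor(cur, error);
	xfs_buf_relse(agbp);	/* AGF lock only released here */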

It doesn't really need to be said that this is a Bad Thing.

This series reworks the fstrim discard path to use the same
mechanisms as online discard. This allows discards to be issued
asynchronously without holding the AGF locked, enabling higher
discard queue depths (much faster on fast devices) and only
requiring the AGF lock to be held whilst we are scanning free space.

To do this, we make use of busy extents - we lock the AGF, mark all
the extents we want to discard as "busy under discard" so that
nothing will be allowed to allocate them, and then drop the AGF
lock. We then issue discards on the gathered busy extents and, on
discard completion, remove them from the busy list.
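
In outline, the per-AG loop becomes a gather/discard cycle (a
simplified sketch of the new xfs_trim_extents() from the diff below;
extent list allocation and the stop/abort checks are elided):

	do {
		/* AGF lock held only inside here, while scanning free space */
		error = xfs_trim_gather_extents(pag, start, end, minlen,
				&tcur, extents, &blocks_trimmed);
		if (error)
			break;

		/*
		 * AGF now unlocked; the gathered extents are marked busy and
		 * under discard, so allocation skips them while the async
		 * discards are in flight.
		 */
		error = xfs_discard_extents(mp, extents);
	} while (tcur.ar_blockcount != 0);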

This results in AGF lock hold times for fstrim dropping to a few
milliseconds per batch of free extents scanned, so the hours-long
hold times that can currently occur on large, slow, badly
fragmented devices no longer occur.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'xfs-fstrim-busy-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs:
xfs: abort fstrim if kernel is suspending
xfs: reduce AGF hold times during fstrim operations
xfs: move log discard work to xfs_discard.c

+311 -117
+242 -24
fs/xfs/xfs_discard.c
···
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2010, 2023 Red Hat, Inc.
  * All Rights Reserved.
  */
 #include "xfs.h"
···
 #include "xfs_log.h"
 #include "xfs_ag.h"
 
+/*
+ * Notes on an efficient, low latency fstrim algorithm
+ *
+ * We need to walk the filesystem free space and issue discards on the free
+ * space that meets the search criteria (size and location). We cannot issue
+ * discards on extents that might be in use, or are so recently in use they
+ * are still marked as busy. To serialise against extent state changes whilst
+ * we are gathering extents to trim, we must hold the AGF lock to lock out
+ * other allocations and extent free operations that might change extent
+ * state.
+ *
+ * However, we cannot just hold the AGF for the entire AG free space walk
+ * whilst we issue discards on each free space that is found. Storage devices
+ * can have extremely slow discard implementations (e.g. ceph RBD) and so
+ * walking a couple of million free extents and issuing synchronous discards
+ * on each extent can take a *long* time. Whilst we are doing this walk,
+ * nothing else can access the AGF, and we can stall transactions and hence
+ * the log whilst modifications wait for the AGF lock to be released. This can
+ * lead to hung tasks kicking the hung task timer and rebooting the system.
+ * This is bad.
+ *
+ * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
+ * lock, gathers a range of inode cluster buffers that are allocated, drops
+ * the AGI lock and then reads all the inode cluster buffers and processes
+ * them. It loops doing this, using a cursor to keep track of where it is up
+ * to in the AG for each iteration to restart the INOBT lookup from.
+ *
+ * We can't do this exactly with free space - once we drop the AGF lock, the
+ * state of the free extent is out of our control and we cannot run a discard
+ * safely on it in this situation. Unless, of course, we've marked the free
+ * extent as busy and undergoing a discard operation whilst we held the AGF
+ * locked.
+ *
+ * This is exactly how online discard works - free extents are marked busy
+ * when they are freed, and once the extent free has been committed to the
+ * journal, the busy extent record is marked as "undergoing discard" and the
+ * discard is then issued on the free extent. Once the discard completes, the
+ * busy extent record is removed and the extent is able to be allocated again.
+ *
+ * In the context of fstrim, if we find a free extent we need to discard, we
+ * don't have to discard it immediately. All we need to do is record that free
+ * extent as being busy and under discard, and all the allocation routines
+ * will now avoid trying to allocate it. Hence if we mark the extent as busy
+ * under the AGF lock, we can safely discard it without holding the AGF lock
+ * because nothing will attempt to allocate that free space until the discard
+ * completes.
+ *
+ * This also allows us to issue discards asynchronously like we do with online
+ * discard, and so for fast devices fstrim will run much faster as we can have
+ * multiple discard operations in flight at once, as well as pipeline the free
+ * extent search so that it overlaps in flight discard IO.
+ */
+
+struct workqueue_struct *xfs_discard_wq;
+
+static void
+xfs_discard_endio_work(
+	struct work_struct	*work)
+{
+	struct xfs_busy_extents	*extents =
+		container_of(work, struct xfs_busy_extents, endio_work);
+
+	xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
+	kmem_free(extents->owner);
+}
+
+/*
+ * Queue up the actual completion to a thread to avoid IRQ-safe locking for
+ * pagb_lock.
+ */
+static void
+xfs_discard_endio(
+	struct bio		*bio)
+{
+	struct xfs_busy_extents	*extents = bio->bi_private;
+
+	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
+	queue_work(xfs_discard_wq, &extents->endio_work);
+	bio_put(bio);
+}
+
+/*
+ * Walk the discard list and issue discards on all the busy extents in the
+ * list. We plug and chain the bios so that we only need a single completion
+ * call to clear all the busy extents once the discards are complete.
+ */
+int
+xfs_discard_extents(
+	struct xfs_mount	*mp,
+	struct xfs_busy_extents	*extents)
+{
+	struct xfs_extent_busy	*busyp;
+	struct bio		*bio = NULL;
+	struct blk_plug		plug;
+	int			error = 0;
+
+	blk_start_plug(&plug);
+	list_for_each_entry(busyp, &extents->extent_list, list) {
+		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+				busyp->length);
+
+		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+				XFS_FSB_TO_BB(mp, busyp->length),
+				GFP_NOFS, &bio);
+		if (error && error != -EOPNOTSUPP) {
+			xfs_info(mp,
+	 "discard failed for extent [0x%llx,%u], error %d",
+				(unsigned long long)busyp->bno,
+				busyp->length,
+				error);
+			break;
+		}
+	}
+
+	if (bio) {
+		bio->bi_private = extents;
+		bio->bi_end_io = xfs_discard_endio;
+		submit_bio(bio);
+	} else {
+		xfs_discard_endio_work(&extents->endio_work);
+	}
+	blk_finish_plug(&plug);
+
+	return error;
+}
+
-STATIC int
-xfs_trim_extents(
+static int
+xfs_trim_gather_extents(
 	struct xfs_perag	*pag,
 	xfs_daddr_t		start,
 	xfs_daddr_t		end,
 	xfs_daddr_t		minlen,
+	struct xfs_alloc_rec_incore *tcur,
+	struct xfs_busy_extents	*extents,
 	uint64_t		*blocks_trimmed)
 {
 	struct xfs_mount	*mp = pag->pag_mount;
-	struct block_device	*bdev = mp->m_ddev_targp->bt_bdev;
 	struct xfs_btree_cur	*cur;
 	struct xfs_buf		*agbp;
-	struct xfs_agf		*agf;
 	int			error;
 	int			i;
+	int			batch = 100;
 
 	/*
 	 * Force out the log. This means any transactions that might have freed
···
 	error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
 	if (error)
 		return error;
-	agf = agbp->b_addr;
 
 	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
 
 	/*
-	 * Look up the longest btree in the AGF and start with it.
+	 * Look up the extent length requested in the AGF and start with it.
 	 */
-	error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
+	if (tcur->ar_startblock == NULLAGBLOCK)
+		error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i);
+	else
+		error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
+				tcur->ar_blockcount, &i);
 	if (error)
 		goto out_del_cursor;
+	if (i == 0) {
+		/* nothing of that length left in the AG, we are done */
+		tcur->ar_blockcount = 0;
+		goto out_del_cursor;
+	}
 
 	/*
 	 * Loop until we are done with all extents that are large
-	 * enough to be worth discarding.
+	 * enough to be worth discarding or we hit batch limits.
 	 */
 	while (i) {
 		xfs_agblock_t	fbno;
···
 			error = -EFSCORRUPTED;
 			break;
 		}
-		ASSERT(flen <= be32_to_cpu(agf->agf_longest));
+
+		if (--batch <= 0) {
+			/*
+			 * Update the cursor to point at this extent so we
+			 * restart the next batch from this extent.
+			 */
+			tcur->ar_startblock = fbno;
+			tcur->ar_blockcount = flen;
+			break;
+		}
 
 		/*
 		 * use daddr format for all range/len calculations as that is
···
 		 */
 		if (dlen < minlen) {
 			trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
+			tcur->ar_blockcount = 0;
 			break;
 		}
···
 			goto next_extent;
 		}
 
-		trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen);
-		error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
-		if (error)
-			break;
+		xfs_extent_busy_insert_discard(pag, fbno, flen,
+				&extents->extent_list);
 		*blocks_trimmed += flen;
-
 next_extent:
 		error = xfs_btree_decrement(cur, 0, &i);
 		if (error)
 			break;
 
-		if (fatal_signal_pending(current)) {
-			error = -ERESTARTSYS;
-			break;
-		}
+		/*
+		 * If there are no more records in the tree, we are done. Set
+		 * the cursor block count to 0 to indicate to the caller that
+		 * there are no more extents to search.
+		 */
+		if (i == 0)
+			tcur->ar_blockcount = 0;
 	}
 
+	/*
+	 * If there was an error, release all the gathered busy extents because
+	 * we aren't going to issue a discard on them any more.
+	 */
+	if (error)
+		xfs_extent_busy_clear(mp, &extents->extent_list, false);
 out_del_cursor:
 	xfs_btree_del_cursor(cur, error);
 	xfs_buf_relse(agbp);
 	return error;
+}
+
+static bool
+xfs_trim_should_stop(void)
+{
+	return fatal_signal_pending(current) || freezing(current);
+}
+
+/*
+ * Iterate the free list gathering extents and discarding them. We need a
+ * cursor for the repeated iteration of the gather/discard loop, so use the
+ * longest extent we found in the last batch as the key to start the next.
+ */
+static int
+xfs_trim_extents(
+	struct xfs_perag	*pag,
+	xfs_daddr_t		start,
+	xfs_daddr_t		end,
+	xfs_daddr_t		minlen,
+	uint64_t		*blocks_trimmed)
+{
+	struct xfs_alloc_rec_incore tcur = {
+		.ar_blockcount = pag->pagf_longest,
+		.ar_startblock = NULLAGBLOCK,
+	};
+	int			error = 0;
+
+	do {
+		struct xfs_busy_extents	*extents;
+
+		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
+		if (!extents) {
+			error = -ENOMEM;
+			break;
+		}
+
+		extents->mount = pag->pag_mount;
+		extents->owner = extents;
+		INIT_LIST_HEAD(&extents->extent_list);
+
+		error = xfs_trim_gather_extents(pag, start, end, minlen,
+				&tcur, extents, blocks_trimmed);
+		if (error) {
+			kfree(extents);
+			break;
+		}
+
+		/*
+		 * We hand the extent list to the discard function here so the
+		 * discarded extents can be removed from the busy extent list.
+		 * This allows the discards to run asynchronously with
+		 * gathering the next round of extents to discard.
+		 *
+		 * However, we must ensure that we do not reference the extent
+		 * list after this function call, as it may have been freed by
+		 * the time control returns to us.
+		 */
+		error = xfs_discard_extents(pag->pag_mount, extents);
+		if (error)
+			break;
+
+		if (xfs_trim_should_stop())
+			break;
+
+	} while (tcur.ar_blockcount != 0);
+
+	return error;
 }
 
 /*
···
 	for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
 		error = xfs_trim_extents(pag, start, end, minlen,
 				&blocks_trimmed);
-		if (error) {
+		if (error)
 			last_error = error;
-			if (error == -ERESTARTSYS) {
-				xfs_perag_rele(pag);
-				break;
-			}
+
+		if (xfs_trim_should_stop()) {
+			xfs_perag_rele(pag);
+			break;
 		}
 	}
+4 -2
fs/xfs/xfs_discard.h
···
 #define XFS_DISCARD_H 1
 
 struct fstrim_range;
-struct list_head;
+struct xfs_mount;
+struct xfs_busy_extents;
 
-extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy);
+int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim);
 
 #endif /* XFS_DISCARD_H */
+28 -6
fs/xfs/xfs_extent_busy.c
···
 #include "xfs_log.h"
 #include "xfs_ag.h"
 
-void
-xfs_extent_busy_insert(
-	struct xfs_trans	*tp,
+static void
+xfs_extent_busy_insert_list(
 	struct xfs_perag	*pag,
 	xfs_agblock_t		bno,
 	xfs_extlen_t		len,
-	unsigned int		flags)
+	unsigned int		flags,
+	struct list_head	*busy_list)
 {
 	struct xfs_extent_busy	*new;
 	struct xfs_extent_busy	*busyp;
···
 	new->flags = flags;
 
 	/* trace before insert to be able to see failed inserts */
-	trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len);
+	trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);
 
 	spin_lock(&pag->pagb_lock);
 	rbp = &pag->pagb_tree.rb_node;
···
 	rb_link_node(&new->rb_node, parent, rbp);
 	rb_insert_color(&new->rb_node, &pag->pagb_tree);
 
-	list_add(&new->list, &tp->t_busy);
+	list_add(&new->list, busy_list);
 	spin_unlock(&pag->pagb_lock);
+}
+
+void
+xfs_extent_busy_insert(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	unsigned int		flags)
+{
+	xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
+}
+
+void
+xfs_extent_busy_insert_discard(
+	struct xfs_perag	*pag,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	struct list_head	*busy_list)
+{
+	xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
+			busy_list);
 }
 
 /*
+21 -3
fs/xfs/xfs_extent_busy.h
···
 /*
  * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
  * have been freed but whose transactions aren't committed to disk yet.
- *
- * Note that we use the transaction ID to record the transaction, not the
- * transaction structure itself. See xfs_extent_busy_insert() for details.
  */
 struct xfs_extent_busy {
 	struct rb_node	rb_node;	/* ag by-bno indexed search tree */
···
 #define XFS_EXTENT_BUSY_SKIP_DISCARD	0x02	/* do not discard */
 };
 
+/*
+ * List used to track groups of related busy extents all the way through
+ * to discard completion.
+ */
+struct xfs_busy_extents {
+	struct xfs_mount	*mount;
+	struct list_head	extent_list;
+	struct work_struct	endio_work;
+
+	/*
+	 * Owner is the object containing the struct xfs_busy_extents to free
+	 * once the busy extents have been processed. If only the
+	 * xfs_busy_extents object needs freeing, then point this at itself.
+	 */
+	void			*owner;
+};
+
 void
 xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag,
 	xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
+
+void
+xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno,
+	xfs_extlen_t len, struct list_head *busy_list);
 
 void
 xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
+13 -80
fs/xfs/xfs_log_cil.c
···
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_trace.h"
-
-struct workqueue_struct *xfs_discard_wq;
+#include "xfs_discard.h"
 
 /*
  * Allocate a new ticket. Failing to get a new ticket makes it really hard to
···
 
 	ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
 	INIT_LIST_HEAD(&ctx->committing);
-	INIT_LIST_HEAD(&ctx->busy_extents);
+	INIT_LIST_HEAD(&ctx->busy_extents.extent_list);
 	INIT_LIST_HEAD(&ctx->log_items);
 	INIT_LIST_HEAD(&ctx->lv_chain);
 	INIT_WORK(&ctx->push_work, xlog_cil_push_work);
···
 
 	if (!list_empty(&cilpcp->busy_extents)) {
 		list_splice_init(&cilpcp->busy_extents,
-				&ctx->busy_extents);
+				&ctx->busy_extents.extent_list);
 	}
 	if (!list_empty(&cilpcp->log_items))
 		list_splice_init(&cilpcp->log_items, &ctx->log_items);
···
 	}
 }
 
-static void
-xlog_discard_endio_work(
-	struct work_struct	*work)
-{
-	struct xfs_cil_ctx	*ctx =
-		container_of(work, struct xfs_cil_ctx, discard_endio_work);
-	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;
-
-	xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
-	kmem_free(ctx);
-}
-
-/*
- * Queue up the actual completion to a thread to avoid IRQ-safe locking for
- * pagb_lock. Note that we need a unbounded workqueue, otherwise we might
- * get the execution delayed up to 30 seconds for weird reasons.
- */
-static void
-xlog_discard_endio(
-	struct bio		*bio)
-{
-	struct xfs_cil_ctx	*ctx = bio->bi_private;
-
-	INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
-	queue_work(xfs_discard_wq, &ctx->discard_endio_work);
-	bio_put(bio);
-}
-
-static void
-xlog_discard_busy_extents(
-	struct xfs_mount	*mp,
-	struct xfs_cil_ctx	*ctx)
-{
-	struct list_head	*list = &ctx->busy_extents;
-	struct xfs_extent_busy	*busyp;
-	struct bio		*bio = NULL;
-	struct blk_plug		plug;
-	int			error = 0;
-
-	ASSERT(xfs_has_discard(mp));
-
-	blk_start_plug(&plug);
-	list_for_each_entry(busyp, list, list) {
-		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
-					 busyp->length);
-
-		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
-				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
-				XFS_FSB_TO_BB(mp, busyp->length),
-				GFP_NOFS, &bio);
-		if (error && error != -EOPNOTSUPP) {
-			xfs_info(mp,
-	 "discard failed for extent [0x%llx,%u], error %d",
-				(unsigned long long)busyp->bno,
-				busyp->length,
-				error);
-			break;
-		}
-	}
-
-	if (bio) {
-		bio->bi_private = ctx;
-		bio->bi_end_io = xlog_discard_endio;
-		submit_bio(bio);
-	} else {
-		xlog_discard_endio_work(&ctx->discard_endio_work);
-	}
-	blk_finish_plug(&plug);
-}
-
 /*
  * Mark all items committed and clear busy extents. We free the log vector
  * chains in a separate pass so that we unpin the log items as quickly as
···
 	xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
 					ctx->start_lsn, abort);
 
-	xfs_extent_busy_sort(&ctx->busy_extents);
-	xfs_extent_busy_clear(mp, &ctx->busy_extents,
+	xfs_extent_busy_sort(&ctx->busy_extents.extent_list);
+	xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list,
 			      xfs_has_discard(mp) && !abort);
 
 	spin_lock(&ctx->cil->xc_push_lock);
···
 
 	xlog_cil_free_logvec(&ctx->lv_chain);
 
-	if (!list_empty(&ctx->busy_extents))
-		xlog_discard_busy_extents(mp, ctx);
-	else
-		kmem_free(ctx);
+	if (!list_empty(&ctx->busy_extents.extent_list)) {
+		ctx->busy_extents.mount = mp;
+		ctx->busy_extents.owner = ctx;
+		xfs_discard_extents(mp, &ctx->busy_extents);
+		return;
+	}
+
+	kmem_free(ctx);
 }
 
 void
+3 -2
fs/xfs/xfs_log_priv.h
···
 #ifndef __XFS_LOG_PRIV_H__
 #define __XFS_LOG_PRIV_H__
 
+#include "xfs_extent_busy.h"	/* for struct xfs_busy_extents */
+
 struct xfs_buf;
 struct xlog;
 struct xlog_ticket;
···
 	struct xlog_in_core	*commit_iclog;
 	struct xlog_ticket	*ticket;	/* chkpt ticket */
 	atomic_t		space_used;	/* aggregate size of regions */
-	struct list_head	busy_extents;	/* busy extents in chkpt */
+	struct xfs_busy_extents	busy_extents;
 	struct list_head	log_items;	/* log items in chkpt */
 	struct list_head	lv_chain;	/* logvecs being pushed */
 	struct list_head	iclog_entry;
 	struct list_head	committing;	/* ctx committing list */
-	struct work_struct	discard_endio_work;
 	struct work_struct	push_work;
 	atomic_t		order_id;