Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfs: multithreaded iwalk implementation

Create a parallel iwalk implementation and switch quotacheck to use it.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>

+325 -1
+1
fs/xfs/Makefile
··· 85 85 xfs_message.o \ 86 86 xfs_mount.o \ 87 87 xfs_mru_cache.o \ 88 + xfs_pwork.o \ 88 89 xfs_reflink.o \ 89 90 xfs_stats.o \ 90 91 xfs_super.o \
+3
fs/xfs/xfs_globals.c
··· 40 40 #else 41 41 .bug_on_assert = false, /* assert failures WARN() */ 42 42 #endif 43 + #ifdef DEBUG 44 + .pwork_threads = -1, /* automatic thread detection */ 45 + #endif 43 46 };
+82
fs/xfs/xfs_iwalk.c
··· 20 20 #include "xfs_icache.h" 21 21 #include "xfs_health.h" 22 22 #include "xfs_trans.h" 23 + #include "xfs_pwork.h" 23 24 24 25 /* 25 26 * Walking Inodes in the Filesystem ··· 46 45 */ 47 46 48 47 struct xfs_iwalk_ag { 48 + /* parallel work control data; will be null if single threaded */ 49 + struct xfs_pwork pwork; 50 + 49 51 struct xfs_mount *mp; 50 52 struct xfs_trans *tp; 51 53 ··· 186 182 187 183 trace_xfs_iwalk_ag_rec(mp, agno, irec); 188 184 185 + if (xfs_pwork_want_abort(&iwag->pwork)) 186 + return 0; 187 + 189 188 if (iwag->inobt_walk_fn) { 190 189 error = iwag->inobt_walk_fn(mp, tp, agno, irec, 191 190 iwag->data); ··· 200 193 continue; 201 194 202 195 for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { 196 + if (xfs_pwork_want_abort(&iwag->pwork)) 197 + return 0; 198 + 203 199 /* Skip if this inode is free */ 204 200 if (XFS_INOBT_MASK(j) & irec->ir_free) 205 201 continue; ··· 397 387 struct xfs_inobt_rec_incore *irec; 398 388 399 389 cond_resched(); 390 + if (xfs_pwork_want_abort(&iwag->pwork)) 391 + goto out; 400 392 401 393 /* Fetch the inobt record. */ 402 394 irec = &iwag->recs[iwag->nr_recs]; ··· 532 520 .sz_recs = xfs_iwalk_prefetch(inode_records), 533 521 .trim_start = 1, 534 522 .skip_empty = 1, 523 + .pwork = XFS_PWORK_SINGLE_THREADED, 535 524 }; 536 525 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); 537 526 int error; ··· 552 539 553 540 xfs_iwalk_free(&iwag); 554 541 return error; 542 + } 543 + 544 + /* Run per-thread iwalk work. */
545 + static int 546 + xfs_iwalk_ag_work( 547 + struct xfs_mount *mp, 548 + struct xfs_pwork *pwork) 549 + { 550 + struct xfs_iwalk_ag *iwag; 551 + int error = 0; 552 + 553 + iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); 554 + if (xfs_pwork_want_abort(pwork)) 555 + goto out; 556 + 557 + error = xfs_iwalk_alloc(iwag); 558 + if (error) 559 + goto out; 560 + 561 + error = xfs_iwalk_ag(iwag); 562 + xfs_iwalk_free(iwag); 563 + out: 564 + kmem_free(iwag); 565 + return error; 566 + } 567 + 568 + /* 569 + * Walk all the inodes in the filesystem using multiple threads to process each 570 + * AG. 571 + */ 572 + int 573 + xfs_iwalk_threaded( 574 + struct xfs_mount *mp, 575 + xfs_ino_t startino, 576 + xfs_iwalk_fn iwalk_fn, 577 + unsigned int inode_records, 578 + void *data) 579 + { 580 + struct xfs_pwork_ctl pctl; 581 + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); 582 + unsigned int nr_threads; 583 + int error; 584 + 585 + ASSERT(agno < mp->m_sb.sb_agcount); 586 + 587 + nr_threads = xfs_pwork_guess_datadev_parallelism(mp); 588 + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", 589 + nr_threads); 590 + if (error) 591 + return error; 592 + 593 + for (; agno < mp->m_sb.sb_agcount; agno++) { 594 + struct xfs_iwalk_ag *iwag; 595 + 596 + if (xfs_pwork_ctl_want_abort(&pctl)) 597 + break; 598 + 599 + iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); 600 + iwag->mp = mp; 601 + iwag->iwalk_fn = iwalk_fn; 602 + iwag->data = data; 603 + iwag->startino = startino; 604 + iwag->sz_recs = xfs_iwalk_prefetch(inode_records); 605 + xfs_pwork_queue(&pctl, &iwag->pwork); 606 + startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); 607 + } 608 + 609 + return xfs_pwork_destroy(&pctl); 555 610 } 556 611 557 612 /* ··· 682 601 .data = data, 683 602 .startino = startino, 684 603 .sz_recs = xfs_inobt_walk_prefetch(inobt_records), 604 + .pwork = XFS_PWORK_SINGLE_THREADED, 685 605 }; 686 606 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); 687 607 int error;
+2
fs/xfs/xfs_iwalk.h
··· 15 15 16 16 int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, 17 17 xfs_iwalk_fn iwalk_fn, unsigned int inode_records, void *data); 18 + int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, 19 + xfs_iwalk_fn iwalk_fn, unsigned int inode_records, void *data); 18 20 19 21 /* Walk all inode btree records in the filesystem starting from @startino. */ 20 22 typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+117
fs/xfs/xfs_pwork.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (C) 2019 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_fs.h" 8 + #include "xfs_shared.h" 9 + #include "xfs_format.h" 10 + #include "xfs_log_format.h" 11 + #include "xfs_trans_resv.h" 12 + #include "xfs_mount.h" 13 + #include "xfs_trace.h" 14 + #include "xfs_sysctl.h" 15 + #include "xfs_pwork.h" 16 + 17 + /* 18 + * Parallel Work Queue 19 + * =================== 20 + * 21 + * Abstract away the details of running a large and "obviously" parallelizable 22 + * task across multiple CPUs. Callers initialize the pwork control object with 23 + * a desired level of parallelization and a work function. Next, they embed 24 + * struct xfs_pwork in whatever structure they use to pass work context to a 25 + * worker thread and queue that pwork. The work function will be passed the 26 + * pwork item when it is run (from process context) and any returned error will 27 + * be recorded in xfs_pwork_ctl.error. Work functions should check for errors 28 + * and abort if necessary; the non-zeroness of xfs_pwork_ctl.error does not 29 + * stop workqueue item processing. 30 + * 31 + * This is the rough equivalent of the xfsprogs workqueue code, though we can't 32 + * reuse that name here. 33 + */ 34 + 35 + /* Invoke our caller's function. */ 36 + static void 37 + xfs_pwork_work( 38 + struct work_struct *work) 39 + { 40 + struct xfs_pwork *pwork; 41 + struct xfs_pwork_ctl *pctl; 42 + int error; 43 + 44 + pwork = container_of(work, struct xfs_pwork, work); 45 + pctl = pwork->pctl; 46 + error = pctl->work_fn(pctl->mp, pwork); 47 + if (error && !pctl->error) 48 + pctl->error = error; 49 + } 50 + 51 + /* 52 + * Set up control data for parallel work. @work_fn is the function that will 53 + * be called. @tag will be written into the kernel threads. @nr_threads is 54 + * the level of parallelism desired, or 0 for no limit. 55 + */
56 + int 57 + xfs_pwork_init( 58 + struct xfs_mount *mp, 59 + struct xfs_pwork_ctl *pctl, 60 + xfs_pwork_work_fn work_fn, 61 + const char *tag, 62 + unsigned int nr_threads) 63 + { 64 + #ifdef DEBUG 65 + if (xfs_globals.pwork_threads >= 0) 66 + nr_threads = xfs_globals.pwork_threads; 67 + #endif 68 + trace_xfs_pwork_init(mp, nr_threads, current->pid); 69 + 70 + pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag, 71 + current->pid); 72 + if (!pctl->wq) 73 + return -ENOMEM; 74 + pctl->work_fn = work_fn; 75 + pctl->error = 0; 76 + pctl->mp = mp; 77 + 78 + return 0; 79 + } 80 + 81 + /* Queue some parallel work. */ 82 + void 83 + xfs_pwork_queue( 84 + struct xfs_pwork_ctl *pctl, 85 + struct xfs_pwork *pwork) 86 + { 87 + INIT_WORK(&pwork->work, xfs_pwork_work); 88 + pwork->pctl = pctl; 89 + queue_work(pctl->wq, &pwork->work); 90 + } 91 + 92 + /* Wait for the work to finish and tear down the control structure. */ 93 + int 94 + xfs_pwork_destroy( 95 + struct xfs_pwork_ctl *pctl) 96 + { 97 + destroy_workqueue(pctl->wq); 98 + pctl->wq = NULL; 99 + return pctl->error; 100 + } 101 + 102 + /* 103 + * Return the amount of parallelism that the data device can handle, or 0 for 104 + * no limit. 105 + */ 106 + unsigned int 107 + xfs_pwork_guess_datadev_parallelism( 108 + struct xfs_mount *mp) 109 + { 110 + struct xfs_buftarg *btp = mp->m_ddev_targp; 111 + 112 + /* 113 + * For now we'll go with the most conservative setting possible, 114 + * which is two threads for an SSD and 1 thread everywhere else. 115 + */ 116 + return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1; 117 + }
+58
fs/xfs/xfs_pwork.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Copyright (C) 2019 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 + */ 6 + #ifndef __XFS_PWORK_H__ 7 + #define __XFS_PWORK_H__ 8 + 9 + struct xfs_pwork; 10 + struct xfs_mount; 11 + 12 + typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork); 13 + 14 + /* 15 + * Parallel work coordination structure. 16 + */ 17 + struct xfs_pwork_ctl { 18 + struct workqueue_struct *wq; 19 + struct xfs_mount *mp; 20 + xfs_pwork_work_fn work_fn; 21 + int error; 22 + }; 23 + 24 + /* 25 + * Embed this parallel work control item inside your own work structure, 26 + * then queue work with it. 27 + */ 28 + struct xfs_pwork { 29 + struct work_struct work; 30 + struct xfs_pwork_ctl *pctl; 31 + }; 32 + 33 + #define XFS_PWORK_SINGLE_THREADED { .pctl = NULL } 34 + 35 + /* Have we been told to abort? */ 36 + static inline bool 37 + xfs_pwork_ctl_want_abort( 38 + struct xfs_pwork_ctl *pctl) 39 + { 40 + return pctl && pctl->error; 41 + } 42 + 43 + /* Have we been told to abort? */ 44 + static inline bool 45 + xfs_pwork_want_abort( 46 + struct xfs_pwork *pwork) 47 + { 48 + return xfs_pwork_ctl_want_abort(pwork->pctl); 49 + } 50 + 51 + int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, 52 + xfs_pwork_work_fn work_fn, const char *tag, 53 + unsigned int nr_threads); 54 + void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); 55 + int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); 56 + unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); 57 + 58 + #endif /* __XFS_PWORK_H__ */
+1 -1
fs/xfs/xfs_qm.c
··· 1300 1300 flags |= XFS_PQUOTA_CHKD; 1301 1301 } 1302 1302 1303 - error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL); 1303 + error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL); 1304 1304 if (error) 1305 1305 goto error_return; 1306 1306
+3
fs/xfs/xfs_sysctl.h
··· 82 82 extern xfs_param_t xfs_params; 83 83 84 84 struct xfs_globals { 85 + #ifdef DEBUG 86 + int pwork_threads; /* parallel workqueue threads */ 87 + #endif 85 88 int log_recovery_delay; /* log recovery delay (secs) */ 86 89 int mount_delay; /* mount setup delay (secs) */ 87 90 bool bug_on_assert; /* BUG() the kernel on assert failure */
+40
fs/xfs/xfs_sysfs.c
··· 204 204 } 205 205 XFS_SYSFS_ATTR_RW(always_cow); 206 206 207 + #ifdef DEBUG 208 + /* 209 + * Override how many threads the parallel work queue is allowed to create. 210 + * This has to be a debug-only global (instead of an errortag) because one of 211 + * the main users of parallel workqueues is mount time quotacheck. 212 + */ 213 + STATIC ssize_t 214 + pwork_threads_store( 215 + struct kobject *kobject, 216 + const char *buf, 217 + size_t count) 218 + { 219 + int ret; 220 + int val; 221 + 222 + ret = kstrtoint(buf, 0, &val); 223 + if (ret) 224 + return ret; 225 + 226 + if (val < -1 || val > num_possible_cpus()) 227 + return -EINVAL; 228 + 229 + xfs_globals.pwork_threads = val; 230 + 231 + return count; 232 + } 233 + 234 + STATIC ssize_t 235 + pwork_threads_show( 236 + struct kobject *kobject, 237 + char *buf) 238 + { 239 + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads); 240 + } 241 + XFS_SYSFS_ATTR_RW(pwork_threads); 242 + #endif /* DEBUG */ 243 + 207 244 static struct attribute *xfs_dbg_attrs[] = { 208 245 ATTR_LIST(bug_on_assert), 209 246 ATTR_LIST(log_recovery_delay), 210 247 ATTR_LIST(mount_delay), 211 248 ATTR_LIST(always_cow), 249 + #ifdef DEBUG 250 + ATTR_LIST(pwork_threads), 251 + #endif 212 252 NULL, 213 253 }; 214 254
+18
fs/xfs/xfs_trace.h
··· 3557 3557 __entry->startino, __entry->freemask) 3558 3558 ) 3559 3559 3560 + TRACE_EVENT(xfs_pwork_init, 3561 + TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid), 3562 + TP_ARGS(mp, nr_threads, pid), 3563 + TP_STRUCT__entry( 3564 + __field(dev_t, dev) 3565 + __field(unsigned int, nr_threads) 3566 + __field(pid_t, pid) 3567 + ), 3568 + TP_fast_assign( 3569 + __entry->dev = mp->m_super->s_dev; 3570 + __entry->nr_threads = nr_threads; 3571 + __entry->pid = pid; 3572 + ), 3573 + TP_printk("dev %d:%d nr_threads %u pid %u", 3574 + MAJOR(__entry->dev), MINOR(__entry->dev), 3575 + __entry->nr_threads, __entry->pid) 3576 + ) 3577 + 3560 3578 #endif /* _TRACE_XFS_H */ 3561 3579 3562 3580 #undef TRACE_INCLUDE_PATH