Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfs: reduce transaction reservations with reflink

Before the introduction of deferred refcount operations, reflink
would try to cram refcount btree updates into the same transaction as an
allocation or a free event. Mainline XFS has never actually done that,
but we never refactored the transaction reservations to reflect that we
now do all refcount updates in separate transactions. Fix this to
reduce the transaction reservation size even further, so that between
this patch and the previous one, we reduce the tr_write and tr_itruncate
sizes by 66%.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>

+138 -17
+12
fs/xfs/libxfs/xfs_log_rlimit.c
··· 80 80 resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; 81 81 } 82 82 83 + /* 84 + * In the early days of reflink, we did not use deferred refcount 85 + * update log items, so log reservations must be recomputed using the 86 + * old calculations. 87 + */ 88 + resv->tr_write.tr_logres = 89 + xfs_calc_write_reservation_minlogsize(mp); 90 + resv->tr_itruncate.tr_logres = 91 + xfs_calc_itruncate_reservation_minlogsize(mp); 92 + resv->tr_qm_dqalloc.tr_logres = 93 + xfs_calc_qm_dqalloc_reservation_minlogsize(mp); 94 + 83 95 /* Put everything back the way it was. This goes at the end. */ 84 96 mp->m_rmap_maxlevels = rmap_maxlevels; 85 97 }
+7 -2
fs/xfs/libxfs/xfs_refcount.c
··· 886 886 { 887 887 unsigned long overhead; 888 888 889 - overhead = cur->bc_ag.refc.shape_changes * 890 - xfs_allocfree_log_count(cur->bc_mp, 1); 889 + /* 890 + * Worst case estimate: full splits of the free space and rmap btrees 891 + * to handle each of the shape changes to the refcount btree. 892 + */ 893 + overhead = xfs_allocfree_log_count(cur->bc_mp, 894 + cur->bc_ag.refc.shape_changes); 895 + overhead += cur->bc_mp->m_refc_maxlevels; 891 896 overhead *= cur->bc_mp->m_sb.sb_blocksize; 892 897 893 898 /*
+115 -15
fs/xfs/libxfs/xfs_trans_resv.c
··· 56 56 * Per-extent log reservation for the btree changes involved in freeing or 57 57 * allocating an extent. In classic XFS there were two trees that will be 58 58 * modified (bnobt + cntbt). With rmap enabled, there are three trees 59 - * (rmapbt). With reflink, there are four trees (refcountbt). The number of 60 - * blocks reserved is based on the formula: 59 + * (rmapbt). The number of blocks reserved is based on the formula: 61 60 * 62 61 * num trees * ((2 blocks/level * max depth) - 1) 63 62 * ··· 72 73 blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1); 73 74 if (xfs_has_rmapbt(mp)) 74 75 blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); 75 - if (xfs_has_reflink(mp)) 76 - blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); 77 76 78 77 return blocks; 78 + } 79 + 80 + /* 81 + * Per-extent log reservation for refcount btree changes. These are never done 82 + * in the same transaction as an allocation or a free, so we compute them 83 + * separately. 84 + */ 85 + static unsigned int 86 + xfs_refcountbt_block_count( 87 + struct xfs_mount *mp, 88 + unsigned int num_ops) 89 + { 90 + return num_ops * (2 * mp->m_refc_maxlevels - 1); 79 91 } 80 92 81 93 /* ··· 243 233 * register overflow from temporaries in the calculations. 244 234 */ 245 235 236 + /* 237 + * Compute the log reservation required to handle the refcount update 238 + * transaction. Refcount updates are always done via deferred log items. 
239 + * 240 + * This is calculated as: 241 + * Data device refcount updates (t1): 242 + * the agfs of the ags containing the blocks: nr_ops * sector size 243 + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size 244 + */ 245 + static unsigned int 246 + xfs_calc_refcountbt_reservation( 247 + struct xfs_mount *mp, 248 + unsigned int nr_ops) 249 + { 250 + unsigned int blksz = XFS_FSB_TO_B(mp, 1); 251 + 252 + if (!xfs_has_reflink(mp)) 253 + return 0; 254 + 255 + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + 256 + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); 257 + } 246 258 247 259 /* 248 260 * In a write transaction we can allocate a maximum of 2 ··· 287 255 * the agfls of the ags containing the blocks: 2 * sector size 288 256 * the super block free block counter: sector size 289 257 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size 258 + * And any refcount updates that happen in a separate transaction (t4). 290 259 */ 291 260 STATIC uint 292 261 xfs_calc_write_reservation( 293 - struct xfs_mount *mp) 262 + struct xfs_mount *mp, 263 + bool for_minlogsize) 294 264 { 295 - unsigned int t1, t2, t3; 265 + unsigned int t1, t2, t3, t4; 296 266 unsigned int blksz = XFS_FSB_TO_B(mp, 1); 297 267 298 268 t1 = xfs_calc_inode_res(mp, 1) + ··· 316 282 t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + 317 283 xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); 318 284 319 - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); 285 + /* 286 + * In the early days of reflink, we included enough reservation to log 287 + * two refcountbt splits for each transaction. The codebase runs 288 + * refcountbt updates in separate transactions now, so to compute the 289 + * minimum log size, add the refcountbtree splits back to t1 and t3 and 290 + * do not account them separately as t4. Reflink did not support 291 + * realtime when the reservations were established, so no adjustment to 292 + * t2 is needed. 
293 + */ 294 + if (for_minlogsize) { 295 + unsigned int adj = 0; 296 + 297 + if (xfs_has_reflink(mp)) 298 + adj = xfs_calc_buf_res( 299 + xfs_refcountbt_block_count(mp, 2), 300 + blksz); 301 + t1 += adj; 302 + t3 += adj; 303 + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); 304 + } 305 + 306 + t4 = xfs_calc_refcountbt_reservation(mp, 1); 307 + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); 308 + } 309 + 310 + unsigned int 311 + xfs_calc_write_reservation_minlogsize( 312 + struct xfs_mount *mp) 313 + { 314 + return xfs_calc_write_reservation(mp, true); 320 315 } 321 316 322 317 /* ··· 367 304 * the realtime summary: 2 exts * 1 block 368 305 * worst case split in allocation btrees per extent assuming 2 extents: 369 306 * 2 exts * 2 trees * (2 * max depth - 1) * block size 307 + * And any refcount updates that happen in a separate transaction (t4). 370 308 */ 371 309 STATIC uint 372 310 xfs_calc_itruncate_reservation( 373 - struct xfs_mount *mp) 311 + struct xfs_mount *mp, 312 + bool for_minlogsize) 374 313 { 375 - unsigned int t1, t2, t3; 314 + unsigned int t1, t2, t3, t4; 376 315 unsigned int blksz = XFS_FSB_TO_B(mp, 1); 377 316 378 317 t1 = xfs_calc_inode_res(mp, 1) + ··· 391 326 t3 = 0; 392 327 } 393 328 394 - return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); 329 + /* 330 + * In the early days of reflink, we included enough reservation to log 331 + * four refcountbt splits in the same transaction as bnobt/cntbt 332 + * updates. The codebase runs refcountbt updates in separate 333 + * transactions now, so to compute the minimum log size, add the 334 + * refcount btree splits back here and do not compute them separately 335 + * as t4. Reflink did not support realtime when the reservations were 336 + * established, so do not adjust t3. 
337 + */ 338 + if (for_minlogsize) { 339 + if (xfs_has_reflink(mp)) 340 + t2 += xfs_calc_buf_res( 341 + xfs_refcountbt_block_count(mp, 4), 342 + blksz); 343 + 344 + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); 345 + } 346 + 347 + t4 = xfs_calc_refcountbt_reservation(mp, 2); 348 + return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3)); 349 + } 350 + 351 + unsigned int 352 + xfs_calc_itruncate_reservation_minlogsize( 353 + struct xfs_mount *mp) 354 + { 355 + return xfs_calc_itruncate_reservation(mp, true); 395 356 } 396 357 397 358 /* ··· 883 792 */ 884 793 STATIC uint 885 794 xfs_calc_qm_dqalloc_reservation( 886 - struct xfs_mount *mp) 795 + struct xfs_mount *mp, 796 + bool for_minlogsize) 887 797 { 888 - return xfs_calc_write_reservation(mp) + 798 + return xfs_calc_write_reservation(mp, for_minlogsize) + 889 799 xfs_calc_buf_res(1, 890 800 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); 801 + } 802 + 803 + unsigned int 804 + xfs_calc_qm_dqalloc_reservation_minlogsize( 805 + struct xfs_mount *mp) 806 + { 807 + return xfs_calc_qm_dqalloc_reservation(mp, true); 891 808 } 892 809 893 810 /* ··· 920 821 * The following transactions are logged in physical format and 921 822 * require a permanent reservation on space. 
922 823 */ 923 - resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); 824 + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false); 924 825 resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; 925 826 resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; 926 827 927 - resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); 828 + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false); 928 829 resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; 929 830 resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; 930 831 ··· 981 882 resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; 982 883 resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; 983 884 984 - resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); 885 + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp, 886 + false); 985 887 resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; 986 888 resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; 987 889
+4
fs/xfs/libxfs/xfs_trans_resv.h
··· 98 98 void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); 99 99 uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); 100 100 101 + unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); 102 + unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); 103 + unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); 104 + 101 105 #endif /* __XFS_TRANS_RESV_H__ */