Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfs: xlog_sync() manually adjusts grant head space

When xlog_sync() rounds off the tail of the iclog that is being
flushed, it manually subtracts that space from the grant heads. This
space is actually reserved by the transaction ticket that covers
the xlog_sync() call from xlog_write(), but we don't plumb the
ticket down far enough for it to account for the space consumed in
the current log ticket.

The grant heads are hot, so we really should be accounting this to
the ticket if we can, rather than adding thousands of extra grant
head updates every CIL commit.

Interestingly, this actually indicates a potential log space overrun
can occur when we force the log. By the time that xfs_log_force()
pushes out an active iclog and consumes the roundoff space, the
reservation for that roundoff space has been returned to the grant
heads and is no longer covered by a reservation. In theory the
roundoff added to log force on an already full log could push the
write head past the tail. In practice, the CIL commit that writes to
the log and needs the iclog pushed will have reserved space for
roundoff, so when it releases the ticket there will still be
physical space for the roundoff to be committed to the log, even
though it is no longer reserved. This roundoff won't be enough space
to allow a transaction to be woken if the log is full, so overruns
should not actually occur in practice.

That said, it indicates that we should not release the CIL context
log ticket until after we've released the commit iclog. It also
means that xlog_sync() still needs the direct grant head
manipulation if we don't provide it with a ticket. Log forces are
rare when we are in fast paths running 1.5 million transactions/s
that make the grant heads hot, so let's optimise the hot case and
pass CIL log tickets down to the xlog_sync() code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>

authored by

Dave Chinner and committed by
Dave Chinner
d9f68777 1ccb0745

+41 -17
+23 -12
fs/xfs/xfs_log.c
··· 57 57 STATIC void 58 58 xlog_sync( 59 59 struct xlog *log, 60 - struct xlog_in_core *iclog); 60 + struct xlog_in_core *iclog, 61 + struct xlog_ticket *ticket); 61 62 #if defined(DEBUG) 62 63 STATIC void 63 64 xlog_verify_grant_tail( ··· 568 567 int 569 568 xlog_state_release_iclog( 570 569 struct xlog *log, 571 - struct xlog_in_core *iclog) 570 + struct xlog_in_core *iclog, 571 + struct xlog_ticket *ticket) 572 572 { 573 573 xfs_lsn_t tail_lsn; 574 574 bool last_ref; ··· 616 614 trace_xlog_iclog_syncing(iclog, _RET_IP_); 617 615 618 616 spin_unlock(&log->l_icloglock); 619 - xlog_sync(log, iclog); 617 + xlog_sync(log, iclog, ticket); 620 618 spin_lock(&log->l_icloglock); 621 619 return 0; 622 620 } ··· 883 881 iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; 884 882 if (iclog->ic_state == XLOG_STATE_ACTIVE) 885 883 xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); 886 - return xlog_state_release_iclog(iclog->ic_log, iclog); 884 + return xlog_state_release_iclog(iclog->ic_log, iclog, NULL); 887 885 } 888 886 889 887 /* ··· 2029 2027 STATIC void 2030 2028 xlog_sync( 2031 2029 struct xlog *log, 2032 - struct xlog_in_core *iclog) 2030 + struct xlog_in_core *iclog, 2031 + struct xlog_ticket *ticket) 2033 2032 { 2034 2033 unsigned int count; /* byte count of bwrite */ 2035 2034 unsigned int roundoff; /* roundoff to BB or stripe */ ··· 2042 2039 2043 2040 count = xlog_calc_iclog_size(log, iclog, &roundoff); 2044 2041 2045 - /* move grant heads by roundoff in sync */ 2046 - xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); 2047 - xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); 2042 + /* 2043 + * If we have a ticket, account for the roundoff via the ticket 2044 + * reservation to avoid touching the hot grant heads needlessly. 2045 + * Otherwise, we have to move grant heads directly. 
2046 + */ 2047 + if (ticket) { 2048 + ticket->t_curr_res -= roundoff; 2049 + } else { 2050 + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); 2051 + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); 2052 + } 2048 2053 2049 2054 /* put cycle number in every block */ 2050 - xlog_pack_data(log, iclog, roundoff); 2055 + xlog_pack_data(log, iclog, roundoff); 2051 2056 2052 2057 /* real byte length */ 2053 2058 size = iclog->ic_offset; ··· 2288 2277 spin_lock(&log->l_icloglock); 2289 2278 ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC); 2290 2279 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); 2291 - error = xlog_state_release_iclog(log, iclog); 2280 + error = xlog_state_release_iclog(log, iclog, ticket); 2292 2281 spin_unlock(&log->l_icloglock); 2293 2282 if (error) 2294 2283 return error; ··· 2550 2539 */ 2551 2540 spin_lock(&log->l_icloglock); 2552 2541 xlog_state_finish_copy(log, iclog, record_cnt, 0); 2553 - error = xlog_state_release_iclog(log, iclog); 2542 + error = xlog_state_release_iclog(log, iclog, ticket); 2554 2543 spin_unlock(&log->l_icloglock); 2555 2544 2556 2545 return error; ··· 2970 2959 * reference to the iclog. 2971 2960 */ 2972 2961 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) 2973 - error = xlog_state_release_iclog(log, iclog); 2962 + error = xlog_state_release_iclog(log, iclog, ticket); 2974 2963 spin_unlock(&log->l_icloglock); 2975 2964 if (error) 2976 2965 return error;
+16 -4
fs/xfs/xfs_log_cil.c
··· 1189 1189 xfs_csn_t push_seq; 1190 1190 bool push_commit_stable; 1191 1191 LIST_HEAD (whiteouts); 1192 + struct xlog_ticket *ticket; 1192 1193 1193 1194 new_ctx = xlog_cil_ctx_alloc(); 1194 1195 new_ctx->ticket = xlog_cil_ticket_alloc(log); ··· 1324 1323 if (error) 1325 1324 goto out_abort_free_ticket; 1326 1325 1327 - xfs_log_ticket_ungrant(log, ctx->ticket); 1326 + /* 1327 + * Grab the ticket from the ctx so we can ungrant it after releasing the 1328 + * commit_iclog. The ctx may be freed by the time we return from 1329 + * releasing the commit_iclog (i.e. checkpoint has been completed and 1330 + * callback run) so we can't reference the ctx after the call to 1331 + * xlog_state_release_iclog(). 1332 + */ 1333 + ticket = ctx->ticket; 1328 1334 1329 1335 /* 1330 1336 * If the checkpoint spans multiple iclogs, wait for all previous iclogs ··· 1381 1373 if (push_commit_stable && 1382 1374 ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) 1383 1375 xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); 1384 - xlog_state_release_iclog(log, ctx->commit_iclog); 1376 + ticket = ctx->ticket; 1377 + xlog_state_release_iclog(log, ctx->commit_iclog, ticket); 1385 1378 1386 1379 /* Not safe to reference ctx now! 
*/ 1387 1380 1388 1381 spin_unlock(&log->l_icloglock); 1389 1382 xlog_cil_cleanup_whiteouts(&whiteouts); 1383 + xfs_log_ticket_ungrant(log, ticket); 1390 1384 return; 1391 1385 1392 1386 out_skip: ··· 1398 1388 return; 1399 1389 1400 1390 out_abort_free_ticket: 1401 - xfs_log_ticket_ungrant(log, ctx->ticket); 1402 1391 ASSERT(xlog_is_shutdown(log)); 1403 1392 xlog_cil_cleanup_whiteouts(&whiteouts); 1404 1393 if (!ctx->commit_iclog) { 1394 + xfs_log_ticket_ungrant(log, ctx->ticket); 1405 1395 xlog_cil_committed(ctx); 1406 1396 return; 1407 1397 } 1408 1398 spin_lock(&log->l_icloglock); 1409 - xlog_state_release_iclog(log, ctx->commit_iclog); 1399 + ticket = ctx->ticket; 1400 + xlog_state_release_iclog(log, ctx->commit_iclog, ticket); 1410 1401 /* Not safe to reference ctx now! */ 1411 1402 spin_unlock(&log->l_icloglock); 1403 + xfs_log_ticket_ungrant(log, ticket); 1412 1404 } 1413 1405 1414 1406 /*
+2 -1
fs/xfs/xfs_log_priv.h
··· 515 515 516 516 void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, 517 517 int eventual_size); 518 - int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); 518 + int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, 519 + struct xlog_ticket *ticket); 519 520 520 521 /* 521 522 * When we crack an atomic LSN, we sample it first so that the value will not