xfs: ensure that sync updates the log tail correctly

Updates to the VFS layer removed an extra ->sync_fs call into the
filesystem during the sync process (from the quota code).
Unfortunately the sync code was unknowingly relying on this call to
make sure metadata buffers were flushed via an xfs_buftarg_flush()
call to move the tail of the log forward in memory before the final
transactions of the sync process were issued.

As a result, the old code would write a very recent log tail value
to the log by the end of the sync process, and so a subsequent crash
would leave nothing for log recovery to do. Hence in qa test 182,
log recovery only replayed a small handful of inode fsync
transactions in this case.

However, with the removal of the extra ->sync_fs call, the log tail
was no longer moved forward with the inode fsync transactions near
the end of the sync process, because the first (and only) buftarg
flush occurred after these transactions went to disk. The result is
that log
recovery now sees a large number of transactions for metadata that
is already on disk.

This usually isn't a problem, but when the transactions include
inode chunk allocation, the inode create transactions and all
subsequent changes are replayed as we cannot rely on what is on
disk being valid. As a result, if the inode was written and contains
unlogged changes, the unlogged changes are lost, thereby violating
sync semantics.

The fix is to always issue a transaction after the buftarg flush
occurs if the log is not idle or covered. This results in a dummy
transaction being written that contains the up-to-date log tail
value, which will be very recent. Indeed, it will be at least as
recent as the old code would have left on disk, so log recovery
will behave exactly as it used to in this situation.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>

authored by Dave Chinner and committed by Alex Elder b6f8dd49 dc57da38

+26 -12
fs/xfs/xfs_log.c
@@ -745,9 +745,16 @@
 
 /*
  * Determine if we have a transaction that has gone to disk
- * that needs to be covered. Log activity needs to be idle (no AIL and
- * nothing in the iclogs). And, we need to be in the right state indicating
- * something has gone out.
+ * that needs to be covered. To begin the transition to the idle state
+ * firstly the log needs to be idle (no AIL and nothing in the iclogs).
+ * If we are then in a state where covering is needed, the caller is informed
+ * that dummy transactions are required to move the log into the idle state.
+ *
+ * Because this is called as part of the sync process, we should also indicate
+ * that dummy transactions should be issued in anything but the covered or
+ * idle states. This ensures that the log tail is accurately reflected in
+ * the log at the end of the sync, hence if a crash occurrs avoids replay
+ * of transactions where the metadata is already on disk.
  */
 int
 xfs_log_need_covered(xfs_mount_t *mp)
@@ -766,17 +773,24 @@
 		return 0;
 
 	spin_lock(&log->l_icloglock);
-	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
-	    (log->l_covered_state == XLOG_STATE_COVER_NEED2))
-			&& !xfs_trans_ail_tail(log->l_ailp)
-			&& xlog_iclogs_empty(log)) {
-		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
-			log->l_covered_state = XLOG_STATE_COVER_DONE;
-		else {
-			ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
-			log->l_covered_state = XLOG_STATE_COVER_DONE2;
+	switch (log->l_covered_state) {
+	case XLOG_STATE_COVER_DONE:
+	case XLOG_STATE_COVER_DONE2:
+	case XLOG_STATE_COVER_IDLE:
+		break;
+	case XLOG_STATE_COVER_NEED:
+	case XLOG_STATE_COVER_NEED2:
+		if (!xfs_trans_ail_tail(log->l_ailp) &&
+		    xlog_iclogs_empty(log)) {
+			if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+				log->l_covered_state = XLOG_STATE_COVER_DONE;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_DONE2;
 		}
+		/* FALLTHRU */
+	default:
 		needed = 1;
+		break;
 	}
 	spin_unlock(&log->l_icloglock);
 	return needed;