jbd2: Fix a race between checkpointing code and journal_get_write_access()

The following race can happen:

CPU1 CPU2
checkpointing code checks the buffer, adds
it to an array for writeback
do_get_write_access()
...
lock_buffer()
unlock_buffer()
flush_batch() submits the buffer for IO
__jbd2_journal_file_buffer()

So a buffer under writeout is returned from
do_get_write_access(). Since the filesystem code relies on the fact
that journaled buffers cannot be written out, it does not take the
buffer lock and so it can modify buffer while it is under
writeout. That can lead to a filesystem corruption if we crash at the
right moment.

We fix the problem by clearing the buffer dirty bit under buffer_lock
even if the buffer is on BJ_None list. Actually, we clear the dirty
bit regardless the list the buffer is in and warn about the fact if
the buffer is already journalled.

Thanks for spotting the problem goes to dingdinghua <dingdinghua85@gmail.com>.

Reported-by: dingdinghua <dingdinghua85@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by Jan Kara and committed by Theodore Ts'o f91d1d04 3e03f9ca

+35 -33
+35 -33
fs/jbd2/transaction.c
··· 499 wake_up(&journal->j_wait_transaction_locked); 500 } 501 502 - /* 503 - * Report any unexpected dirty buffers which turn up. Normally those 504 - * indicate an error, but they can occur if the user is running (say) 505 - * tune2fs to modify the live filesystem, so we need the option of 506 - * continuing as gracefully as possible. # 507 - * 508 - * The caller should already hold the journal lock and 509 - * j_list_lock spinlock: most callers will need those anyway 510 - * in order to probe the buffer's journaling state safely. 511 - */ 512 - static void jbd_unexpected_dirty_buffer(struct journal_head *jh) 513 { 514 - int jlist; 515 516 - /* If this buffer is one which might reasonably be dirty 517 - * --- ie. data, or not part of this journal --- then 518 - * we're OK to leave it alone, but otherwise we need to 519 - * move the dirty bit to the journal's own internal 520 - * JBDDirty bit. */ 521 - jlist = jh->b_jlist; 522 - 523 - if (jlist == BJ_Metadata || jlist == BJ_Reserved || 524 - jlist == BJ_Shadow || jlist == BJ_Forget) { 525 - struct buffer_head *bh = jh2bh(jh); 526 - 527 - if (test_clear_buffer_dirty(bh)) 528 - set_buffer_jbddirty(bh); 529 - } 530 } 531 532 /* ··· 574 if (jh->b_next_transaction) 575 J_ASSERT_JH(jh, jh->b_next_transaction == 576 transaction); 577 } 578 /* 579 * In any case we need to clean the dirty flag and we must 580 * do it under the buffer lock to be sure we don't race 581 * with running write-out. 582 */ 583 - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); 584 - jbd_unexpected_dirty_buffer(jh); 585 } 586 587 unlock_buffer(bh); ··· 826 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 827 828 if (jh->b_transaction == NULL) { 829 jh->b_transaction = transaction; 830 831 /* first access by this transaction */ ··· 1636 1637 if (jh->b_cp_transaction) { 1638 JBUFFER_TRACE(jh, "on running+cp transaction"); 1639 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1640 - clear_buffer_jbddirty(bh); 1641 may_free = 0; 1642 } else { 1643 JBUFFER_TRACE(jh, "on running transaction"); ··· 1893 if (jh->b_transaction && jh->b_jlist == jlist) 1894 return; 1895 1896 - /* The following list of buffer states needs to be consistent 1897 - * with __jbd_unexpected_dirty_buffer()'s handling of dirty 1898 - * state. */ 1899 - 1900 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 1901 jlist == BJ_Shadow || jlist == BJ_Forget) { 1902 if (test_clear_buffer_dirty(bh) || 1903 test_clear_buffer_jbddirty(bh)) 1904 was_dirty = 1;
··· 499 wake_up(&journal->j_wait_transaction_locked); 500 } 501 502 + static void warn_dirty_buffer(struct buffer_head *bh) 503 { 504 + char b[BDEVNAME_SIZE]; 505 506 + printk(KERN_WARNING 507 + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " 508 + "There's a risk of filesystem corruption in case of system " 509 + "crash.\n", 510 + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 511 } 512 513 /* ··· 593 if (jh->b_next_transaction) 594 J_ASSERT_JH(jh, jh->b_next_transaction == 595 transaction); 596 + warn_dirty_buffer(bh); 597 } 598 /* 599 * In any case we need to clean the dirty flag and we must 600 * do it under the buffer lock to be sure we don't race 601 * with running write-out. 602 */ 603 + JBUFFER_TRACE(jh, "Journalling dirty buffer"); 604 + clear_buffer_dirty(bh); 605 + set_buffer_jbddirty(bh); 606 } 607 608 unlock_buffer(bh); ··· 843 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 844 845 if (jh->b_transaction == NULL) { 846 + /* 847 + * Previous jbd2_journal_forget() could have left the buffer 848 + * with jbddirty bit set because it was being committed. When 849 + * the commit finished, we've filed the buffer for 850 + * checkpointing and marked it dirty. Now we are reallocating 851 + * the buffer so the transaction freeing it must have 852 + * committed and so it's safe to clear the dirty bit. 853 + */ 854 + clear_buffer_dirty(jh2bh(jh)); 855 jh->b_transaction = transaction; 856 857 /* first access by this transaction */ ··· 1644 1645 if (jh->b_cp_transaction) { 1646 JBUFFER_TRACE(jh, "on running+cp transaction"); 1647 + /* 1648 + * We don't want to write the buffer anymore, clear the 1649 + * bit so that we don't confuse checks in 1650 + * __journal_file_buffer 1651 + */ 1652 + clear_buffer_dirty(bh); 1653 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1654 may_free = 0; 1655 } else { 1656 JBUFFER_TRACE(jh, "on running transaction"); ··· 1896 if (jh->b_transaction && jh->b_jlist == jlist) 1897 return; 1898 1899 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 1900 jlist == BJ_Shadow || jlist == BJ_Forget) { 1901 + /* 1902 + * For metadata buffers, we track dirty bit in buffer_jbddirty 1903 + * instead of buffer_dirty. We should not see a dirty bit set 1904 + * here because we clear it in do_get_write_access but e.g. 1905 + * tune2fs can modify the sb and set the dirty bit at any time 1906 + * so we try to gracefully handle that. 1907 + */ 1908 + if (buffer_dirty(bh)) 1909 + warn_dirty_buffer(bh); 1910 if (test_clear_buffer_dirty(bh) || 1911 test_clear_buffer_jbddirty(bh)) 1912 was_dirty = 1;