jbd2: Fix a race between checkpointing code and journal_get_write_access()

The following race can happen:

CPU1 CPU2
checkpointing code checks the buffer, adds
it to an array for writeback
do_get_write_access()
...
lock_buffer()
unlock_buffer()
flush_batch() submits the buffer for IO
__jbd2_journal_file_buffer()

So a buffer under writeout is returned from
do_get_write_access(). Since the filesystem code relies on the fact
that journaled buffers cannot be written out, it does not take the
buffer lock and so it can modify buffer while it is under
writeout. That can lead to a filesystem corruption if we crash at the
right moment.

We fix the problem by clearing the buffer dirty bit under buffer_lock
even if the buffer is on BJ_None list. Actually, we clear the dirty
bit regardless the list the buffer is in and warn about the fact if
the buffer is already journalled.

Thanks for spotting the problem goes to dingdinghua <dingdinghua85@gmail.com>.

Reported-by: dingdinghua <dingdinghua85@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by Jan Kara and committed by Theodore Ts'o f91d1d04 3e03f9ca

+35 -33
+35 -33
fs/jbd2/transaction.c
··· 499 499 wake_up(&journal->j_wait_transaction_locked); 500 500 } 501 501 502 - /* 503 - * Report any unexpected dirty buffers which turn up. Normally those 504 - * indicate an error, but they can occur if the user is running (say) 505 - * tune2fs to modify the live filesystem, so we need the option of 506 - * continuing as gracefully as possible. # 507 - * 508 - * The caller should already hold the journal lock and 509 - * j_list_lock spinlock: most callers will need those anyway 510 - * in order to probe the buffer's journaling state safely. 511 - */ 512 - static void jbd_unexpected_dirty_buffer(struct journal_head *jh) 502 + static void warn_dirty_buffer(struct buffer_head *bh) 513 503 { 514 - int jlist; 504 + char b[BDEVNAME_SIZE]; 515 505 516 - /* If this buffer is one which might reasonably be dirty 517 - * --- ie. data, or not part of this journal --- then 518 - * we're OK to leave it alone, but otherwise we need to 519 - * move the dirty bit to the journal's own internal 520 - * JBDDirty bit. */ 521 - jlist = jh->b_jlist; 522 - 523 - if (jlist == BJ_Metadata || jlist == BJ_Reserved || 524 - jlist == BJ_Shadow || jlist == BJ_Forget) { 525 - struct buffer_head *bh = jh2bh(jh); 526 - 527 - if (test_clear_buffer_dirty(bh)) 528 - set_buffer_jbddirty(bh); 529 - } 506 + printk(KERN_WARNING 507 + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " 508 + "There's a risk of filesystem corruption in case of system " 509 + "crash.\n", 510 + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 530 511 } 531 512 532 513 /* ··· 574 593 if (jh->b_next_transaction) 575 594 J_ASSERT_JH(jh, jh->b_next_transaction == 576 595 transaction); 596 + warn_dirty_buffer(bh); 577 597 } 578 598 /* 579 599 * In any case we need to clean the dirty flag and we must 580 600 * do it under the buffer lock to be sure we don't race 581 601 * with running write-out. 582 602 */ 583 - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); 584 - jbd_unexpected_dirty_buffer(jh); 603 + JBUFFER_TRACE(jh, "Journalling dirty buffer"); 604 + clear_buffer_dirty(bh); 605 + set_buffer_jbddirty(bh); 585 606 } 586 607 587 608 unlock_buffer(bh); ··· 826 843 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 827 844 828 845 if (jh->b_transaction == NULL) { 846 + /* 847 + * Previous jbd2_journal_forget() could have left the buffer 848 + * with jbddirty bit set because it was being committed. When 849 + * the commit finished, we've filed the buffer for 850 + * checkpointing and marked it dirty. Now we are reallocating 851 + * the buffer so the transaction freeing it must have 852 + * committed and so it's safe to clear the dirty bit. 853 + */ 854 + clear_buffer_dirty(jh2bh(jh)); 829 855 jh->b_transaction = transaction; 830 856 831 857 /* first access by this transaction */ ··· 1636 1644 1637 1645 if (jh->b_cp_transaction) { 1638 1646 JBUFFER_TRACE(jh, "on running+cp transaction"); 1647 + /* 1648 + * We don't want to write the buffer anymore, clear the 1649 + * bit so that we don't confuse checks in 1650 + * __journal_file_buffer 1651 + */ 1652 + clear_buffer_dirty(bh); 1639 1653 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1640 - clear_buffer_jbddirty(bh); 1641 1654 may_free = 0; 1642 1655 } else { 1643 1656 JBUFFER_TRACE(jh, "on running transaction"); ··· 1893 1896 if (jh->b_transaction && jh->b_jlist == jlist) 1894 1897 return; 1895 1898 1896 - /* The following list of buffer states needs to be consistent 1897 - * with __jbd_unexpected_dirty_buffer()'s handling of dirty 1898 - * state. */ 1899 - 1900 1899 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 1901 1900 jlist == BJ_Shadow || jlist == BJ_Forget) { 1901 + /* 1902 + * For metadata buffers, we track dirty bit in buffer_jbddirty 1903 + * instead of buffer_dirty. We should not see a dirty bit set 1904 + * here because we clear it in do_get_write_access but e.g. 1905 + * tune2fs can modify the sb and set the dirty bit at any time 1906 + * so we try to gracefully handle that. 1907 + */ 1908 + if (buffer_dirty(bh)) 1909 + warn_dirty_buffer(bh); 1902 1910 if (test_clear_buffer_dirty(bh) || 1903 1911 test_clear_buffer_jbddirty(bh)) 1904 1912 was_dirty = 1;