Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext3: Wait for proper transaction commit on fsync

We cannot rely on buffer dirty bits during fsync because pdflush can run
before fsync is called and clear the dirty bits without forcing a transaction
commit. Instead, we track which transaction last changed the inode and which
transaction last changed its block allocation, and on fsync (or fdatasync,
respectively) we force that transaction to disk.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Jan Kara fe8bc91c ea0174a7

+57 -21
+16 -20
fs/ext3/fsync.c
··· 46 46 int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 47 47 { 48 48 struct inode *inode = dentry->d_inode; 49 + struct ext3_inode_info *ei = EXT3_I(inode); 50 + journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 49 51 int ret = 0; 52 + tid_t commit_tid; 53 + 54 + if (inode->i_sb->s_flags & MS_RDONLY) 55 + return 0; 50 56 51 57 J_ASSERT(ext3_journal_current_handle() == NULL); 52 58 53 59 /* 54 - * data=writeback: 60 + * data=writeback,ordered: 55 61 * The caller's filemap_fdatawrite()/wait will sync the data. 56 - * sync_inode() will sync the metadata 57 - * 58 - * data=ordered: 59 - * The caller's filemap_fdatawrite() will write the data and 60 - * sync_inode() will write the inode if it is dirty. Then the caller's 61 - * filemap_fdatawait() will wait on the pages. 62 + * Metadata is in the journal, we wait for a proper transaction 63 + * to commit here. 62 64 * 63 65 * data=journal: 64 66 * filemap_fdatawrite won't do anything (the buffers are clean). ··· 75 73 goto out; 76 74 } 77 75 78 - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 79 - goto flush; 76 + if (datasync) 77 + commit_tid = atomic_read(&ei->i_datasync_tid); 78 + else 79 + commit_tid = atomic_read(&ei->i_sync_tid); 80 80 81 - /* 82 - * The VFS has written the file data. If the inode is unaltered 83 - * then we need not start a commit. 84 - */ 85 - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { 86 - struct writeback_control wbc = { 87 - .sync_mode = WB_SYNC_ALL, 88 - .nr_to_write = 0, /* sys_fsync did this */ 89 - }; 90 - ret = sync_inode(inode, &wbc); 81 + if (log_start_commit(journal, commit_tid)) { 82 + log_wait_commit(journal, commit_tid); 91 83 goto out; 92 84 } 93 - flush: 85 + 94 86 /* 95 87 * In case we didn't commit a transaction, we have to flush 96 88 * disk caches manually so that data really is on persistent
+31 -1
fs/ext3/inode.c
··· 699 699 int err = 0; 700 700 struct ext3_block_alloc_info *block_i; 701 701 ext3_fsblk_t current_block; 702 + struct ext3_inode_info *ei = EXT3_I(inode); 702 703 703 - block_i = EXT3_I(inode)->i_block_alloc_info; 704 + block_i = ei->i_block_alloc_info; 704 705 /* 705 706 * If we're splicing into a [td]indirect block (as opposed to the 706 707 * inode) then we need to get write access to the [td]indirect block ··· 742 741 743 742 inode->i_ctime = CURRENT_TIME_SEC; 744 743 ext3_mark_inode_dirty(handle, inode); 744 + /* ext3_mark_inode_dirty already updated i_sync_tid */ 745 + atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid); 745 746 746 747 /* had we spliced it onto indirect block? */ 747 748 if (where->bh) { ··· 2757 2754 struct ext3_inode_info *ei; 2758 2755 struct buffer_head *bh; 2759 2756 struct inode *inode; 2757 + journal_t *journal = EXT3_SB(sb)->s_journal; 2758 + transaction_t *transaction; 2760 2759 long ret; 2761 2760 int block; 2762 2761 ··· 2835 2830 for (block = 0; block < EXT3_N_BLOCKS; block++) 2836 2831 ei->i_data[block] = raw_inode->i_block[block]; 2837 2832 INIT_LIST_HEAD(&ei->i_orphan); 2833 + 2834 + /* 2835 + * Set transaction id's of transactions that have to be committed 2836 + * to finish f[data]sync. We set them to currently running transaction 2837 + * as we cannot be sure that the inode or some of its metadata isn't 2838 + * part of the transaction - the inode could have been reclaimed and 2839 + * now it is reread from disk. 
2840 + */ 2841 + if (journal) { 2842 + tid_t tid; 2843 + 2844 + spin_lock(&journal->j_state_lock); 2845 + if (journal->j_running_transaction) 2846 + transaction = journal->j_running_transaction; 2847 + else 2848 + transaction = journal->j_committing_transaction; 2849 + if (transaction) 2850 + tid = transaction->t_tid; 2851 + else 2852 + tid = journal->j_commit_sequence; 2853 + spin_unlock(&journal->j_state_lock); 2854 + atomic_set(&ei->i_sync_tid, tid); 2855 + atomic_set(&ei->i_datasync_tid, tid); 2856 + } 2838 2857 2839 2858 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && 2840 2859 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { ··· 3044 3015 err = rc; 3045 3016 ei->i_state &= ~EXT3_STATE_NEW; 3046 3017 3018 + atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); 3047 3019 out_brelse: 3048 3020 brelse (bh); 3049 3021 ext3_std_error(inode->i_sb, err);
+2
fs/ext3/super.c
··· 466 466 return NULL; 467 467 ei->i_block_alloc_info = NULL; 468 468 ei->vfs_inode.i_version = 1; 469 + atomic_set(&ei->i_datasync_tid, 0); 470 + atomic_set(&ei->i_sync_tid, 0); 469 471 return &ei->vfs_inode; 470 472 } 471 473
+8
include/linux/ext3_fs_i.h
··· 137 137 * by other means, so we have truncate_mutex. 138 138 */ 139 139 struct mutex truncate_mutex; 140 + 141 + /* 142 + * Transactions that contain inode's metadata needed to complete 143 + * fsync and fdatasync, respectively. 144 + */ 145 + atomic_t i_sync_tid; 146 + atomic_t i_datasync_tid; 147 + 140 148 struct inode vfs_inode; 141 149 }; 142 150