Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

jbd2: add support for avoiding data writes during transaction commits

Currently, when a filesystem needs to make sure data is on permanent
storage before committing a transaction, it adds the inode to the
transaction's inode list. During transaction commit, jbd2 writes back
all dirty buffers that have allocated underlying blocks and waits for
the IO to finish. However, when doing writeback for delayed-allocated
data, we allocate blocks and immediately submit the data. Thus, asking
jbd2 to write dirty pages just adds unnecessary work for jbd2 and may
cause it to write back other redirtied blocks.

Add support to jbd2 that allows a filesystem to ask it to only wait for
outstanding data writes before committing a transaction, thus avoiding
unnecessary writes.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

Authored by Jan Kara and committed by Theodore Ts'o
41617e1a 3957ef53

+38 -9
+2 -1
fs/ext4/ext4_jbd2.h
··· 362 362 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 363 363 { 364 364 if (ext4_handle_valid(handle)) 365 - return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); 365 + return jbd2_journal_inode_add_write(handle, 366 + EXT4_I(inode)->jinode); 366 367 return 0; 367 368 } 368 369
+4
fs/jbd2/commit.c
··· 219 219 220 220 spin_lock(&journal->j_list_lock); 221 221 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 222 + if (!(jinode->i_flags & JI_WRITE_DATA)) 223 + continue; 222 224 mapping = jinode->i_vfs_inode->i_mapping; 223 225 jinode->i_flags |= JI_COMMIT_RUNNING; 224 226 spin_unlock(&journal->j_list_lock); ··· 258 256 /* For locking, see the comment in journal_submit_data_buffers() */ 259 257 spin_lock(&journal->j_list_lock); 260 258 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 259 + if (!(jinode->i_flags & JI_WAIT_DATA)) 260 + continue; 261 261 jinode->i_flags |= JI_COMMIT_RUNNING; 262 262 spin_unlock(&journal->j_list_lock); 263 263 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+2 -1
fs/jbd2/journal.c
··· 94 94 EXPORT_SYMBOL(jbd2_journal_invalidatepage); 95 95 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); 96 96 EXPORT_SYMBOL(jbd2_journal_force_commit); 97 - EXPORT_SYMBOL(jbd2_journal_file_inode); 97 + EXPORT_SYMBOL(jbd2_journal_inode_add_write); 98 + EXPORT_SYMBOL(jbd2_journal_inode_add_wait); 98 99 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 99 100 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 100 101 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
+18 -4
fs/jbd2/transaction.c
··· 2462 2462 /* 2463 2463 * File inode in the inode list of the handle's transaction 2464 2464 */ 2465 - int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) 2465 + static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, 2466 + unsigned long flags) 2466 2467 { 2467 2468 transaction_t *transaction = handle->h_transaction; 2468 2469 journal_t *journal; ··· 2488 2487 * and if jinode->i_next_transaction == transaction, commit code 2489 2488 * will only file the inode where we want it. 2490 2489 */ 2491 - if (jinode->i_transaction == transaction || 2492 - jinode->i_next_transaction == transaction) 2490 + if ((jinode->i_transaction == transaction || 2491 + jinode->i_next_transaction == transaction) && 2492 + (jinode->i_flags & flags) == flags) 2493 2493 return 0; 2494 2494 2495 2495 spin_lock(&journal->j_list_lock); 2496 - 2496 + jinode->i_flags |= flags; 2497 + /* Is inode already attached where we need it? */ 2497 2498 if (jinode->i_transaction == transaction || 2498 2499 jinode->i_next_transaction == transaction) 2499 2500 goto done; ··· 2524 2521 spin_unlock(&journal->j_list_lock); 2525 2522 2526 2523 return 0; 2524 + } 2525 + 2526 + int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) 2527 + { 2528 + return jbd2_journal_file_inode(handle, jinode, 2529 + JI_WRITE_DATA | JI_WAIT_DATA); 2530 + } 2531 + 2532 + int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) 2533 + { 2534 + return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA); 2527 2535 } 2528 2536 2529 2537 /*
+1 -1
fs/ocfs2/journal.h
··· 619 619 620 620 static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) 621 621 { 622 - return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); 622 + return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); 623 623 } 624 624 625 625 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
+11 -2
include/linux/jbd2.h
··· 403 403 404 404 /* Flags in jbd_inode->i_flags */ 405 405 #define __JI_COMMIT_RUNNING 0 406 - /* Commit of the inode data in progress. We use this flag to protect us from 406 + #define __JI_WRITE_DATA 1 407 + #define __JI_WAIT_DATA 2 408 + 409 + /* 410 + * Commit of the inode data in progress. We use this flag to protect us from 407 411 * concurrent deletion of inode. We cannot use reference to inode for this 408 412 * since we cannot afford doing last iput() on behalf of kjournald 409 413 */ 410 414 #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) 415 + /* Write allocated dirty buffers in this inode before commit */ 416 + #define JI_WRITE_DATA (1 << __JI_WRITE_DATA) 417 + /* Wait for outstanding data writes for this inode before commit */ 418 + #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) 411 419 412 420 /** 413 421 * struct jbd_inode is the structure linking inodes in ordered mode ··· 1278 1270 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); 1279 1271 extern int jbd2_journal_force_commit(journal_t *); 1280 1272 extern int jbd2_journal_force_commit_nested(journal_t *); 1281 - extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); 1273 + extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode); 1274 + extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); 1282 1275 extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, 1283 1276 struct jbd2_inode *inode, loff_t new_size); 1284 1277 extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);