Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

jbd: Write journal superblock with WRITE_FUA after checkpointing

If the journal superblock is written only to the disk's caches and another
transaction starts reusing the space of a transaction cleaned from the log, it
can happen that blocks of the new transaction reach the disk before the journal
superblock. When a power failure happens in such a case, a subsequent journal
replay would still try to replay the old transaction, but some of its blocks
may already be overwritten by the new transaction. For this reason we must use
WRITE_FUA when updating the log tail, and we must first write the new log tail
to disk and update the in-memory information only after that.

Signed-off-by: Jan Kara <jack@suse.cz>

Jan Kara fd2cbd4d 1ce8486d

+65 -39
+10 -13
fs/jbd/checkpoint.c
··· 508 508 /* 509 509 * We need to make sure that any blocks that were recently written out 510 510 * --- perhaps by log_do_checkpoint() --- are flushed out before we 511 - * drop the transactions from the journal. It's unlikely this will be 512 - * necessary, especially with an appropriately sized journal, but we 513 - * need this to guarantee correctness. Fortunately 514 - * cleanup_journal_tail() doesn't get called all that often. 511 + * drop the transactions from the journal. Similarly we need to be sure 512 + * superblock makes it to disk before next transaction starts reusing 513 + * freed space (otherwise we could replay some blocks of the new 514 + * transaction thinking they belong to the old one). So we use 515 + * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially 516 + * with an appropriately sized journal, but we need this to guarantee 517 + * correctness. Fortunately cleanup_journal_tail() doesn't get called 518 + * all that often. 515 519 */ 516 - if (journal->j_flags & JFS_BARRIER) 517 - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 520 + journal_update_sb_log_tail(journal, first_tid, blocknr, 521 + WRITE_FLUSH_FUA); 518 522 519 523 spin_lock(&journal->j_state_lock); 520 - if (!tid_gt(first_tid, journal->j_tail_sequence)) { 521 - spin_unlock(&journal->j_state_lock); 522 - /* Someone else cleaned up journal so return 0 */ 523 - return 0; 524 - } 525 524 /* OK, update the superblock to recover the freed space. 526 525 * Physical blocks come first: have we wrapped beyond the end of 527 526 * the log? */ ··· 538 539 journal->j_tail_sequence = first_tid; 539 540 journal->j_tail = blocknr; 540 541 spin_unlock(&journal->j_state_lock); 541 - if (!(journal->j_flags & JFS_ABORT)) 542 - journal_update_sb_log_tail(journal); 543 542 return 0; 544 543 } 545 544
+8 -1
fs/jbd/commit.c
··· 309 309 if (journal->j_flags & JFS_FLUSHED) { 310 310 jbd_debug(3, "super block updated\n"); 311 311 mutex_lock(&journal->j_checkpoint_mutex); 312 - journal_update_sb_log_tail(journal); 312 + /* 313 + * We hold j_checkpoint_mutex so tail cannot change under us. 314 + * We don't need any special data guarantees for writing sb 315 + * since journal is empty and it is ok for write to be 316 + * flushed only with transaction commit. 317 + */ 318 + journal_update_sb_log_tail(journal, journal->j_tail_sequence, 319 + journal->j_tail, WRITE_SYNC); 313 320 mutex_unlock(&journal->j_checkpoint_mutex); 314 321 } else { 315 322 jbd_debug(3, "superblock not updated\n");
+39 -21
fs/jbd/journal.c
··· 938 938 } else { 939 939 /* Lock here to make assertions happy... */ 940 940 mutex_lock(&journal->j_checkpoint_mutex); 941 - /* Add the dynamic fields and write it to disk. */ 942 - journal_update_sb_log_tail(journal); 941 + /* 942 + * Update log tail information. We use WRITE_FUA since new 943 + * transaction will start reusing journal space and so we 944 + * must make sure information about current log tail is on 945 + * disk before that. 946 + */ 947 + journal_update_sb_log_tail(journal, 948 + journal->j_tail_sequence, 949 + journal->j_tail, 950 + WRITE_FUA); 943 951 mutex_unlock(&journal->j_checkpoint_mutex); 944 952 } 945 953 return journal_start_thread(journal); ··· 1026 1018 return journal_reset(journal); 1027 1019 } 1028 1020 1029 - static void journal_write_superblock(journal_t *journal) 1021 + static void journal_write_superblock(journal_t *journal, int write_op) 1030 1022 { 1031 1023 struct buffer_head *bh = journal->j_sb_buffer; 1024 + int ret; 1032 1025 1033 - trace_journal_write_superblock(journal); 1026 + trace_journal_write_superblock(journal, write_op); 1027 + if (!(journal->j_flags & JFS_BARRIER)) 1028 + write_op &= ~(REQ_FUA | REQ_FLUSH); 1029 + lock_buffer(bh); 1034 1030 if (buffer_write_io_error(bh)) { 1035 1031 char b[BDEVNAME_SIZE]; 1036 1032 /* ··· 1052 1040 set_buffer_uptodate(bh); 1053 1041 } 1054 1042 1055 - BUFFER_TRACE(bh, "marking dirty"); 1056 - mark_buffer_dirty(bh); 1057 - sync_dirty_buffer(bh); 1043 + get_bh(bh); 1044 + bh->b_end_io = end_buffer_write_sync; 1045 + ret = submit_bh(write_op, bh); 1046 + wait_on_buffer(bh); 1058 1047 if (buffer_write_io_error(bh)) { 1059 - char b[BDEVNAME_SIZE]; 1060 - printk(KERN_ERR "JBD: I/O error detected " 1061 - "when updating journal superblock for %s.\n", 1062 - journal_dev_name(journal, b)); 1063 1048 clear_buffer_write_io_error(bh); 1064 1049 set_buffer_uptodate(bh); 1050 + ret = -EIO; 1051 + } 1052 + if (ret) { 1053 + char b[BDEVNAME_SIZE]; 1054 + printk(KERN_ERR "JBD: Error %d detected 
" 1055 + "when updating journal superblock for %s.\n", 1056 + ret, journal_dev_name(journal, b)); 1065 1057 } 1066 1058 } 1067 1059 1068 1060 /** 1069 1061 * journal_update_sb_log_tail() - Update log tail in journal sb on disk. 1070 1062 * @journal: The journal to update. 1063 + * @tail_tid: TID of the new transaction at the tail of the log 1064 + * @tail_block: The first block of the transaction at the tail of the log 1065 + * @write_op: With which operation should we write the journal sb 1071 1066 * 1072 1067 * Update a journal's superblock information about log tail and write it to 1073 1068 * disk, waiting for the IO to complete. 1074 1069 */ 1075 - void journal_update_sb_log_tail(journal_t *journal) 1070 + void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, 1071 + unsigned int tail_block, int write_op) 1076 1072 { 1077 1073 journal_superblock_t *sb = journal->j_superblock; 1078 1074 1079 1075 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); 1080 - spin_lock(&journal->j_state_lock); 1081 - jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", 1082 - journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1076 + jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", 1077 + tail_block, tail_tid); 1083 1078 1084 - sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1085 - sb->s_start = cpu_to_be32(journal->j_tail); 1086 - spin_unlock(&journal->j_state_lock); 1079 + sb->s_sequence = cpu_to_be32(tail_tid); 1080 + sb->s_start = cpu_to_be32(tail_block); 1087 1081 1088 - journal_write_superblock(journal); 1082 + journal_write_superblock(journal, write_op); 1089 1083 1090 1084 /* Log is no longer empty */ 1091 1085 spin_lock(&journal->j_state_lock); ··· 1120 1102 sb->s_start = cpu_to_be32(0); 1121 1103 spin_unlock(&journal->j_state_lock); 1122 1104 1123 - journal_write_superblock(journal); 1105 + journal_write_superblock(journal, WRITE_FUA); 1124 1106 1125 1107 spin_lock(&journal->j_state_lock); 1126 1108 /* Log is 
empty */ ··· 1145 1127 sb->s_errno = cpu_to_be32(journal->j_errno); 1146 1128 spin_unlock(&journal->j_state_lock); 1147 1129 1148 - journal_write_superblock(journal); 1130 + journal_write_superblock(journal, WRITE_SYNC); 1149 1131 } 1150 1132 1151 1133 /*
+2 -1
include/linux/jbd.h
··· 864 864 extern int journal_recover (journal_t *journal); 865 865 extern int journal_wipe (journal_t *, int); 866 866 extern int journal_skip_recovery (journal_t *); 867 - extern void journal_update_sb_log_tail (journal_t *); 867 + extern void journal_update_sb_log_tail (journal_t *, tid_t, unsigned int, 868 + int); 868 869 extern void journal_abort (journal_t *, int); 869 870 extern int journal_errno (journal_t *); 870 871 extern void journal_ack_err (journal_t *);
+6 -3
include/trace/events/jbd.h
··· 170 170 ); 171 171 172 172 TRACE_EVENT(journal_write_superblock, 173 - TP_PROTO(journal_t *journal), 173 + TP_PROTO(journal_t *journal, int write_op), 174 174 175 - TP_ARGS(journal), 175 + TP_ARGS(journal, write_op), 176 176 177 177 TP_STRUCT__entry( 178 178 __field( dev_t, dev ) 179 + __field( int, write_op ) 179 180 ), 180 181 181 182 TP_fast_assign( 182 183 __entry->dev = journal->j_fs_dev->bd_dev; 184 + __entry->write_op = write_op; 183 185 ), 184 186 185 - TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) 187 + TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev), 188 + MINOR(__entry->dev), __entry->write_op) 186 189 ); 187 190 188 191 #endif /* _TRACE_JBD_H */