Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: add fsync batch tuning knobs

Add new mount options, min_batch_time and max_batch_time, which
control how long the jbd2 layer should wait for additional filesystem
operations to get batched with a synchronous write transaction.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

+91 -8
+29
Documentation/filesystems/ext4.txt
··· 283 283 nodelalloc Disable delayed allocation. Blocks are allocation 284 284 when data is copied from user to page cache. 285 285 286 + max_batch_time=usec Maximum amount of time ext4 should wait for 287 + additional filesystem operations to be batched 288 + together with a synchronous write operation. 289 + Since a synchronous write operation is going to 290 + force a commit and then wait for the I/O 291 + to complete, it doesn't cost much, and can be a 292 + huge throughput win, we wait for a small amount 293 + of time to see if any other transactions can 294 + piggyback on the synchronous write. The 295 + algorithm used is designed to automatically tune 296 + for the speed of the disk, by measuring the 297 + amount of time (on average) that it takes to 298 + finish committing a transaction. Call this time 299 + the "commit time". If the time that the 300 + transaction has been running is less than the 301 + commit time, ext4 will try sleeping for the 302 + commit time to see if other operations will join 303 + the transaction. The commit time is capped by 304 + the max_batch_time, which defaults to 15000us 305 + (15ms). This optimization can be turned off 306 + entirely by setting max_batch_time to 0. 307 + 308 + min_batch_time=usec This parameter sets the commit time (as 309 + described above) to be at least min_batch_time. 310 + It defaults to zero microseconds. Increasing 311 + this parameter may improve the throughput of 312 + multi-threaded, synchronous workloads on very 313 + fast disks, at the cost of increasing latency. 314 + 286 315 Data Mode 287 316 ========= 288 317 There are 3 different data modes:
+7
fs/ext4/ext4.h
··· 328 328 uid_t s_resuid; 329 329 gid_t s_resgid; 330 330 unsigned long s_commit_interval; 331 + u32 s_min_batch_time, s_max_batch_time; 331 332 #ifdef CONFIG_QUOTA 332 333 int s_jquota_fmt; 333 334 char *s_qf_names[MAXQUOTAS]; ··· 805 804 #define EXT4_DEFM_JMODE_DATA 0x0020 806 805 #define EXT4_DEFM_JMODE_ORDERED 0x0040 807 806 #define EXT4_DEFM_JMODE_WBACK 0x0060 807 + 808 + /* 809 + * Default journal batch times 810 + */ 811 + #define EXT4_DEF_MIN_BATCH_TIME 0 812 + #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ 808 813 809 814 /* 810 815 * Structure of a directory entry
+2
fs/ext4/ext4_sb.h
··· 74 74 struct journal_s *s_journal; 75 75 struct list_head s_orphan; 76 76 unsigned long s_commit_interval; 77 + u32 s_max_batch_time; 78 + u32 s_min_batch_time; 77 79 struct block_device *journal_bdev; 78 80 #ifdef CONFIG_JBD2_DEBUG 79 81 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
+40 -7
fs/ext4/super.c
··· 705 705 #endif 706 706 if (!test_opt(sb, RESERVATION)) 707 707 seq_puts(seq, ",noreservation"); 708 - if (sbi->s_commit_interval) { 708 + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { 709 709 seq_printf(seq, ",commit=%u", 710 710 (unsigned) (sbi->s_commit_interval / HZ)); 711 711 } 712 + if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { 713 + seq_printf(seq, ",min_batch_time=%u", 714 + (unsigned) sbi->s_min_batch_time); 715 + } 716 + if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { 717 + seq_printf(seq, ",max_batch_time=%u", 718 + (unsigned) sbi->s_max_batch_time); 719 + } 720 + 712 721 /* 713 722 * We're changing the default of barrier mount option, so 714 723 * let's always display its mount state so it's clear what its ··· 883 874 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 884 875 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 885 876 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 886 - Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 877 + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, 878 + Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 887 879 Opt_journal_checksum, Opt_journal_async_commit, 888 880 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 889 881 Opt_data_err_abort, Opt_data_err_ignore, ··· 923 913 {Opt_nobh, "nobh"}, 924 914 {Opt_bh, "bh"}, 925 915 {Opt_commit, "commit=%u"}, 916 + {Opt_min_batch_time, "min_batch_time=%u"}, 917 + {Opt_max_batch_time, "max_batch_time=%u"}, 926 918 {Opt_journal_update, "journal=update"}, 927 919 {Opt_journal_inum, "journal=%u"}, 928 920 {Opt_journal_dev, "journal_dev=%u"}, ··· 1142 1130 if (option == 0) 1143 1131 option = JBD2_DEFAULT_MAX_COMMIT_AGE; 1144 1132 sbi->s_commit_interval = HZ * option; 1133 + break; 1134 + case Opt_max_batch_time: 1135 + if (match_int(&args[0], &option)) 1136 + return 0; 1137 + if (option < 0) 1138 + return 0; 1139 + if (option == 0) 1140 + option = EXT4_DEF_MAX_BATCH_TIME; 
1141 + sbi->s_max_batch_time = option; 1142 + break; 1143 + case Opt_min_batch_time: 1144 + if (match_int(&args[0], &option)) 1145 + return 0; 1146 + if (option < 0) 1147 + return 0; 1148 + sbi->s_min_batch_time = option; 1145 1149 break; 1146 1150 case Opt_data_journal: 1147 1151 data_opt = EXT4_MOUNT_JOURNAL_DATA; ··· 2007 1979 2008 1980 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2009 1981 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 1982 + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 1983 + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 1984 + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2010 1985 2011 1986 set_opt(sbi->s_mount_opt, RESERVATION); 2012 1987 set_opt(sbi->s_mount_opt, BARRIER); ··· 2555 2524 { 2556 2525 struct ext4_sb_info *sbi = EXT4_SB(sb); 2557 2526 2558 - if (sbi->s_commit_interval) 2559 - journal->j_commit_interval = sbi->s_commit_interval; 2560 - /* We could also set up an ext4-specific default for the commit 2561 - * interval here, but for now we'll just fall back to the jbd 2562 - * default. 
*/ 2527 + journal->j_commit_interval = sbi->s_commit_interval; 2528 + journal->j_min_batch_time = sbi->s_min_batch_time; 2529 + journal->j_max_batch_time = sbi->s_max_batch_time; 2563 2530 2564 2531 spin_lock(&journal->j_state_lock); 2565 2532 if (test_opt(sb, BARRIER)) ··· 3071 3042 old_opts.s_resuid = sbi->s_resuid; 3072 3043 old_opts.s_resgid = sbi->s_resgid; 3073 3044 old_opts.s_commit_interval = sbi->s_commit_interval; 3045 + old_opts.s_min_batch_time = sbi->s_min_batch_time; 3046 + old_opts.s_max_batch_time = sbi->s_max_batch_time; 3074 3047 #ifdef CONFIG_QUOTA 3075 3048 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 3076 3049 for (i = 0; i < MAXQUOTAS; i++) ··· 3209 3178 sbi->s_resuid = old_opts.s_resuid; 3210 3179 sbi->s_resgid = old_opts.s_resgid; 3211 3180 sbi->s_commit_interval = old_opts.s_commit_interval; 3181 + sbi->s_min_batch_time = old_opts.s_min_batch_time; 3182 + sbi->s_max_batch_time = old_opts.s_max_batch_time; 3212 3183 #ifdef CONFIG_QUOTA 3213 3184 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 3214 3185 for (i = 0; i < MAXQUOTAS; i++) {
+2
fs/jbd2/journal.c
··· 964 964 spin_lock_init(&journal->j_state_lock); 965 965 966 966 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 967 + journal->j_min_batch_time = 0; 968 + journal->j_max_batch_time = 15000; /* 15ms */ 967 969 968 970 /* The journal is marked for error until we succeed with recovery! */ 969 971 journal->j_flags = JBD2_ABORT;
+3 -1
fs/jbd2/transaction.c
··· 1255 1255 trans_time = ktime_to_ns(ktime_sub(ktime_get(), 1256 1256 transaction->t_start_time)); 1257 1257 1258 + commit_time = max_t(u64, commit_time, 1259 + 1000*journal->j_min_batch_time); 1258 1260 commit_time = min_t(u64, commit_time, 1259 - 1000*jiffies_to_usecs(1)); 1261 + 1000*journal->j_max_batch_time); 1260 1262 1261 1263 if (trans_time < commit_time) { 1262 1264 ktime_t expires = ktime_add_ns(ktime_get(),
+8
include/linux/jbd2.h
··· 956 956 */ 957 957 u64 j_average_commit_time; 958 958 959 + /* 960 + * minimum and maximum times that we should wait for 961 + * additional filesystem operations to get batched into a 962 + * synchronous handle in microseconds 963 + */ 964 + u32 j_min_batch_time; 965 + u32 j_max_batch_time; 966 + 959 967 /* This function is called when a transaction is closed */ 960 968 void (*j_commit_callback)(journal_t *, 961 969 transaction_t *);