ext4: Turn off multiple page-io submission by default

Jon Nelson has found a test case which causes postgresql to fail with
the error:

psql:t.sql:4: ERROR: invalid page header in block 38269 of relation base/16384/16581

Under memory pressure, it looks like part of a file can end up getting
replaced by zeros. Until we can figure out the cause, we'll roll
back the change and use block_write_full_page() instead of
ext4_bio_write_page(). The new, more efficient writing function can
be used via the mount option mblk_io_submit, so we can test and fix
the new page I/O code.

To reproduce the problem, install postgres 8.4 or 9.0, and pin enough
memory that the system is just at the edge of triggering writeback
before running the following SQL script:

begin;
create temporary table foo as select x as a, ARRAY[x] as b FROM
generate_series(1, 10000000 ) AS x;
create index foo_a_idx on foo (a);
create index foo_b_idx on foo USING GIN (b);
rollback;

If the temporary table is created on a hard drive partition which is
encrypted using dm_crypt, then under memory pressure, approximately
30-40% of the time, postgres will fail with the above error.

This patch should fix this problem, but the problem will come back if
the file system is mounted with the mblk_io_submit mount option.

Reported-by: Jon Nelson <jnelson@jamponi.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

+17 -3
+1
fs/ext4/ext4.h
··· 910 910 #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 911 911 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 912 912 #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 913 + #define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ 913 914 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 914 915 #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 915 916 #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
+4 -1
fs/ext4/inode.c
··· 2125 2125 */ 2126 2126 if (unlikely(journal_data && PageChecked(page))) 2127 2127 err = __ext4_journalled_writepage(page, len); 2128 - else 2128 + else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) 2129 2129 err = ext4_bio_write_page(&io_submit, page, 2130 2130 len, mpd->wbc); 2131 + else 2132 + err = block_write_full_page(page, 2133 + noalloc_get_block_write, mpd->wbc); 2131 2134 2132 2135 if (!err) 2133 2136 mpd->pages_written++;
+12 -2
fs/ext4/super.c
··· 1026 1026 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1027 1027 seq_puts(seq, ",nodelalloc"); 1028 1028 1029 + if (test_opt(sb, MBLK_IO_SUBMIT)) 1030 + seq_puts(seq, ",mblk_io_submit"); 1029 1031 if (sbi->s_stripe) 1030 1032 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1031 1033 /* ··· 1241 1239 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1242 1240 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, 1243 1241 Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, 1244 - Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1245 - Opt_block_validity, Opt_noblock_validity, 1242 + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, 1243 + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1246 1244 Opt_inode_readahead_blks, Opt_journal_ioprio, 1247 1245 Opt_dioread_nolock, Opt_dioread_lock, 1248 1246 Opt_discard, Opt_nodiscard, ··· 1306 1304 {Opt_resize, "resize"}, 1307 1305 {Opt_delalloc, "delalloc"}, 1308 1306 {Opt_nodelalloc, "nodelalloc"}, 1307 + {Opt_mblk_io_submit, "mblk_io_submit"}, 1308 + {Opt_nomblk_io_submit, "nomblk_io_submit"}, 1309 1309 {Opt_block_validity, "block_validity"}, 1310 1310 {Opt_noblock_validity, "noblock_validity"}, 1311 1311 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, ··· 1728 1724 break; 1729 1725 case Opt_nodelalloc: 1730 1726 clear_opt(sbi->s_mount_opt, DELALLOC); 1727 + break; 1728 + case Opt_mblk_io_submit: 1729 + set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); 1730 + break; 1731 + case Opt_nomblk_io_submit: 1732 + clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); 1731 1733 break; 1732 1734 case Opt_stripe: 1733 1735 if (match_int(&args[0], &option))