ext4: Fix Direct I/O locking

We cannot start transaction in ext4_direct_IO() and just let it last
during the whole write because dio_get_page() acquires mmap_sem which
ranks above transaction start (e.g. because we have dependency chain
mmap_sem->PageLock->journal_start, or because we update atime while
holding mmap_sem) and thus deadlocks could happen. We solve the problem
by starting a transaction separately for each ext4_get_block() call.

We *could* have a problem that we allocate a block and before its data
are written out the machine crashes and thus we expose stale data. But
that does not happen because for hole-filling generic code falls back to
buffered writes and for file extension, we add inode to orphan list and
thus in case of crash, journal replay will truncate inode back to the
original size.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by Jan Kara and committed by Theodore Ts'o 7fb5409d 8009f9fb

+53 -54
+53 -54
fs/ext4/inode.c
··· 892 return err; 893 } 894 895 - #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) 896 897 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 898 unsigned long max_blocks, struct buffer_head *bh, ··· 948 struct buffer_head *bh_result, int create) 949 { 950 handle_t *handle = ext4_journal_current_handle(); 951 - int ret = 0; 952 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 953 954 - if (!create) 955 - goto get_block; /* A read */ 956 - 957 - if (max_blocks == 1) 958 - goto get_block; /* A single block get */ 959 - 960 - if (handle->h_transaction->t_state == T_LOCKED) { 961 - /* 962 - * Huge direct-io writes can hold off commits for long 963 - * periods of time. Let this commit run. 964 - */ 965 - ext4_journal_stop(handle); 966 - handle = ext4_journal_start(inode, DIO_CREDITS); 967 - if (IS_ERR(handle)) 968 ret = PTR_ERR(handle); 969 - goto get_block; 970 - } 971 - 972 - if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) { 973 - /* 974 - * Getting low on buffer credits... 975 - */ 976 - ret = ext4_journal_extend(handle, DIO_CREDITS); 977 - if (ret > 0) { 978 - /* 979 - * Couldn't extend the transaction. Start a new one. 980 - */ 981 - ret = ext4_journal_restart(handle, DIO_CREDITS); 982 } 983 } 984 985 - get_block: 986 - if (ret == 0) { 987 - ret = ext4_get_blocks_wrap(handle, inode, iblock, 988 max_blocks, bh_result, create, 0); 989 - if (ret > 0) { 990 - bh_result->b_size = (ret << inode->i_blkbits); 991 - ret = 0; 992 - } 993 } 994 return ret; 995 } 996 ··· 1662 * if the machine crashes during the write. 1663 * 1664 * If the O_DIRECT write is intantiating holes inside i_size and the machine 1665 - * crashes then stale disk data _may_ be exposed inside the file. 1666 */ 1667 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 1668 const struct iovec *iov, loff_t offset, ··· 1672 struct file *file = iocb->ki_filp; 1673 struct inode *inode = file->f_mapping->host; 1674 struct ext4_inode_info *ei = EXT4_I(inode); 1675 - handle_t *handle = NULL; 1676 ssize_t ret; 1677 int orphan = 0; 1678 size_t count = iov_length(iov, nr_segs); ··· 1680 if (rw == WRITE) { 1681 loff_t final_size = offset + count; 1682 1683 - handle = ext4_journal_start(inode, DIO_CREDITS); 1684 - if (IS_ERR(handle)) { 1685 - ret = PTR_ERR(handle); 1686 - goto out; 1687 - } 1688 if (final_size > inode->i_size) { 1689 ret = ext4_orphan_add(handle, inode); 1690 - if (ret) 1691 - goto out_stop; 1692 orphan = 1; 1693 ei->i_disksize = inode->i_size; 1694 } 1695 } 1696 ··· 1702 offset, nr_segs, 1703 ext4_get_block, NULL); 1704 1705 - /* 1706 - * Reacquire the handle: ext4_get_block() can restart the transaction 1707 - */ 1708 - handle = ext4_journal_current_handle(); 1709 - 1710 - out_stop: 1711 - if (handle) { 1712 int err; 1713 1714 - if (orphan && inode->i_nlink) 1715 ext4_orphan_del(handle, inode); 1716 - if (orphan && ret > 0) { 1717 loff_t end = offset + ret; 1718 if (end > inode->i_size) { 1719 ei->i_disksize = end;
··· 892 return err; 893 } 894 895 + /* Maximum number of blocks we map for direct IO at once. */ 896 + #define DIO_MAX_BLOCKS 4096 897 + /* 898 + * Number of credits we need for writing DIO_MAX_BLOCKS: 899 + * We need sb + group descriptor + bitmap + inode -> 4 900 + * For B blocks with A block pointers per block we need: 901 + * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). 902 + * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. 903 + */ 904 + #define DIO_CREDITS 25 905 906 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 907 unsigned long max_blocks, struct buffer_head *bh, ··· 939 struct buffer_head *bh_result, int create) 940 { 941 handle_t *handle = ext4_journal_current_handle(); 942 + int ret = 0, started = 0; 943 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 944 945 + if (create && !handle) { 946 + /* Direct IO write... */ 947 + if (max_blocks > DIO_MAX_BLOCKS) 948 + max_blocks = DIO_MAX_BLOCKS; 949 + handle = ext4_journal_start(inode, DIO_CREDITS + 950 + 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); 951 + if (IS_ERR(handle)) { 952 ret = PTR_ERR(handle); 953 + goto out; 954 } 955 + started = 1; 956 } 957 958 + ret = ext4_get_blocks_wrap(handle, inode, iblock, 959 max_blocks, bh_result, create, 0); 960 + if (ret > 0) { 961 + bh_result->b_size = (ret << inode->i_blkbits); 962 + ret = 0; 963 } 964 + if (started) 965 + ext4_journal_stop(handle); 966 + out: 967 return ret; 968 } 969 ··· 1671 * if the machine crashes during the write. 1672 * 1673 * If the O_DIRECT write is intantiating holes inside i_size and the machine 1674 + * crashes then stale disk data _may_ be exposed inside the file. But current 1675 + * VFS code falls back into buffered path in that case so we are safe. 1676 */ 1677 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 1678 const struct iovec *iov, loff_t offset, ··· 1680 struct file *file = iocb->ki_filp; 1681 struct inode *inode = file->f_mapping->host; 1682 struct ext4_inode_info *ei = EXT4_I(inode); 1683 + handle_t *handle; 1684 ssize_t ret; 1685 int orphan = 0; 1686 size_t count = iov_length(iov, nr_segs); ··· 1688 if (rw == WRITE) { 1689 loff_t final_size = offset + count; 1690 1691 if (final_size > inode->i_size) { 1692 + /* Credits for sb + inode write */ 1693 + handle = ext4_journal_start(inode, 2); 1694 + if (IS_ERR(handle)) { 1695 + ret = PTR_ERR(handle); 1696 + goto out; 1697 + } 1698 ret = ext4_orphan_add(handle, inode); 1699 + if (ret) { 1700 + ext4_journal_stop(handle); 1701 + goto out; 1702 + } 1703 orphan = 1; 1704 ei->i_disksize = inode->i_size; 1705 + ext4_journal_stop(handle); 1706 } 1707 } 1708 ··· 1706 offset, nr_segs, 1707 ext4_get_block, NULL); 1708 1709 + if (orphan) { 1710 int err; 1711 1712 + /* Credits for sb + inode write */ 1713 + handle = ext4_journal_start(inode, 2); 1714 + if (IS_ERR(handle)) { 1715 + /* This is really bad luck. We've written the data 1716 + * but cannot extend i_size. Bail out and pretend 1717 + * the write failed... */ 1718 + ret = PTR_ERR(handle); 1719 + goto out; 1720 + } 1721 + if (inode->i_nlink) 1722 ext4_orphan_del(handle, inode); 1723 + if (ret > 0) { 1724 loff_t end = offset + ret; 1725 if (end > inode->i_size) { 1726 ei->i_disksize = end;