ext4: Fix Direct I/O locking

We cannot start transaction in ext4_direct_IO() and just let it last
during the whole write because dio_get_page() acquires mmap_sem which
ranks above transaction start (e.g. because we have dependency chain
mmap_sem->PageLock->journal_start, or because we update atime while
holding mmap_sem) and thus deadlocks could happen. We solve the problem
by starting a transaction separately for each ext4_get_block() call.

We *could* have a problem that we allocate a block and before its data
are written out the machine crashes and thus we expose stale data. But
that does not happen because for hole-filling generic code falls back to
buffered writes and for file extension, we add inode to orphan list and
thus in case of crash, journal replay will truncate inode back to the
original size.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by Jan Kara and committed by Theodore Ts'o 7fb5409d 8009f9fb

+53 -54
+53 -54
fs/ext4/inode.c
··· 892 892 return err; 893 893 } 894 894 895 - #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) 895 + /* Maximum number of blocks we map for direct IO at once. */ 896 + #define DIO_MAX_BLOCKS 4096 897 + /* 898 + * Number of credits we need for writing DIO_MAX_BLOCKS: 899 + * We need sb + group descriptor + bitmap + inode -> 4 900 + * For B blocks with A block pointers per block we need: 901 + * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). 902 + * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. 903 + */ 904 + #define DIO_CREDITS 25 896 905 897 906 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 898 907 unsigned long max_blocks, struct buffer_head *bh, ··· 948 939 struct buffer_head *bh_result, int create) 949 940 { 950 941 handle_t *handle = ext4_journal_current_handle(); 951 - int ret = 0; 942 + int ret = 0, started = 0; 952 943 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 953 944 954 - if (!create) 955 - goto get_block; /* A read */ 956 - 957 - if (max_blocks == 1) 958 - goto get_block; /* A single block get */ 959 - 960 - if (handle->h_transaction->t_state == T_LOCKED) { 961 - /* 962 - * Huge direct-io writes can hold off commits for long 963 - * periods of time. Let this commit run. 964 - */ 965 - ext4_journal_stop(handle); 966 - handle = ext4_journal_start(inode, DIO_CREDITS); 967 - if (IS_ERR(handle)) 945 + if (create && !handle) { 946 + /* Direct IO write... */ 947 + if (max_blocks > DIO_MAX_BLOCKS) 948 + max_blocks = DIO_MAX_BLOCKS; 949 + handle = ext4_journal_start(inode, DIO_CREDITS + 950 + 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); 951 + if (IS_ERR(handle)) { 968 952 ret = PTR_ERR(handle); 969 - goto get_block; 970 - } 971 - 972 - if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) { 973 - /* 974 - * Getting low on buffer credits... 975 - */ 976 - ret = ext4_journal_extend(handle, DIO_CREDITS); 977 - if (ret > 0) { 978 - /* 979 - * Couldn't extend the transaction. Start a new one. 980 - */ 981 - ret = ext4_journal_restart(handle, DIO_CREDITS); 953 + goto out; 982 954 } 955 + started = 1; 983 956 } 984 957 985 - get_block: 986 - if (ret == 0) { 987 - ret = ext4_get_blocks_wrap(handle, inode, iblock, 958 + ret = ext4_get_blocks_wrap(handle, inode, iblock, 988 959 max_blocks, bh_result, create, 0); 989 - if (ret > 0) { 990 - bh_result->b_size = (ret << inode->i_blkbits); 991 - ret = 0; 992 - } 960 + if (ret > 0) { 961 + bh_result->b_size = (ret << inode->i_blkbits); 962 + ret = 0; 993 963 } 964 + if (started) 965 + ext4_journal_stop(handle); 966 + out: 994 967 return ret; 995 968 } 996 969 ··· 1662 1671 * if the machine crashes during the write. 1663 1672 * 1664 1673 * If the O_DIRECT write is intantiating holes inside i_size and the machine 1665 - * crashes then stale disk data _may_ be exposed inside the file. 1674 + * crashes then stale disk data _may_ be exposed inside the file. But current 1675 + * VFS code falls back into buffered path in that case so we are safe. 1666 1676 */ 1667 1677 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 1668 1678 const struct iovec *iov, loff_t offset, ··· 1672 1680 struct file *file = iocb->ki_filp; 1673 1681 struct inode *inode = file->f_mapping->host; 1674 1682 struct ext4_inode_info *ei = EXT4_I(inode); 1675 - handle_t *handle = NULL; 1683 + handle_t *handle; 1676 1684 ssize_t ret; 1677 1685 int orphan = 0; 1678 1686 size_t count = iov_length(iov, nr_segs); ··· 1680 1688 if (rw == WRITE) { 1681 1689 loff_t final_size = offset + count; 1682 1690 1683 - handle = ext4_journal_start(inode, DIO_CREDITS); 1684 - if (IS_ERR(handle)) { 1685 - ret = PTR_ERR(handle); 1686 - goto out; 1687 - } 1688 1691 if (final_size > inode->i_size) { 1692 + /* Credits for sb + inode write */ 1693 + handle = ext4_journal_start(inode, 2); 1694 + if (IS_ERR(handle)) { 1695 + ret = PTR_ERR(handle); 1696 + goto out; 1697 + } 1689 1698 ret = ext4_orphan_add(handle, inode); 1690 - if (ret) 1691 - goto out_stop; 1699 + if (ret) { 1700 + ext4_journal_stop(handle); 1701 + goto out; 1702 + } 1692 1703 orphan = 1; 1693 1704 ei->i_disksize = inode->i_size; 1705 + ext4_journal_stop(handle); 1694 1706 } 1695 1707 } 1696 1708 ··· 1702 1706 offset, nr_segs, 1703 1707 ext4_get_block, NULL); 1704 1708 1705 - /* 1706 - * Reacquire the handle: ext4_get_block() can restart the transaction 1707 - */ 1708 - handle = ext4_journal_current_handle(); 1709 - 1710 - out_stop: 1711 - if (handle) { 1709 + if (orphan) { 1712 1710 int err; 1713 1711 1714 - if (orphan && inode->i_nlink) 1712 + /* Credits for sb + inode write */ 1713 + handle = ext4_journal_start(inode, 2); 1714 + if (IS_ERR(handle)) { 1715 + /* This is really bad luck. We've written the data 1716 + * but cannot extend i_size. Bail out and pretend 1717 + * the write failed... */ 1718 + ret = PTR_ERR(handle); 1719 + goto out; 1720 + } 1721 + if (inode->i_nlink) 1715 1722 ext4_orphan_del(handle, inode); 1716 - if (orphan && ret > 0) { 1723 + if (ret > 0) { 1717 1724 loff_t end = offset + ret; 1718 1725 if (end > inode->i_size) { 1719 1726 ei->i_disksize = end;