Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: pre-zero allocated blocks for DAX IO

Currently ext4 treats DAX IO the same way as direct IO. I.e., it
allocates unwritten extents before IO is done and converts unwritten
extents afterwards. However this way DAX IO can race with page fault to
the same area:

ext4_ext_direct_IO() dax_fault()
dax_io()
get_block() - allocates unwritten extent
copy_from_iter_pmem()
get_block() - converts
unwritten block to
written and zeroes it
out
ext4_convert_unwritten_extents()

So data written with DAX IO gets lost. Similarly dax_new_buf() called
from dax_io() can overwrite data that has been already written to the
block via mmap.

Fix the problem by using pre-zeroed blocks for DAX IO the same way as we
use them for DAX mmap. The downside of this solution is that every
allocating write writes each block twice (once zeros, once data). Fixing
the race with locking is possible as well however we would need to
lock-out faults for the whole range written to by DAX IO. And that is
not easy to do without locking-out faults for the whole file which seems
too aggressive.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Jan Kara and committed by
Theodore Ts'o
12735f88 914f82a3

+44 -14
+9 -2
fs/ext4/ext4.h
··· 2527 2527 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2528 2528 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, 2529 2529 struct buffer_head *bh_result, int create); 2530 - int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 2531 - struct buffer_head *bh_result, int create); 2530 + int ext4_dax_get_block(struct inode *inode, sector_t iblock, 2531 + struct buffer_head *bh_result, int create); 2532 2532 int ext4_get_block(struct inode *inode, sector_t iblock, 2533 2533 struct buffer_head *bh_result, int create); 2534 2534 int ext4_dio_get_block(struct inode *inode, sector_t iblock, ··· 3332 3332 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 3333 3333 wake_up_all(ext4_ioend_wq(inode)); 3334 3334 } 3335 + } 3336 + 3337 + static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) 3338 + { 3339 + int blksize = 1 << inode->i_blkbits; 3340 + 3341 + return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); 3335 3342 } 3336 3343 3337 3344 #endif /* __KERNEL__ */
+2 -2
fs/ext4/file.c
··· 207 207 if (IS_ERR(handle)) 208 208 result = VM_FAULT_SIGBUS; 209 209 else 210 - result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); 210 + result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL); 211 211 212 212 if (write) { 213 213 if (!IS_ERR(handle)) ··· 243 243 result = VM_FAULT_SIGBUS; 244 244 else 245 245 result = __dax_pmd_fault(vma, addr, pmd, flags, 246 - ext4_dax_mmap_get_block, NULL); 246 + ext4_dax_get_block, NULL); 247 247 248 248 if (write) { 249 249 if (!IS_ERR(handle))
+33 -10
fs/ext4/inode.c
··· 3229 3229 } 3230 3230 3231 3231 #ifdef CONFIG_FS_DAX 3232 - int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 3233 - struct buffer_head *bh_result, int create) 3232 + /* 3233 + * Get block function for DAX IO and mmap faults. It takes care of converting 3234 + * unwritten extents to written ones and initializes new / converted blocks 3235 + * to zeros. 3236 + */ 3237 + int ext4_dax_get_block(struct inode *inode, sector_t iblock, 3238 + struct buffer_head *bh_result, int create) 3234 3239 { 3235 3240 int ret; 3236 3241 3237 - ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", 3238 - inode->i_ino, create); 3242 + ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); 3239 3243 if (!create) 3240 3244 return _ext4_get_block(inode, iblock, bh_result, 0); 3241 3245 ··· 3251 3247 3252 3248 if (buffer_unwritten(bh_result)) { 3253 3249 /* 3254 - * We are protected by i_mmap_sem so we know block cannot go 3255 - * away from under us even though we dropped i_data_sem. 3256 - * Convert extent to written and write zeros there. 3250 + * We are protected by i_mmap_sem or i_mutex so we know block 3251 + * cannot go away from under us even though we dropped 3252 + * i_data_sem. Convert extent to written and write zeros there. 3257 3253 */ 3258 3254 ret = ext4_get_block_trans(inode, iblock, bh_result, 3259 3255 EXT4_GET_BLOCKS_CONVERT | ··· 3266 3262 * doesn't attempt to zero blocks again in a racy way. 3267 3263 */ 3268 3264 clear_buffer_new(bh_result); 3265 + return 0; 3266 + } 3267 + #else 3268 + /* Just define empty function, it will never get called. */ 3269 + int ext4_dax_get_block(struct inode *inode, sector_t iblock, 3270 + struct buffer_head *bh_result, int create) 3271 + { 3272 + BUG(); 3269 3273 return 0; 3270 3274 } 3271 3275 #endif ··· 3397 3385 iocb->private = NULL; 3398 3386 if (overwrite) 3399 3387 get_block_func = ext4_dio_get_block_overwrite; 3400 - else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || 3401 - round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { 3388 + else if (IS_DAX(inode)) { 3389 + /* 3390 + * We can avoid zeroing for aligned DAX writes beyond EOF. Other 3391 + * writes need zeroing either because they can race with page 3392 + * faults or because they use partial blocks. 3393 + */ 3394 + if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && 3395 + ext4_aligned_io(inode, offset, count)) 3396 + get_block_func = ext4_dio_get_block; 3397 + else 3398 + get_block_func = ext4_dax_get_block; 3399 + dio_flags = DIO_LOCKING; 3400 + } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || 3401 + round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { 3402 3402 get_block_func = ext4_dio_get_block; 3403 3403 dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; 3404 3404 } else if (is_sync_kiocb(iocb)) { ··· 3424 3400 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); 3425 3401 #endif 3426 3402 if (IS_DAX(inode)) { 3427 - dio_flags &= ~DIO_SKIP_HOLES; 3428 3403 ret = dax_do_io(iocb, inode, iter, offset, get_block_func, 3429 3404 ext4_end_io_dio, dio_flags); 3430 3405 } else