Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: fix races between page faults and hole punching

Currently, page faults and hole punching are completely unsynchronized.
This can result in a page fault faulting in a page into a range that we
are punching after truncate_pagecache_range() has been called and thus
we can end up with a page mapped to disk blocks that will be shortly
freed. Filesystem corruption will shortly follow. Note that the same
race is avoided for truncate by checking the page fault offset against
i_size but there isn't a similar mechanism available for punching holes.

Fix the problem by creating a new rw semaphore i_mmap_sem in the inode and
grabbing it for writing over truncate, hole punching, and other functions
removing blocks from the extent tree, and for reading over page faults. We
cannot easily use i_data_sem for this since that ranks below transaction
start and we need something ranking above it so that it can be held over
the whole truncate / hole punching operation. Also remove various
workarounds we had in the code to reduce the race window when a page fault
could have created pages with stale mapping information.

Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Jan Kara and committed by
Theodore Ts'o
ea3d7209 f41683a2

+127 -42
+10
fs/ext4/ext4.h
··· 910 910 * by other means, so we have i_data_sem. 911 911 */ 912 912 struct rw_semaphore i_data_sem; 913 + /* 914 + * i_mmap_sem is for serializing page faults with truncate / punch hole 915 + * operations. We have to make sure that new page cannot be faulted in 916 + * a section of the inode that is being punched. We cannot easily use 917 + * i_data_sem for this since we need protection for the whole punch 918 + * operation and i_data_sem ranks below transaction start so we have 919 + * to occasionally drop it. 920 + */ 921 + struct rw_semaphore i_mmap_sem; 913 922 struct inode vfs_inode; 914 923 struct jbd2_inode *jinode; 915 924 ··· 2493 2484 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 2494 2485 loff_t lstart, loff_t lend); 2495 2486 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2487 + extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); 2496 2488 extern qsize_t *ext4_get_reserved_space(struct inode *inode); 2497 2489 extern void ext4_da_update_reserve_space(struct inode *inode, 2498 2490 int used, int quota_claim);
+30 -24
fs/ext4/extents.c
··· 4770 4770 int partial_begin, partial_end; 4771 4771 loff_t start, end; 4772 4772 ext4_lblk_t lblk; 4773 - struct address_space *mapping = inode->i_mapping; 4774 4773 unsigned int blkbits = inode->i_blkbits; 4775 4774 4776 4775 trace_ext4_zero_range(inode, offset, len, mode); ··· 4780 4781 /* Call ext4_force_commit to flush all data in case of data=journal. */ 4781 4782 if (ext4_should_journal_data(inode)) { 4782 4783 ret = ext4_force_commit(inode->i_sb); 4783 - if (ret) 4784 - return ret; 4785 - } 4786 - 4787 - /* 4788 - * Write out all dirty pages to avoid race conditions 4789 - * Then release them. 4790 - */ 4791 - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4792 - ret = filemap_write_and_wait_range(mapping, offset, 4793 - offset + len - 1); 4794 4784 if (ret) 4795 4785 return ret; 4796 4786 } ··· 4844 4856 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | 4845 4857 EXT4_EX_NOCACHE); 4846 4858 4847 - /* Now release the pages and zero block aligned part of pages*/ 4848 - truncate_pagecache_range(inode, start, end - 1); 4849 - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4850 - 4851 4859 /* Wait all existing dio workers, newcomers will block on i_mutex */ 4852 4860 ext4_inode_block_unlocked_dio(inode); 4853 4861 inode_dio_wait(inode); 4854 4862 4863 + /* 4864 + * Prevent page faults from reinstantiating pages we have 4865 + * released from page cache. 
4866 + */ 4867 + down_write(&EXT4_I(inode)->i_mmap_sem); 4868 + /* Now release the pages and zero block aligned part of pages */ 4869 + truncate_pagecache_range(inode, start, end - 1); 4870 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4871 + 4855 4872 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4856 4873 flags, mode); 4874 + up_write(&EXT4_I(inode)->i_mmap_sem); 4857 4875 if (ret) 4858 4876 goto out_dio; 4859 4877 } ··· 5518 5524 goto out_mutex; 5519 5525 } 5520 5526 5521 - truncate_pagecache(inode, ioffset); 5522 - 5523 5527 /* Wait for existing dio to complete */ 5524 5528 ext4_inode_block_unlocked_dio(inode); 5525 5529 inode_dio_wait(inode); 5530 + 5531 + /* 5532 + * Prevent page faults from reinstantiating pages we have released from 5533 + * page cache. 5534 + */ 5535 + down_write(&EXT4_I(inode)->i_mmap_sem); 5536 + truncate_pagecache(inode, ioffset); 5526 5537 5527 5538 credits = ext4_writepage_trans_blocks(inode); 5528 5539 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5529 5540 if (IS_ERR(handle)) { 5530 5541 ret = PTR_ERR(handle); 5531 - goto out_dio; 5542 + goto out_mmap; 5532 5543 } 5533 5544 5534 5545 down_write(&EXT4_I(inode)->i_data_sem); ··· 5572 5573 5573 5574 out_stop: 5574 5575 ext4_journal_stop(handle); 5575 - out_dio: 5576 + out_mmap: 5577 + up_write(&EXT4_I(inode)->i_mmap_sem); 5576 5578 ext4_inode_resume_unlocked_dio(inode); 5577 5579 out_mutex: 5578 5580 mutex_unlock(&inode->i_mutex); ··· 5660 5660 goto out_mutex; 5661 5661 } 5662 5662 5663 - truncate_pagecache(inode, ioffset); 5664 - 5665 5663 /* Wait for existing dio to complete */ 5666 5664 ext4_inode_block_unlocked_dio(inode); 5667 5665 inode_dio_wait(inode); 5666 + 5667 + /* 5668 + * Prevent page faults from reinstantiating pages we have released from 5669 + * page cache. 
5670 + */ 5671 + down_write(&EXT4_I(inode)->i_mmap_sem); 5672 + truncate_pagecache(inode, ioffset); 5668 5673 5669 5674 credits = ext4_writepage_trans_blocks(inode); 5670 5675 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5671 5676 if (IS_ERR(handle)) { 5672 5677 ret = PTR_ERR(handle); 5673 - goto out_dio; 5678 + goto out_mmap; 5674 5679 } 5675 5680 5676 5681 /* Expand file to avoid data loss if there is error while shifting */ ··· 5746 5741 5747 5742 out_stop: 5748 5743 ext4_journal_stop(handle); 5749 - out_dio: 5744 + out_mmap: 5745 + up_write(&EXT4_I(inode)->i_mmap_sem); 5750 5746 ext4_inode_resume_unlocked_dio(inode); 5751 5747 out_mutex: 5752 5748 mutex_unlock(&inode->i_mutex);
+57 -9
fs/ext4/file.c
··· 209 209 { 210 210 int result; 211 211 handle_t *handle = NULL; 212 - struct super_block *sb = file_inode(vma->vm_file)->i_sb; 212 + struct inode *inode = file_inode(vma->vm_file); 213 + struct super_block *sb = inode->i_sb; 213 214 bool write = vmf->flags & FAULT_FLAG_WRITE; 214 215 215 216 if (write) { 216 217 sb_start_pagefault(sb); 217 218 file_update_time(vma->vm_file); 219 + down_read(&EXT4_I(inode)->i_mmap_sem); 218 220 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, 219 221 EXT4_DATA_TRANS_BLOCKS(sb)); 220 - } 222 + } else 223 + down_read(&EXT4_I(inode)->i_mmap_sem); 221 224 222 225 if (IS_ERR(handle)) 223 226 result = VM_FAULT_SIGBUS; ··· 231 228 if (write) { 232 229 if (!IS_ERR(handle)) 233 230 ext4_journal_stop(handle); 231 + up_read(&EXT4_I(inode)->i_mmap_sem); 234 232 sb_end_pagefault(sb); 235 - } 233 + } else 234 + up_read(&EXT4_I(inode)->i_mmap_sem); 236 235 237 236 return result; 238 237 } ··· 251 246 if (write) { 252 247 sb_start_pagefault(sb); 253 248 file_update_time(vma->vm_file); 249 + down_read(&EXT4_I(inode)->i_mmap_sem); 254 250 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, 255 251 ext4_chunk_trans_blocks(inode, 256 252 PMD_SIZE / PAGE_SIZE)); 257 - } 253 + } else 254 + down_read(&EXT4_I(inode)->i_mmap_sem); 258 255 259 256 if (IS_ERR(handle)) 260 257 result = VM_FAULT_SIGBUS; ··· 267 260 if (write) { 268 261 if (!IS_ERR(handle)) 269 262 ext4_journal_stop(handle); 263 + up_read(&EXT4_I(inode)->i_mmap_sem); 270 264 sb_end_pagefault(sb); 271 - } 265 + } else 266 + up_read(&EXT4_I(inode)->i_mmap_sem); 272 267 273 268 return result; 274 269 } 275 270 276 271 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 277 272 { 278 - return dax_mkwrite(vma, vmf, ext4_get_block_dax, 279 - ext4_end_io_unwritten); 273 + int err; 274 + struct inode *inode = file_inode(vma->vm_file); 275 + 276 + sb_start_pagefault(inode->i_sb); 277 + file_update_time(vma->vm_file); 278 + down_read(&EXT4_I(inode)->i_mmap_sem); 279 + 
err = __dax_mkwrite(vma, vmf, ext4_get_block_dax, 280 + ext4_end_io_unwritten); 281 + up_read(&EXT4_I(inode)->i_mmap_sem); 282 + sb_end_pagefault(inode->i_sb); 283 + 284 + return err; 285 + } 286 + 287 + /* 288 + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() 289 + * handler we check for races agaist truncate. Note that since we cycle through 290 + * i_mmap_sem, we are sure that also any hole punching that began before we 291 + * were called is finished by now and so if it included part of the file we 292 + * are working on, our pte will get unmapped and the check for pte_same() in 293 + * wp_pfn_shared() fails. Thus fault gets retried and things work out as 294 + * desired. 295 + */ 296 + static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, 297 + struct vm_fault *vmf) 298 + { 299 + struct inode *inode = file_inode(vma->vm_file); 300 + struct super_block *sb = inode->i_sb; 301 + int ret = VM_FAULT_NOPAGE; 302 + loff_t size; 303 + 304 + sb_start_pagefault(sb); 305 + file_update_time(vma->vm_file); 306 + down_read(&EXT4_I(inode)->i_mmap_sem); 307 + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 308 + if (vmf->pgoff >= size) 309 + ret = VM_FAULT_SIGBUS; 310 + up_read(&EXT4_I(inode)->i_mmap_sem); 311 + sb_end_pagefault(sb); 312 + 313 + return ret; 280 314 } 281 315 282 316 static const struct vm_operations_struct ext4_dax_vm_ops = { 283 317 .fault = ext4_dax_fault, 284 318 .pmd_fault = ext4_dax_pmd_fault, 285 319 .page_mkwrite = ext4_dax_mkwrite, 286 - .pfn_mkwrite = dax_pfn_mkwrite, 320 + .pfn_mkwrite = ext4_dax_pfn_mkwrite, 287 321 }; 288 322 #else 289 323 #define ext4_dax_vm_ops ext4_file_vm_ops 290 324 #endif 291 325 292 326 static const struct vm_operations_struct ext4_file_vm_ops = { 293 - .fault = filemap_fault, 327 + .fault = ext4_filemap_fault, 294 328 .map_pages = filemap_map_pages, 295 329 .page_mkwrite = ext4_page_mkwrite, 296 330 };
+27 -9
fs/ext4/inode.c
··· 3623 3623 3624 3624 } 3625 3625 3626 + /* Wait all existing dio workers, newcomers will block on i_mutex */ 3627 + ext4_inode_block_unlocked_dio(inode); 3628 + inode_dio_wait(inode); 3629 + 3630 + /* 3631 + * Prevent page faults from reinstantiating pages we have released from 3632 + * page cache. 3633 + */ 3634 + down_write(&EXT4_I(inode)->i_mmap_sem); 3626 3635 first_block_offset = round_up(offset, sb->s_blocksize); 3627 3636 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 3628 3637 ··· 3639 3630 if (last_block_offset > first_block_offset) 3640 3631 truncate_pagecache_range(inode, first_block_offset, 3641 3632 last_block_offset); 3642 - 3643 - /* Wait all existing dio workers, newcomers will block on i_mutex */ 3644 - ext4_inode_block_unlocked_dio(inode); 3645 - inode_dio_wait(inode); 3646 3633 3647 3634 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3648 3635 credits = ext4_writepage_trans_blocks(inode); ··· 3685 3680 if (IS_SYNC(inode)) 3686 3681 ext4_handle_sync(handle); 3687 3682 3688 - /* Now release the pages again to reduce race window */ 3689 - if (last_block_offset > first_block_offset) 3690 - truncate_pagecache_range(inode, first_block_offset, 3691 - last_block_offset); 3692 - 3693 3683 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3694 3684 ext4_mark_inode_dirty(handle, inode); 3695 3685 out_stop: 3696 3686 ext4_journal_stop(handle); 3697 3687 out_dio: 3688 + up_write(&EXT4_I(inode)->i_mmap_sem); 3698 3689 ext4_inode_resume_unlocked_dio(inode); 3699 3690 out_mutex: 3700 3691 mutex_unlock(&inode->i_mutex); ··· 4824 4823 } else 4825 4824 ext4_wait_for_tail_page_commit(inode); 4826 4825 } 4826 + down_write(&EXT4_I(inode)->i_mmap_sem); 4827 4827 /* 4828 4828 * Truncate pagecache after we've waited for commit 4829 4829 * in data=journal mode to make pages freeable. 
··· 4832 4830 truncate_pagecache(inode, inode->i_size); 4833 4831 if (shrink) 4834 4832 ext4_truncate(inode); 4833 + up_write(&EXT4_I(inode)->i_mmap_sem); 4835 4834 } 4836 4835 4837 4836 if (!rc) { ··· 5281 5278 5282 5279 sb_start_pagefault(inode->i_sb); 5283 5280 file_update_time(vma->vm_file); 5281 + 5282 + down_read(&EXT4_I(inode)->i_mmap_sem); 5284 5283 /* Delalloc case is easy... */ 5285 5284 if (test_opt(inode->i_sb, DELALLOC) && 5286 5285 !ext4_should_journal_data(inode) && ··· 5352 5347 out_ret: 5353 5348 ret = block_page_mkwrite_return(ret); 5354 5349 out: 5350 + up_read(&EXT4_I(inode)->i_mmap_sem); 5355 5351 sb_end_pagefault(inode->i_sb); 5356 5352 return ret; 5353 + } 5354 + 5355 + int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 5356 + { 5357 + struct inode *inode = file_inode(vma->vm_file); 5358 + int err; 5359 + 5360 + down_read(&EXT4_I(inode)->i_mmap_sem); 5361 + err = filemap_fault(vma, vmf); 5362 + up_read(&EXT4_I(inode)->i_mmap_sem); 5363 + 5364 + return err; 5357 5365 }
+1
fs/ext4/super.c
··· 958 958 INIT_LIST_HEAD(&ei->i_orphan); 959 959 init_rwsem(&ei->xattr_sem); 960 960 init_rwsem(&ei->i_data_sem); 961 + init_rwsem(&ei->i_mmap_sem); 961 962 inode_init_once(&ei->vfs_inode); 962 963 } 963 964
+2
fs/ext4/truncate.h
··· 10 10 */ 11 11 static inline void ext4_truncate_failed_write(struct inode *inode) 12 12 { 13 + down_write(&EXT4_I(inode)->i_mmap_sem); 13 14 truncate_inode_pages(inode->i_mapping, inode->i_size); 14 15 ext4_truncate(inode); 16 + up_write(&EXT4_I(inode)->i_mmap_sem); 15 17 } 16 18 17 19 /*