
mm: replace FAULT_FLAG_SIZE with parameter to huge_fault

Since the introduction of FAULT_FLAG_SIZE to the vm_fault flags, it has
been somewhat painful to get the flags set and cleared at the correct
locations. More than one kernel oops was introduced because of the
difficulty of getting the placement right.

Remove the flag values and introduce an input parameter to huge_fault
that indicates the size of the page entry. This makes the code easier
to trace and should avoid the issues seen with the fault flags, where
the flag had to be cleared again in the fallback paths.
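
For illustration only (the example_* handlers below are invented, not
part of this patch), a huge_fault implementation under the new scheme
receives the entry size as an explicit argument and switches on it,
with no flag manipulation around the fallback paths:

static int example_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	/* pe_size names the faulting page-table level directly */
	switch (pe_size) {
	case PE_SIZE_PTE:
		return example_pte_fault(vmf);	/* first level, eg 4k */
	case PE_SIZE_PMD:
		return example_pmd_fault(vmf);	/* second level, eg 2MB */
	case PE_SIZE_PUD:
		return example_pud_fault(vmf);	/* third level, eg 1GB */
	}
	return VM_FAULT_FALLBACK;
}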

Link: http://lkml.kernel.org/r/148615748258.43180.1690152053774975329.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Dave Jiang; committed by Linus Torvalds
c791ace1 9557feee

8 files changed, 46 insertions(+), 38 deletions(-)
drivers/dax/dax.c  +12 -6

···
 }
 #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-static int dax_dev_fault(struct vm_fault *vmf)
+static int dax_dev_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
 {
 	int rc;
 	struct file *filp = vmf->vma->vm_file;
···
 			vmf->vma->vm_start, vmf->vma->vm_end);
 
 	rcu_read_lock();
-	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
-	case FAULT_FLAG_SIZE_PTE:
+	switch (pe_size) {
+	case PE_SIZE_PTE:
 		rc = __dax_dev_pte_fault(dax_dev, vmf);
 		break;
-	case FAULT_FLAG_SIZE_PMD:
+	case PE_SIZE_PMD:
 		rc = __dax_dev_pmd_fault(dax_dev, vmf);
 		break;
-	case FAULT_FLAG_SIZE_PUD:
+	case PE_SIZE_PUD:
 		rc = __dax_dev_pud_fault(dax_dev, vmf);
 		break;
 	default:
···
 	return rc;
 }
 
+static int dax_dev_fault(struct vm_fault *vmf)
+{
+	return dax_dev_huge_fault(vmf, PE_SIZE_PTE);
+}
+
 static const struct vm_operations_struct dax_dev_vm_ops = {
 	.fault = dax_dev_fault,
-	.huge_fault = dax_dev_fault,
+	.huge_fault = dax_dev_huge_fault,
 };
 
 static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
fs/dax.c  +5 -4

···
  * has done all the necessary locking for page fault to proceed
  * successfully.
  */
-int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
+int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+		    const struct iomap_ops *ops)
 {
-	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
-	case FAULT_FLAG_SIZE_PTE:
+	switch (pe_size) {
+	case PE_SIZE_PTE:
 		return dax_iomap_pte_fault(vmf, ops);
-	case FAULT_FLAG_SIZE_PMD:
+	case PE_SIZE_PMD:
 		return dax_iomap_pmd_fault(vmf, ops);
 	default:
 		return VM_FAULT_FALLBACK;
fs/ext2/file.c  +1 -1

···
 	}
 	down_read(&ei->dax_sem);
 
-	ret = dax_iomap_fault(vmf, &ext2_iomap_ops);
+	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
 
 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)
fs/ext4/file.c  +9 -3

···
 }
 
 #ifdef CONFIG_FS_DAX
-static int ext4_dax_fault(struct vm_fault *vmf)
+static int ext4_dax_huge_fault(struct vm_fault *vmf,
+		enum page_entry_size pe_size)
 {
 	int result;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
···
 		file_update_time(vmf->vma->vm_file);
 	}
 	down_read(&EXT4_I(inode)->i_mmap_sem);
-	result = dax_iomap_fault(vmf, &ext4_iomap_ops);
+	result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	if (write)
 		sb_end_pagefault(sb);
 
 	return result;
+}
+
+static int ext4_dax_fault(struct vm_fault *vmf)
+{
+	return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
 }
 
 /*
···
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
-	.huge_fault	= ext4_dax_fault,
+	.huge_fault	= ext4_dax_huge_fault,
 	.page_mkwrite	= ext4_dax_fault,
 	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
 };
fs/xfs/xfs_file.c  +5 -4

···
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (IS_DAX(inode)) {
-		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	} else {
 		ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
···
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode))
-		ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
 	else
 		ret = filemap_fault(vmf);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
···
  */
 STATIC int
 xfs_filemap_huge_fault(
-	struct vm_fault		*vmf)
+	struct vm_fault		*vmf,
+	enum page_entry_size	pe_size)
 {
 	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
···
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_fault(vmf, &xfs_iomap_ops);
+	ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
 	if (vmf->flags & FAULT_FLAG_WRITE)
include/linux/dax.h  +2 -1

···
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
-int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops);
+int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+		    const struct iomap_ops *ops);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
include/linux/mm.h  +8 -6

···
 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION	0x100	/* The fault was during an instruction fetch */
 
-#define FAULT_FLAG_SIZE_MASK	0x7000	/* Support up to 8-level page tables */
-#define FAULT_FLAG_SIZE_PTE	0x0000	/* First level (eg 4k) */
-#define FAULT_FLAG_SIZE_PMD	0x1000	/* Second level (eg 2MB) */
-#define FAULT_FLAG_SIZE_PUD	0x2000	/* Third level (eg 1GB) */
-
 #define FAULT_FLAG_TRACE \
 	{ FAULT_FLAG_WRITE,		"WRITE" }, \
 	{ FAULT_FLAG_MKWRITE,		"MKWRITE" }, \
···
 	 */
 };
 
+/* page entry size for vm->huge_fault() */
+enum page_entry_size {
+	PE_SIZE_PTE = 0,
+	PE_SIZE_PMD,
+	PE_SIZE_PUD,
+};
+
 /*
  * These are the virtual MM functions - opening of an area, closing and
  * unmapping it (needed to keep files on disk up-to-date etc), pointer
···
 	void (*close)(struct vm_area_struct * area);
 	int (*mremap)(struct vm_area_struct * area);
 	int (*fault)(struct vm_fault *vmf);
-	int (*huge_fault)(struct vm_fault *vmf);
+	int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size);
 	void (*map_pages)(struct vm_fault *vmf,
 			pgoff_t start_pgoff, pgoff_t end_pgoff);
 
mm/memory.c  +4 -13

···
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_anonymous_page(vmf);
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 	return VM_FAULT_FALLBACK;
 }
···
 	if (vma_is_anonymous(vmf->vma))
 		return do_huge_pmd_wp_page(vmf, orig_pmd);
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
···
 	if (vma_is_anonymous(vmf->vma))
 		return VM_FAULT_FALLBACK;
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 	return VM_FAULT_FALLBACK;
 }
···
 	if (vma_is_anonymous(vmf->vma))
 		return VM_FAULT_FALLBACK;
 	if (vmf->vma->vm_ops->huge_fault)
-		return vmf->vma->vm_ops->huge_fault(vmf);
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 	return VM_FAULT_FALLBACK;
 }
···
 	if (!vmf.pud)
 		return VM_FAULT_OOM;
 	if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
-		vmf.flags |= FAULT_FLAG_SIZE_PUD;
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
···
 		barrier();
 		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
 			unsigned int dirty = flags & FAULT_FLAG_WRITE;
-
-			vmf.flags |= FAULT_FLAG_SIZE_PUD;
 
 			/* NUMA case for anonymous PUDs would go here */
 
···
 	if (!vmf.pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
-		vmf.flags |= FAULT_FLAG_SIZE_PMD;
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
-		/* fall through path, remove PMD flag */
-		vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
 	} else {
 		pmd_t orig_pmd = *vmf.pmd;
 
 		barrier();
 		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
-			vmf.flags |= FAULT_FLAG_SIZE_PMD;
 			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
···
 			ret = wp_huge_pmd(&vmf, orig_pmd);
 			if (!(ret & VM_FAULT_FALLBACK))
 				return ret;
-			/* fall through path, remove PUD flag */
-			vmf.flags &= ~FAULT_FLAG_SIZE_PUD;
 		} else {
 			huge_pmd_set_accessed(&vmf, orig_pmd);
 			return 0;