Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: thp: fix /dev/zero MAP_PRIVATE and vm_flags cleanups

The huge_memory.c THP page fault was allowed to run if vm_ops was null
(which would succeed for /dev/zero MAP_PRIVATE, as the f_op->mmap wouldn't
setup a special vma->vm_ops and it would fallback to regular anonymous
memory), but other THP logic wasn't fully activated for vmas with a
non-NULL vm_file (/dev/zero has a non-NULL vma->vm_file).

So this removes the vm_file checks so that /dev/zero also can safely use
THP (the other albeit safer approach to fix this bug would have been to
prevent the THP initial page fault to run if vm_file was set).

After removing the vm_file checks, this also makes huge_memory.c stricter
in khugepaged for the DEBUG_VM=y case. It doesn't replace the vm_file
check with a is_pfn_mapping check (but it keeps checking for VM_PFNMAP
under VM_BUG_ON) because for a is_cow_mapping() mapping VM_PFNMAP should
only be allowed to exist before the first page fault, and in turn when
vma->anon_vma is null (so preventing khugepaged registration). So I tend
to think the previous comment — saying that if vm_file was set, VM_PFNMAP
might have been set and we could still be registered in khugepaged (even
though khugepaged registration requires a non-NULL anon_vma) — was too
paranoid.
The is_linear_pfn_mapping check is also I think superfluous (as described
by comment) but under DEBUG_VM it is safe to stay.

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=33682

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Caspar Zhang <bugs@casparzhang.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: <stable@kernel.org> [2.6.38.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by Andrea Arcangeli and committed by Linus Torvalds
78f11a25 6d4831c2

+27 -21
+1 -1
include/linux/huge_mm.h
···
117 117     unsigned long end,
118 118     long adjust_next)
119 119   {
120 -       if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
120 +       if (!vma->anon_vma || vma->vm_ops)
121 121     return;
122 122     __vma_adjust_trans_huge(vma, start, end, adjust_next);
123 123   }
+2 -1
include/linux/mm.h
···
137 137   #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ)
138 138
139 139   /*
140 -      * special vmas that are non-mergable, non-mlock()able
140 +      * Special vmas that are non-mergable, non-mlock()able.
141 +      * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
141 142    */
142 143   #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
143 144
+24 -19
mm/huge_memory.c
···
1408 1408   return ret;
1409 1409   }
1410 1410
1411 +      #define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
1412 +                         VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1413 +
1411 1414   int hugepage_madvise(struct vm_area_struct *vma,
1412 1415                        unsigned long *vm_flags, int advice)
1413 1416   {
···
1419 1416   /*
1420 1417    * Be somewhat over-protective like KSM for now!
1421 1418    */
1422 -      if (*vm_flags & (VM_HUGEPAGE |
1423 -                       VM_SHARED | VM_MAYSHARE |
1424 -                       VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1425 -                       VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1426 -                       VM_MIXEDMAP | VM_SAO))
1419 +      if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1427 1420   return -EINVAL;
1428 1421   *vm_flags &= ~VM_NOHUGEPAGE;
1429 1422   *vm_flags |= VM_HUGEPAGE;
···
1435 1436   /*
1436 1437    * Be somewhat over-protective like KSM for now!
1437 1438    */
1438 -      if (*vm_flags & (VM_NOHUGEPAGE |
1439 -                       VM_SHARED | VM_MAYSHARE |
1440 -                       VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1441 -                       VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1442 -                       VM_MIXEDMAP | VM_SAO))
1439 +      if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1443 1440   return -EINVAL;
1444 1441   *vm_flags &= ~VM_HUGEPAGE;
1445 1442   *vm_flags |= VM_NOHUGEPAGE;
···
1569 1574    * page fault if needed.
1570 1575    */
1571 1576   return 0;
1572 -      if (vma->vm_file || vma->vm_ops)
1577 +      if (vma->vm_ops)
1573 1578   /* khugepaged not yet working on file or special mappings */
1574 1579   return 0;
1575 -      VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1580 +      /*
1581 +       * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1582 +       * true too, verify it here.
1583 +       */
1584 +      VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1576 1585   hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1577 1586   hend = vma->vm_end & HPAGE_PMD_MASK;
1578 1587   if (hstart < hend)
···
1827 1828   (vma->vm_flags & VM_NOHUGEPAGE))
1828 1829   goto out;
1829 1830
1830 -      /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1831 -      if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1831 +      if (!vma->anon_vma || vma->vm_ops)
1832 1832   goto out;
1833 1833   if (is_vma_temporary_stack(vma))
1834 1834   goto out;
1835 -      VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1835 +      /*
1836 +       * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1837 +       * true too, verify it here.
1838 +       */
1839 +      VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1836 1840
1837 1841   pgd = pgd_offset(mm, address);
1838 1842   if (!pgd_present(*pgd))
···
2068 2066   progress++;
2069 2067   continue;
2070 2068   }
2071 -      /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
2072 -      if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
2069 +      if (!vma->anon_vma || vma->vm_ops)
2073 2070   goto skip;
2074 2071   if (is_vma_temporary_stack(vma))
2075 2072   goto skip;
2076 -
2077 -      VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
2073 +      /*
2074 +       * If is_pfn_mapping() is true is_learn_pfn_mapping()
2075 +       * must be true too, verify it here.
2076 +       */
2077 +      VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2078 +                vma->vm_flags & VM_NO_THP);
2078 2079
2079 2080   hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2080 2081   hend = vma->vm_end & HPAGE_PMD_MASK;