
thp: khugepaged: make khugepaged aware about madvise

MADV_HUGEPAGE and MADV_NOHUGEPAGE were fully effective only if run after
mmap and before touching the memory. While this is enough for most
usages, it takes little effort to make madvise more dynamic at runtime
on an existing mapping by making khugepaged aware of madvise.

MADV_HUGEPAGE: register the vma in khugepaged immediately, without
waiting for a page fault (which may never happen if all pages are
already mapped and the "enabled" knob was set to madvise during the
initial page faults).

MADV_NOHUGEPAGE: skip vmas marked VM_NOHUGEPAGE in khugepaged to stop
collapsing pages where not needed.
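
As a rough userspace illustration of the two advice modes above (not
part of this commit, and assuming /sys/kernel/mm/transparent_hugepage/enabled
is set to madvise), the sketch below faults in a mapping first and only
then calls madvise(), which is exactly the case this patch makes
effective without further page faults:

/*
 * Hypothetical userspace sketch; needs a kernel with THP madvise
 * support and libc headers defining MADV_HUGEPAGE/MADV_NOHUGEPAGE.
 */
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

#define LEN (16UL * 1024 * 1024)

int main(void)
{
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 1, LEN);	/* fault in every page up front */

	/*
	 * With this patch, the vma is registered in khugepaged right
	 * here; before it, collapse could only be triggered by a later
	 * page fault that this fully-mapped range would never take.
	 */
	if (madvise(p, LEN, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	/* The reverse: VM_NOHUGEPAGE stops khugepaged collapsing here. */
	if (madvise(p, LEN, MADV_NOHUGEPAGE))
		perror("madvise(MADV_NOHUGEPAGE)");

	munmap(p, LEN);
	return 0;
}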

[akpm@linux-foundation.org: tweak comment]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Andrea Arcangeli, committed by Linus Torvalds
commit 60ab3244 (parent a664b2d8)

3 files changed: +24 -7
include/linux/huge_mm.h (+4 -2)
···
 #if HPAGE_PMD_ORDER > MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
 #endif
-extern int hugepage_madvise(unsigned long *vm_flags, int advice);
+extern int hugepage_madvise(struct vm_area_struct *vma,
+			    unsigned long *vm_flags, int advice);
 extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
 				    unsigned long start,
 				    unsigned long end,
···
 	do { } while (0)
 #define wait_split_huge_page(__anon_vma, __pmd)	\
 	do { } while (0)
-static inline int hugepage_madvise(unsigned long *vm_flags, int advice)
+static inline int hugepage_madvise(struct vm_area_struct *vma,
+				   unsigned long *vm_flags, int advice)
 {
 	BUG();
 	return 0;
mm/huge_memory.c (+19 -4)
···
 	return ret;
 }

-int hugepage_madvise(unsigned long *vm_flags, int advice)
+int hugepage_madvise(struct vm_area_struct *vma,
+		     unsigned long *vm_flags, int advice)
 {
 	switch (advice) {
 	case MADV_HUGEPAGE:
···
 			return -EINVAL;
 		*vm_flags &= ~VM_NOHUGEPAGE;
 		*vm_flags |= VM_HUGEPAGE;
+		/*
+		 * If the vma become good for khugepaged to scan,
+		 * register it here without waiting a page fault that
+		 * may not happen any time soon.
+		 */
+		if (unlikely(khugepaged_enter_vma_merge(vma)))
+			return -ENOMEM;
 		break;
 	case MADV_NOHUGEPAGE:
 		/*
···
 			return -EINVAL;
 		*vm_flags &= ~VM_HUGEPAGE;
 		*vm_flags |= VM_NOHUGEPAGE;
+		/*
+		 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+		 * this vma even if we leave the mm registered in khugepaged if
+		 * it got registered before VM_NOHUGEPAGE was set.
+		 */
 		break;
 	}

···
 	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
 		goto out;

-	if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+	    (vma->vm_flags & VM_NOHUGEPAGE))
 		goto out;

 	/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
···
 			break;
 		}

-		if (!(vma->vm_flags & VM_HUGEPAGE) &&
-		    !khugepaged_always()) {
+		if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+		     !khugepaged_always()) ||
+		    (vma->vm_flags & VM_NOHUGEPAGE)) {
 			progress++;
 			continue;
 		}
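
Distilled for clarity, the eligibility test the hunks above add in two
places (the collapse path and the khugepaged scan loop) reduces to the
following hypothetical helper (not in the patch): VM_NOHUGEPAGE takes
precedence even when the global knob is set to always.

/*
 * Hypothetical helper (not in the patch) restating the check added
 * above: a vma is skipped if VM_NOHUGEPAGE is set, regardless of the
 * global "enabled" setting; otherwise it must carry VM_HUGEPAGE or
 * the global setting must be always.
 */
static bool vma_eligible_for_collapse(unsigned long vm_flags,
				      bool always_enabled)
{
	if (vm_flags & VM_NOHUGEPAGE)
		return false;
	return (vm_flags & VM_HUGEPAGE) || always_enabled;
}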
mm/madvise.c (+1 -1)
···
 		break;
 	case MADV_HUGEPAGE:
 	case MADV_NOHUGEPAGE:
-		error = hugepage_madvise(&new_flags, behavior);
+		error = hugepage_madvise(vma, &new_flags, behavior);
 		if (error)
 			goto out;
 		break;