Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: remove remaining references to NUMA hinting bits and helpers

This patch removes the NUMA PTE bits and associated helpers. As a
side-effect it increases the maximum possible swap space on x86-64.

One potential source of problems is races between the marking of PTEs
PROT_NONE, NUMA hinting faults and migration. It must be guaranteed that
a PTE being protected is not faulted in parallel, seen as a pte_none and
corrupting memory. The base case is safe but transhuge had problems in
the past due to a different migration mechanism and a dependence on the
page lock to serialise migrations, and warrants a closer look.

task_work hinting update parallel fault
------------------------ --------------
change_pmd_range
change_huge_pmd
__pmd_trans_huge_lock
pmdp_get_and_clear
__handle_mm_fault
pmd_none
do_huge_pmd_anonymous_page
read? pmd_lock blocks until hinting complete, fail !pmd_none test
write? __do_huge_pmd_anonymous_page acquires pmd_lock, checks pmd_none
pmd_modify
set_pmd_at

task_work hinting update parallel migration
------------------------ ------------------
change_pmd_range
change_huge_pmd
__pmd_trans_huge_lock
pmdp_get_and_clear
__handle_mm_fault
do_huge_pmd_numa_page
migrate_misplaced_transhuge_page
pmd_lock waits for updates to complete, recheck pmd_same
pmd_modify
set_pmd_at

Both of those are safe and the case where a transhuge page is inserted
during a protection update is unchanged. The case where two processes try
migrating at the same time is unchanged by this series so should still be
ok. I could not find a case where we are accidentally depending on the
PTE not being cleared and flushed. If one is missed, it'll manifest as
corruption problems that start triggering shortly after this series is
merged and only happen when NUMA balancing is enabled.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Mel Gorman and committed by
Linus Torvalds
21d9ee3e 4d942466

+7 -283
+1 -53
arch/powerpc/include/asm/pgtable.h
··· 55 55 { 56 56 return pte_protnone(pmd_pte(pmd)); 57 57 } 58 - 59 - static inline int pte_present(pte_t pte) 60 - { 61 - return pte_val(pte) & _PAGE_NUMA_MASK; 62 - } 63 - 64 - #define pte_present_nonuma pte_present_nonuma 65 - static inline int pte_present_nonuma(pte_t pte) 66 - { 67 - return pte_val(pte) & (_PAGE_PRESENT); 68 - } 69 - 70 - #define ptep_set_numa ptep_set_numa 71 - static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, 72 - pte_t *ptep) 73 - { 74 - if ((pte_val(*ptep) & _PAGE_PRESENT) == 0) 75 - VM_BUG_ON(1); 76 - 77 - pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0); 78 - return; 79 - } 80 - 81 - #define pmdp_set_numa pmdp_set_numa 82 - static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, 83 - pmd_t *pmdp) 84 - { 85 - if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0) 86 - VM_BUG_ON(1); 87 - 88 - pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA); 89 - return; 90 - } 91 - 92 - /* 93 - * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist 94 - * which was inherited from x86. For the purposes of powerpc pte_basic_t and 95 - * pmd_t are equivalent 96 - */ 97 - #define pteval_t pte_basic_t 98 - #define pmdval_t pmd_t 99 - static inline pteval_t ptenuma_flags(pte_t pte) 100 - { 101 - return pte_val(pte) & _PAGE_NUMA_MASK; 102 - } 103 - 104 - static inline pmdval_t pmdnuma_flags(pmd_t pmd) 105 - { 106 - return pmd_val(pmd) & _PAGE_NUMA_MASK; 107 - } 108 - 109 - # else 58 + #endif /* CONFIG_NUMA_BALANCING */ 110 59 111 60 static inline int pte_present(pte_t pte) 112 61 { 113 62 return pte_val(pte) & _PAGE_PRESENT; 114 63 } 115 - #endif /* CONFIG_NUMA_BALANCING */ 116 64 117 65 /* Conversion functions: convert a page and protection to a page entry, 118 66 * and a page entry and page directory to the page they refer to.
-5
arch/powerpc/include/asm/pte-common.h
··· 104 104 _PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | \ 105 105 _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) 106 106 107 - #ifdef CONFIG_NUMA_BALANCING 108 - /* Mask of bits that distinguish present and numa ptes */ 109 - #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT) 110 - #endif 111 - 112 107 /* 113 108 * We define 2 sets of base prot bits, one for basic pages (ie, 114 109 * cacheable kernel and user pages) and one for non cacheable
-6
arch/powerpc/include/asm/pte-hash64.h
··· 27 27 #define _PAGE_RW 0x0200 /* software: user write access allowed */ 28 28 #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */ 29 29 30 - /* 31 - * Used for tracking numa faults 32 - */ 33 - #define _PAGE_NUMA 0x00000010 /* Gather numa placement stats */ 34 - 35 - 36 30 /* No separate kernel read-only */ 37 31 #define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ 38 32 #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW
+3 -19
arch/x86/include/asm/pgtable.h
··· 300 300 301 301 static inline pmd_t pmd_mknotpresent(pmd_t pmd) 302 302 { 303 - return pmd_clear_flags(pmd, _PAGE_PRESENT); 303 + return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); 304 304 } 305 305 306 306 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY ··· 443 443 444 444 static inline int pte_present(pte_t a) 445 445 { 446 - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | 447 - _PAGE_NUMA); 448 - } 449 - 450 - #define pte_present_nonuma pte_present_nonuma 451 - static inline int pte_present_nonuma(pte_t a) 452 - { 453 446 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 454 447 } 455 448 ··· 452 459 if (pte_flags(a) & _PAGE_PRESENT) 453 460 return true; 454 461 455 - if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && 462 + if ((pte_flags(a) & _PAGE_PROTNONE) && 456 463 mm_tlb_flush_pending(mm)) 457 464 return true; 458 465 ··· 472 479 * the _PAGE_PSE flag will remain set at all times while the 473 480 * _PAGE_PRESENT bit is clear). 474 481 */ 475 - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | 476 - _PAGE_NUMA); 482 + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); 477 483 } 478 484 479 485 #ifdef CONFIG_NUMA_BALANCING ··· 547 555 548 556 static inline int pmd_bad(pmd_t pmd) 549 557 { 550 - #ifdef CONFIG_NUMA_BALANCING 551 - /* pmd_numa check */ 552 - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) 553 - return 0; 554 - #endif 555 558 return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; 556 559 } 557 560 ··· 865 878 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 866 879 static inline pte_t pte_swp_mksoft_dirty(pte_t pte) 867 880 { 868 - VM_BUG_ON(pte_present_nonuma(pte)); 869 881 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); 870 882 } 871 883 872 884 static inline int pte_swp_soft_dirty(pte_t pte) 873 885 { 874 - VM_BUG_ON(pte_present_nonuma(pte)); 875 886 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; 876 887 } 877 888 878 889 static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) 879 
890 { 880 - VM_BUG_ON(pte_present_nonuma(pte)); 881 891 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); 882 892 } 883 893 #endif
-5
arch/x86/include/asm/pgtable_64.h
··· 142 142 143 143 /* Encode and de-code a swap entry */ 144 144 #define SWP_TYPE_BITS 5 145 - #ifdef CONFIG_NUMA_BALANCING 146 - /* Automatic NUMA balancing needs to be distinguishable from swap entries */ 147 - #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) 148 - #else 149 145 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) 150 - #endif 151 146 152 147 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) 153 148
+2 -39
arch/x86/include/asm/pgtable_types.h
··· 27 27 #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ 28 28 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 29 29 30 - /* 31 - * Swap offsets on configurations that allow automatic NUMA balancing use the 32 - * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from 33 - * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the 34 - * maximum possible swap space from 16TB to 8TB. 35 - */ 36 - #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) 37 - 38 30 /* If _PAGE_BIT_PRESENT is clear, we use these: */ 39 31 /* - if the user mapped it with PROT_NONE; pte_present gives true */ 40 32 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL ··· 68 76 #endif 69 77 70 78 /* 71 - * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page 72 - * that is not present. The hinting fault gathers numa placement statistics 73 - * (see pte_numa()). The bit is always zero when the PTE is not present. 74 - * 75 - * The bit picked must be always zero when the pmd is present and not 76 - * present, so that we don't lose information when we set it while 77 - * atomically clearing the present bit. 78 - */ 79 - #ifdef CONFIG_NUMA_BALANCING 80 - #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) 81 - #else 82 - #define _PAGE_NUMA (_AT(pteval_t, 0)) 83 - #endif 84 - 85 - /* 86 79 * Tracking soft dirty bit when a page goes to a swap is tricky. 87 80 * We need a bit which can be stored in pte _and_ not conflict 88 81 * with swap entry format. 
On x86 bits 6 and 7 are *not* involved ··· 99 122 /* Set of bits not changed in pte_modify */ 100 123 #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 101 124 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ 102 - _PAGE_SOFT_DIRTY | _PAGE_NUMA) 103 - #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) 125 + _PAGE_SOFT_DIRTY) 126 + #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) 104 127 105 128 /* 106 129 * The cache modes defined here are used to translate between pure SW usage ··· 300 323 { 301 324 return native_pte_val(pte) & PTE_FLAGS_MASK; 302 325 } 303 - 304 - #ifdef CONFIG_NUMA_BALANCING 305 - /* Set of bits that distinguishes present, prot_none and numa ptes */ 306 - #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) 307 - static inline pteval_t ptenuma_flags(pte_t pte) 308 - { 309 - return pte_flags(pte) & _PAGE_NUMA_MASK; 310 - } 311 - 312 - static inline pmdval_t pmdnuma_flags(pmd_t pmd) 313 - { 314 - return pmd_flags(pmd) & _PAGE_NUMA_MASK; 315 - } 316 - #endif /* CONFIG_NUMA_BALANCING */ 317 326 318 327 #define pgprot_val(x) ((x).pgprot) 319 328 #define __pgprot(x) ((pgprot_t) { (x) } )
-155
include/asm-generic/pgtable.h
··· 244 244 # define pte_accessible(mm, pte) ((void)(pte), 1) 245 245 #endif 246 246 247 - #ifndef pte_present_nonuma 248 - #define pte_present_nonuma(pte) pte_present(pte) 249 - #endif 250 - 251 247 #ifndef flush_tlb_fix_spurious_fault 252 248 #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) 253 249 #endif ··· 686 690 static inline int pmd_protnone(pmd_t pmd) 687 691 { 688 692 return 0; 689 - } 690 - #endif /* CONFIG_NUMA_BALANCING */ 691 - 692 - #ifdef CONFIG_NUMA_BALANCING 693 - /* 694 - * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that 695 - * is protected for PROT_NONE and a NUMA hinting fault entry. If the 696 - * architecture defines __PAGE_PROTNONE then it should take that into account 697 - * but those that do not can rely on the fact that the NUMA hinting scanner 698 - * skips inaccessible VMAs. 699 - * 700 - * pte/pmd_present() returns true if pte/pmd_numa returns true. Page 701 - * fault triggers on those regions if pte/pmd_numa returns true 702 - * (because _PAGE_PRESENT is not set). 703 - */ 704 - #ifndef pte_numa 705 - static inline int pte_numa(pte_t pte) 706 - { 707 - return ptenuma_flags(pte) == _PAGE_NUMA; 708 - } 709 - #endif 710 - 711 - #ifndef pmd_numa 712 - static inline int pmd_numa(pmd_t pmd) 713 - { 714 - return pmdnuma_flags(pmd) == _PAGE_NUMA; 715 - } 716 - #endif 717 - 718 - /* 719 - * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically 720 - * because they're called by the NUMA hinting minor page fault. If we 721 - * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler 722 - * would be forced to set it later while filling the TLB after we 723 - * return to userland. That would trigger a second write to memory 724 - * that we optimize away by setting _PAGE_ACCESSED here. 
725 - */ 726 - #ifndef pte_mknonnuma 727 - static inline pte_t pte_mknonnuma(pte_t pte) 728 - { 729 - pteval_t val = pte_val(pte); 730 - 731 - val &= ~_PAGE_NUMA; 732 - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); 733 - return __pte(val); 734 - } 735 - #endif 736 - 737 - #ifndef pmd_mknonnuma 738 - static inline pmd_t pmd_mknonnuma(pmd_t pmd) 739 - { 740 - pmdval_t val = pmd_val(pmd); 741 - 742 - val &= ~_PAGE_NUMA; 743 - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); 744 - 745 - return __pmd(val); 746 - } 747 - #endif 748 - 749 - #ifndef pte_mknuma 750 - static inline pte_t pte_mknuma(pte_t pte) 751 - { 752 - pteval_t val = pte_val(pte); 753 - 754 - VM_BUG_ON(!(val & _PAGE_PRESENT)); 755 - 756 - val &= ~_PAGE_PRESENT; 757 - val |= _PAGE_NUMA; 758 - 759 - return __pte(val); 760 - } 761 - #endif 762 - 763 - #ifndef ptep_set_numa 764 - static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, 765 - pte_t *ptep) 766 - { 767 - pte_t ptent = *ptep; 768 - 769 - ptent = pte_mknuma(ptent); 770 - set_pte_at(mm, addr, ptep, ptent); 771 - return; 772 - } 773 - #endif 774 - 775 - #ifndef pmd_mknuma 776 - static inline pmd_t pmd_mknuma(pmd_t pmd) 777 - { 778 - pmdval_t val = pmd_val(pmd); 779 - 780 - val &= ~_PAGE_PRESENT; 781 - val |= _PAGE_NUMA; 782 - 783 - return __pmd(val); 784 - } 785 - #endif 786 - 787 - #ifndef pmdp_set_numa 788 - static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, 789 - pmd_t *pmdp) 790 - { 791 - pmd_t pmd = *pmdp; 792 - 793 - pmd = pmd_mknuma(pmd); 794 - set_pmd_at(mm, addr, pmdp, pmd); 795 - return; 796 - } 797 - #endif 798 - #else 799 - static inline int pmd_numa(pmd_t pmd) 800 - { 801 - return 0; 802 - } 803 - 804 - static inline int pte_numa(pte_t pte) 805 - { 806 - return 0; 807 - } 808 - 809 - static inline pte_t pte_mknonnuma(pte_t pte) 810 - { 811 - return pte; 812 - } 813 - 814 - static inline pmd_t pmd_mknonnuma(pmd_t pmd) 815 - { 816 - return pmd; 817 - } 818 - 819 - static inline pte_t pte_mknuma(pte_t pte) 820 - { 
821 - return pte; 822 - } 823 - 824 - static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, 825 - pte_t *ptep) 826 - { 827 - return; 828 - } 829 - 830 - 831 - static inline pmd_t pmd_mknuma(pmd_t pmd) 832 - { 833 - return pmd; 834 - } 835 - 836 - static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, 837 - pmd_t *pmdp) 838 - { 839 - return ; 840 693 } 841 694 #endif /* CONFIG_NUMA_BALANCING */ 842 695
+1 -1
include/linux/swapops.h
··· 54 54 /* check whether a pte points to a swap entry */ 55 55 static inline int is_swap_pte(pte_t pte) 56 56 { 57 - return !pte_none(pte) && !pte_present_nonuma(pte); 57 + return !pte_none(pte) && !pte_present(pte); 58 58 } 59 59 #endif 60 60