Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: Use ptep/pmdp_set_numa() for updating _PAGE_NUMA bit

Architectures like ppc64 don't do a TLB flush in the set_pte/pmd functions when
using a hash-table MMU, for various reasons (the flush is handled as part of
the PTE modification when necessary).

ppc64 thus doesn't implement flush_tlb_range for hash based MMUs.

Additionally, ppc64 requires the TLB flushing to be batched within the PTL lock.

The reason to do that is to ensure that the hash page table stays in sync with
the Linux page table.

We track the HPTE index in the Linux PTE, and if we clear it without flushing
the hash table and then drop the PTL lock, another CPU can update the PTE and
we can end up with a duplicate entry in the hash table, which is fatal.

We also want to keep set_pte_at() simple by not requiring it to do a hash
flush, for performance reasons. We do that by assuming that set_pte_at() is
never *ever* called on a PTE that is already valid.

This was the case until the NUMA code went in which broke that assumption.

Fix that by introducing a new pair of helpers to set _PAGE_NUMA in a
way similar to ptep/pmdp_set_wrprotect(), with a generic implementation
using set_pte_at() and a powerpc specific one using the appropriate
mechanism needed to keep the hash table in sync.

Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Authored by Aneesh Kumar K.V; committed by Benjamin Herrenschmidt.
Commit: 56eecdb9 (parent: 9d85d586)

+64 -10
+22
arch/powerpc/include/asm/pgtable.h
@@ -75,9 +75,31 @@
 	return pte;
 }
 
+#define ptep_set_numa ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pte_t *ptep)
+{
+	if ((pte_val(*ptep) & _PAGE_PRESENT) == 0)
+		VM_BUG_ON(1);
+
+	pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
+	return;
+}
+
 #define pmd_numa pmd_numa
 static inline int pmd_numa(pmd_t pmd)
 {
 	return pte_numa(pmd_pte(pmd));
+}
+
+#define pmdp_set_numa pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pmd_t *pmdp)
+{
+	if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0)
+		VM_BUG_ON(1);
+
+	pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
+	return;
 }
 
 #define pmd_mknonnuma pmd_mknonnuma
+39
include/asm-generic/pgtable.h
@@ -701,6 +701,18 @@
 }
 #endif
 
+#ifndef ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pte_t *ptep)
+{
+	pte_t ptent = *ptep;
+
+	ptent = pte_mknuma(ptent);
+	set_pte_at(mm, addr, ptep, ptent);
+	return;
+}
+#endif
+
 #ifndef pmd_mknuma
 static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
 	pmd = pmd_set_flags(pmd, _PAGE_NUMA);
 	return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+#endif
+
+#ifndef pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pmd_t *pmdp)
+{
+	pmd_t pmd = *pmdp;
+
+	pmd = pmd_mknuma(pmd);
+	set_pmd_at(mm, addr, pmdp, pmd);
+	return;
 }
 #endif
 #else
@@ -715,6 +739,8 @@
 extern pmd_t pmd_mknonnuma(pmd_t pmd);
 extern pte_t pte_mknuma(pte_t pte);
 extern pmd_t pmd_mknuma(pmd_t pmd);
+extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp);
 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 #else
 static inline int pmd_numa(pmd_t pmd)
@@ -742,9 +768,22 @@
 	return pte;
 }
 
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pte_t *ptep)
+{
+	return;
+}
+
+
 static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
 	return pmd;
+}
+
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+				 pmd_t *pmdp)
+{
+	return ;
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+2 -7
mm/huge_memory.c
@@ -1545,7 +1545,8 @@
 		entry = pmd_mknonnuma(entry);
 		entry = pmd_modify(entry, newprot);
 		ret = HPAGE_PMD_NR;
+		set_pmd_at(mm, addr, pmd, entry);
 		BUG_ON(pmd_write(entry));
 	} else {
 		struct page *page = pmd_page(*pmd);
@@ -1557,17 +1558,10 @@
 		 */
 		if (!is_huge_zero_page(page) &&
 		    !pmd_numa(*pmd)) {
-			entry = *pmd;
-			entry = pmd_mknuma(entry);
+			pmdp_set_numa(mm, addr, pmd);
 			ret = HPAGE_PMD_NR;
 		}
 	}
-
-	/* Set PMD if cleared earlier */
-	if (ret == HPAGE_PMD_NR)
-		set_pmd_at(mm, addr, pmd, entry);
-
 	spin_unlock(ptl);
 }
 
+1 -3
mm/mprotect.c
@@ -69,12 +69,10 @@
 		} else {
 			struct page *page;
 
-			ptent = *pte;
 			page = vm_normal_page(vma, addr, oldpte);
 			if (page && !PageKsm(page)) {
 				if (!pte_numa(oldpte)) {
-					ptent = pte_mknuma(ptent);
-					set_pte_at(mm, addr, pte, ptent);
+					ptep_set_numa(mm, addr, pte);
 					updated = true;
 				}
 			}