Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/mm: Differentiate between hugetlb and THP during page walk

We need to properly identify whether a hugepage is an explicit or
a transparent hugepage in follow_huge_addr(). We used to depend
on the hugepage shift argument to do that. But in some cases that can
produce wrong results. For example:

On finding a transparent hugepage we set hugepage shift to PMD_SHIFT.
But we can end up clearing the thp pte, via pmdp_huge_get_and_clear.
We do prevent reuse of the pfn page by using kick_all_cpus_sync().
But that happens only after we have updated the pte to 0.
Hence in follow_huge_addr() we can find the hugepage shift set, yet the
transparent huge page check fails for a thp pte.

NOTE: We fixed a variant of this race against thp split in commit
691e95fd7396905a38d98919e9c150dbc3ea21a3
("powerpc/mm/thp: Make page table walk safe against thp split/collapse")

Without this patch, we may hit the BUG_ON(flags & FOLL_GET) in
follow_page_mask occasionally.

In the long term, we may want to switch ppc64 64k page size config to
enable CONFIG_ARCH_WANT_GENERAL_HUGETLB

Reported-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

authored by

Aneesh Kumar K.V and committed by
Michael Ellerman
891121e6 ec2640b1

+49 -24
+1
arch/powerpc/include/asm/mmu-hash64.h
··· 14 14 15 15 #include <asm/asm-compat.h> 16 16 #include <asm/page.h> 17 + #include <asm/bug.h> 17 18 18 19 /* 19 20 * This is necessary to get the definition of PGTABLE_RANGE which we
+9 -1
arch/powerpc/include/asm/pgtable-ppc64.h
··· 437 437 438 438 } 439 439 440 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 440 441 extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, 441 442 pmd_t *pmdp, unsigned long old_pmd); 442 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 443 443 extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); 444 444 extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); 445 445 extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); ··· 479 479 } 480 480 481 481 extern int has_transparent_hugepage(void); 482 + #else 483 + static inline void hpte_do_hugepage_flush(struct mm_struct *mm, 484 + unsigned long addr, pmd_t *pmdp, 485 + unsigned long old_pmd) 486 + { 487 + 488 + WARN(1, "%s called with THP disabled\n", __func__); 489 + } 482 490 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 483 491 484 492 static inline int pmd_large(pmd_t pmd)
+3 -3
arch/powerpc/include/asm/pgtable.h
··· 259 259 #define has_transparent_hugepage() 0 260 260 #endif 261 261 pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 262 - unsigned *shift); 262 + bool *is_thp, unsigned *shift); 263 263 static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 264 - unsigned *shift) 264 + bool *is_thp, unsigned *shift) 265 265 { 266 266 if (!arch_irqs_disabled()) { 267 267 pr_info("%s called with irq enabled\n", __func__); 268 268 dump_stack(); 269 269 } 270 - return __find_linux_pte_or_hugepte(pgdir, ea, shift); 270 + return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); 271 271 } 272 272 #endif /* __ASSEMBLY__ */ 273 273
+2 -1
arch/powerpc/kernel/eeh.c
··· 351 351 * worried about _PAGE_SPLITTING/collapse. Also we will not hit 352 352 * page table free, because of init_mm. 353 353 */ 354 - ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift); 354 + ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, 355 + NULL, &hugepage_shift); 355 356 if (!ptep) 356 357 return token; 357 358 WARN_ON(hugepage_shift);
+1 -1
arch/powerpc/kernel/io-workarounds.c
··· 76 76 * a page table free due to init_mm 77 77 */ 78 78 ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr, 79 - &hugepage_shift); 79 + NULL, &hugepage_shift); 80 80 if (ptep == NULL) 81 81 paddr = 0; 82 82 else {
+1 -1
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 543 543 */ 544 544 local_irq_save(flags); 545 545 ptep = find_linux_pte_or_hugepte(current->mm->pgd, 546 - hva, NULL); 546 + hva, NULL, NULL); 547 547 if (ptep) { 548 548 pte = kvmppc_read_update_linux_pte(ptep, 1); 549 549 if (pte_write(pte))
+5 -3
arch/powerpc/kvm/book3s_hv_rm_mmu.c
··· 32 32 * So don't worry about THP collapse/split. Called 33 33 * Only in realmode, hence won't need irq_save/restore. 34 34 */ 35 - p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); 35 + p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL, NULL); 36 36 if (!p || !pte_present(*p)) 37 37 return NULL; 38 38 addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); ··· 221 221 * retry via mmu_notifier_retry. 222 222 */ 223 223 if (realmode) 224 - ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); 224 + ptep = __find_linux_pte_or_hugepte(pgdir, hva, NULL, 225 + &hpage_shift); 225 226 else { 226 227 local_irq_save(irq_flags); 227 - ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); 228 + ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, 229 + &hpage_shift); 228 230 } 229 231 if (ptep) { 230 232 pte_t pte;
+1 -1
arch/powerpc/kvm/e500_mmu_host.c
··· 476 476 * can't run hence pfn won't change. 477 477 */ 478 478 local_irq_save(flags); 479 - ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL); 479 + ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, NULL); 480 480 if (ptep) { 481 481 pte_t pte = READ_ONCE(*ptep); 482 482
+4 -3
arch/powerpc/mm/hash_utils_64.c
··· 994 994 unsigned long access, unsigned long trap, 995 995 unsigned long flags) 996 996 { 997 + bool is_thp; 997 998 enum ctx_state prev_state = exception_enter(); 998 999 pgd_t *pgdir; 999 1000 unsigned long vsid; ··· 1069 1068 #endif /* CONFIG_PPC_64K_PAGES */ 1070 1069 1071 1070 /* Get PTE and page size from page tables */ 1072 - ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); 1071 + ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift); 1073 1072 if (ptep == NULL || !pte_present(*ptep)) { 1074 1073 DBG_LOW(" no PTE !\n"); 1075 1074 rc = 1; ··· 1089 1088 } 1090 1089 1091 1090 if (hugeshift) { 1092 - if (pmd_trans_huge(*(pmd_t *)ptep)) 1091 + if (is_thp) 1093 1092 rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, 1094 1093 trap, flags, ssize, psize); 1095 1094 #ifdef CONFIG_HUGETLB_PAGE ··· 1244 1243 * THP pages use update_mmu_cache_pmd. We don't do 1245 1244 * hash preload there. Hence can ignore THP here 1246 1245 */ 1247 - ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); 1246 + ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift); 1248 1247 if (!ptep) 1249 1248 goto out_exit; 1250 1249
+16 -5
arch/powerpc/mm/hugetlbpage.c
··· 128 128 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 129 129 { 130 130 /* Only called for hugetlbfs pages, hence can ignore THP */ 131 - return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL); 131 + return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); 132 132 } 133 133 134 134 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, ··· 703 703 struct page * 704 704 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 705 705 { 706 + bool is_thp; 706 707 pte_t *ptep, pte; 707 708 unsigned shift; 708 709 unsigned long mask, flags; 709 710 struct page *page = ERR_PTR(-EINVAL); 710 711 711 712 local_irq_save(flags); 712 - ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); 713 + ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift); 713 714 if (!ptep) 714 715 goto no_page; 715 716 pte = READ_ONCE(*ptep); ··· 719 718 * Transparent hugepages are handled by generic code. We can skip them 720 719 * here. 721 720 */ 722 - if (!shift || pmd_trans_huge(__pmd(pte_val(pte)))) 721 + if (!shift || is_thp) 723 722 goto no_page; 724 723 725 724 if (!pte_present(pte)) { ··· 976 975 */ 977 976 978 977 pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 979 - unsigned *shift) 978 + bool *is_thp, unsigned *shift) 980 979 { 981 980 pgd_t pgd, *pgdp; 982 981 pud_t pud, *pudp; ··· 987 986 988 987 if (shift) 989 988 *shift = 0; 989 + 990 + if (is_thp) 991 + *is_thp = false; 990 992 991 993 pgdp = pgdir + pgd_index(ea); 992 994 pgd = READ_ONCE(*pgdp); ··· 1038 1034 if (pmd_none(pmd)) 1039 1035 return NULL; 1040 1036 1041 - if (pmd_huge(pmd) || pmd_large(pmd)) { 1037 + if (pmd_trans_huge(pmd)) { 1038 + if (is_thp) 1039 + *is_thp = true; 1040 + ret_pte = (pte_t *) pmdp; 1041 + goto out; 1042 + } 1043 + 1044 + if (pmd_huge(pmd)) { 1042 1045 ret_pte = (pte_t *) pmdp; 1043 1046 goto out; 1044 1047 } else if (is_hugepd(__hugepd(pmd_val(pmd))))
+5 -4
arch/powerpc/mm/tlb_hash64.c
··· 190 190 void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, 191 191 unsigned long end) 192 192 { 193 + bool is_thp; 193 194 int hugepage_shift; 194 195 unsigned long flags; 195 196 ··· 209 208 local_irq_save(flags); 210 209 arch_enter_lazy_mmu_mode(); 211 210 for (; start < end; start += PAGE_SIZE) { 212 - pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, 211 + pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp, 213 212 &hugepage_shift); 214 213 unsigned long pte; 215 214 216 215 if (ptep == NULL) 217 216 continue; 218 217 pte = pte_val(*ptep); 219 - if (hugepage_shift) 218 + if (is_thp) 220 219 trace_hugepage_invalidate(start, pte); 221 220 if (!(pte & _PAGE_HASHPTE)) 222 221 continue; 223 - if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) 222 + if (unlikely(is_thp)) 224 223 hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); 225 224 else 226 - hpte_need_flush(mm, start, ptep, pte, 0); 225 + hpte_need_flush(mm, start, ptep, pte, hugepage_shift); 227 226 } 228 227 arch_leave_lazy_mmu_mode(); 229 228 local_irq_restore(flags);
+1 -1
arch/powerpc/perf/callchain.c
··· 127 127 return -EFAULT; 128 128 129 129 local_irq_save(flags); 130 - ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift); 130 + ptep = find_linux_pte_or_hugepte(pgdir, addr, NULL, &shift); 131 131 if (!ptep) 132 132 goto err_out; 133 133 if (!shift)