Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/khugepaged: record SCAN_PMD_MAPPED when scan_pmd() finds hugepage

When scanning an anon pmd to see if it's eligible for collapse, return
SCAN_PMD_MAPPED if the pmd already maps a hugepage. Note that
SCAN_PMD_MAPPED is different from SCAN_PAGE_COMPOUND used in the
file-collapse path, since the latter might identify pte-mapped compound
pages. This is required by MADV_COLLAPSE which necessarily needs to know
what hugepage-aligned/sized regions are already pmd-mapped.

In order to determine if a pmd already maps a hugepage, refactor
mm_find_pmd():

Return mm_find_pmd() to its pre-commit f72e7dcdd252 ("mm: let mm_find_pmd
fix buggy race with THP fault") behavior. ksm was the only caller that
explicitly wanted a pte-mapping pmd, so open code the pte-mapping logic
there (pmd_present() and pmd_trans_huge() checks).

Undo revert change in commit f72e7dcdd252 ("mm: let mm_find_pmd fix buggy
race with THP fault") that open-coded split_huge_pmd_address() pmd lookup
and use mm_find_pmd() instead.

Link: https://lkml.kernel.org/r/20220706235936.2197195-9-zokeefe@google.com
Signed-off-by: Zach O'Keefe <zokeefe@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Chris Kennelly <ckennelly@google.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Pavel Begunkov <asml.silence@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rongwei Wang <rongwei.wang@linux.alibaba.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: "Souptick Joarder (HPE)" <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Zach O'Keefe and committed by
Andrew Morton
50722804 a7f4e6e4

+67 -39
+1
include/trace/events/huge_memory.h
@@ -11,6 +11,7 @@
 	EM( SCAN_FAIL, "failed") \
 	EM( SCAN_SUCCEED, "succeeded") \
 	EM( SCAN_PMD_NULL, "pmd_null") \
+	EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \
 	EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
 	EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
 	EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \
+2 -16
mm/huge_memory.c
@@ -2286,24 +2286,10 @@
 void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 		bool freeze, struct folio *folio)
 {
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
+	pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
 
-	pgd = pgd_offset(vma->vm_mm, address);
-	if (!pgd_present(*pgd))
+	if (!pmd)
 		return;
-
-	p4d = p4d_offset(pgd, address);
-	if (!p4d_present(*p4d))
-		return;
-
-	pud = pud_offset(p4d, address);
-	if (!pud_present(*pud))
-		return;
-
-	pmd = pmd_offset(pud, address);
 
 	__split_huge_pmd(vma, pmd, address, freeze, folio);
 }
+1 -1
mm/internal.h
@@ -187,7 +187,7 @@
 /*
  * in mm/rmap.c:
  */
-extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
 
 /*
  * in mm/page_alloc.c
+48 -12
mm/khugepaged.c
@@ -28,6 +28,7 @@
 	SCAN_FAIL,
 	SCAN_SUCCEED,
 	SCAN_PMD_NULL,
+	SCAN_PMD_MAPPED,
 	SCAN_EXCEED_NONE_PTE,
 	SCAN_EXCEED_SWAP_PTE,
 	SCAN_EXCEED_SHARED_PTE,
@@ -878,6 +877,45 @@
 	return SCAN_SUCCEED;
 }
 
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+				   unsigned long address,
+				   pmd_t **pmd)
+{
+	pmd_t pmde;
+
+	*pmd = mm_find_pmd(mm, address);
+	if (!*pmd)
+		return SCAN_PMD_NULL;
+
+	pmde = pmd_read_atomic(*pmd);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/* See comments in pmd_none_or_trans_huge_or_clear_bad() */
+	barrier();
+#endif
+	if (!pmd_present(pmde))
+		return SCAN_PMD_NULL;
+	if (pmd_trans_huge(pmde))
+		return SCAN_PMD_MAPPED;
+	if (pmd_bad(pmde))
+		return SCAN_PMD_NULL;
+	return SCAN_SUCCEED;
+}
+
+static int check_pmd_still_valid(struct mm_struct *mm,
+				 unsigned long address,
+				 pmd_t *pmd)
+{
+	pmd_t *new_pmd;
+	int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
+
+	if (result != SCAN_SUCCEED)
+		return result;
+	if (new_pmd != pmd)
+		return SCAN_FAIL;
+	return SCAN_SUCCEED;
+}
+
 /*
  * Bring missing pages in from swap, to complete THP collapse.
  * Only done if khugepaged_scan_pmd believes it is worthwhile.
@@ -1028,9 +988,8 @@
 		goto out_nolock;
 	}
 
-	pmd = mm_find_pmd(mm, address);
-	if (!pmd) {
-		result = SCAN_PMD_NULL;
+	result = find_pmd_or_thp_or_none(mm, address, &pmd);
+	if (result != SCAN_SUCCEED) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
 	}
@@ -1057,7 +1018,8 @@
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 	/* check if the pmd is still valid */
-	if (mm_find_pmd(mm, address) != pmd)
+	result = check_pmd_still_valid(mm, address, pmd);
+	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 
 	anon_vma_lock_write(vma->anon_vma);
@@ -1161,11 +1121,9 @@
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
-	pmd = mm_find_pmd(mm, address);
-	if (!pmd) {
-		result = SCAN_PMD_NULL;
+	result = find_pmd_or_thp_or_none(mm, address, &pmd);
+	if (result != SCAN_SUCCEED)
 		goto out;
-	}
 
 	memset(cc->node_load, 0, sizeof(cc->node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -1421,8 +1383,7 @@
 	if (!PageHead(hpage))
 		goto drop_hpage;
 
-	pmd = mm_find_pmd(mm, haddr);
-	if (!pmd)
+	if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED)
 		goto drop_hpage;
 
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
@@ -1539,8 +1502,7 @@
 		if (vma->vm_end < addr + HPAGE_PMD_SIZE)
 			continue;
 		mm = vma->vm_mm;
-		pmd = mm_find_pmd(mm, addr);
-		if (!pmd)
+		if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
 			continue;
 		/*
 		 * We need exclusive mmap_lock to retract page table.
+10
mm/ksm.c
@@ -1134,6 +1134,7 @@
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t *pmd;
+	pmd_t pmde;
 	pte_t *ptep;
 	pte_t newpte;
 	spinlock_t *ptl;
@@ -1148,6 +1147,15 @@
 
 	pmd = mm_find_pmd(mm, addr);
 	if (!pmd)
+		goto out;
+	/*
+	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
+	 * without holding anon_vma lock for write. So when looking for a
+	 * genuine pmde (in which to find pte), test present and !THP together.
+	 */
+	pmde = *pmd;
+	barrier();
+	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
 		goto out;
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+5 -10
mm/rmap.c
@@ -767,13 +767,17 @@
 	return vma_address(page, vma);
 }
 
+/*
+ * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
+ * NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
+ * represents.
+ */
 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd = NULL;
-	pmd_t pmde;
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
@@ -792,15 +788,6 @@
 		goto out;
 
 	pmd = pmd_offset(pud, address);
-	/*
-	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
-	 * without holding anon_vma lock for write. So when looking for a
-	 * genuine pmde (in which to find pte), test present and !THP together.
-	 */
-	pmde = *pmd;
-	barrier();
-	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
-		pmd = NULL;
 out:
 	return pmd;
 }