Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/hugetlb: fix races when looking up a CONT-PTE/PMD size hugetlb page

On some architectures (like ARM64), it can support CONT-PTE/PMD size
hugetlb, which means it can support not only PMD/PUD size hugetlb (2M and
1G), but also CONT-PTE/PMD size (64K and 32M) if a 4K page size is specified.

So when looking up a CONT-PTE size hugetlb page by follow_page(), it will
use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE size
hugetlb in follow_page_pte(). However this pte entry lock is incorrect
for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to get
the correct lock, which is mm->page_table_lock.

That means the pte entry of the CONT-PTE size hugetlb under current pte
lock is unstable in follow_page_pte(), we can continue to migrate or
poison the pte entry of the CONT-PTE size hugetlb, which can cause some
potential race issues, even though they are under the 'pte lock'.

For example, suppose thread A is trying to look up a CONT-PTE size hugetlb
page by move_pages() syscall under the lock, however another thread B can
migrate the CONT-PTE hugetlb page at the same time, which will cause
thread A to get an incorrect page, if thread A also wants to do page
migration, then data inconsistency error occurs.

Moreover we have the same issue for CONT-PMD size hugetlb in
follow_huge_pmd().

To fix above issues, rename the follow_huge_pmd() as follow_huge_pmd_pte()
to handle PMD and PTE level size hugetlb, which uses huge_pte_lock() to
get the correct pte entry lock to make the pte entry stable.

Mike said:

Support for CONT_PMD/_PTE was added with bb9dd3df8ee9 ("arm64: hugetlb:
refactor find_num_contig()"). Patch series "Support for contiguous pte
hugepages", v4. However, I do not believe these code paths were
executed until migration support was added with 5480280d3f2d ("arm64/mm:
enable HugeTLB migration for contiguous bit HugeTLB pages"). I would go
with 5480280d3f2d for the Fixes: target.

Link: https://lkml.kernel.org/r/635f43bdd85ac2615a58405da82b4d33c6e5eb05.1662017562.git.baolin.wang@linux.alibaba.com
Fixes: 5480280d3f2d ("arm64/mm: enable HugeTLB migration for contiguous bit HugeTLB pages")
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Baolin Wang and committed by
Andrew Morton
fac35ba7 1c8e2349

+30 -19
+4 -4
include/linux/hugetlb.h
··· 207 207 struct page *follow_huge_pd(struct vm_area_struct *vma, 208 208 unsigned long address, hugepd_t hpd, 209 209 int flags, int pdshift); 210 - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 211 - pmd_t *pmd, int flags); 210 + struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, 211 + int flags); 212 212 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, 213 213 pud_t *pud, int flags); 214 214 struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, ··· 312 312 return NULL; 313 313 } 314 314 315 - static inline struct page *follow_huge_pmd(struct mm_struct *mm, 316 - unsigned long address, pmd_t *pmd, int flags) 315 + static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, 316 + unsigned long address, int flags) 317 317 { 318 318 return NULL; 319 319 }
+13 -1
mm/gup.c
··· 530 530 if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == 531 531 (FOLL_PIN | FOLL_GET))) 532 532 return ERR_PTR(-EINVAL); 533 + 534 + /* 535 + * Considering PTE level hugetlb, like continuous-PTE hugetlb on 536 + * ARM64 architecture. 537 + */ 538 + if (is_vm_hugetlb_page(vma)) { 539 + page = follow_huge_pmd_pte(vma, address, flags); 540 + if (page) 541 + return page; 542 + return no_page_table(vma, flags); 543 + } 544 + 533 545 retry: 534 546 if (unlikely(pmd_bad(*pmd))) 535 547 return no_page_table(vma, flags); ··· 674 662 if (pmd_none(pmdval)) 675 663 return no_page_table(vma, flags); 676 664 if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { 677 - page = follow_huge_pmd(mm, address, pmd, flags); 665 + page = follow_huge_pmd_pte(vma, address, flags); 678 666 if (page) 679 667 return page; 680 668 return no_page_table(vma, flags);
+13 -14
mm/hugetlb.c
··· 6946 6946 } 6947 6947 6948 6948 struct page * __weak 6949 - follow_huge_pmd(struct mm_struct *mm, unsigned long address, 6950 - pmd_t *pmd, int flags) 6949 + follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) 6951 6950 { 6951 + struct hstate *h = hstate_vma(vma); 6952 + struct mm_struct *mm = vma->vm_mm; 6952 6953 struct page *page = NULL; 6953 6954 spinlock_t *ptl; 6954 - pte_t pte; 6955 + pte_t *ptep, pte; 6955 6956 6956 6957 /* 6957 6958 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via ··· 6962 6961 return NULL; 6963 6962 6964 6963 retry: 6965 - ptl = pmd_lockptr(mm, pmd); 6966 - spin_lock(ptl); 6967 - /* 6968 - * make sure that the address range covered by this pmd is not 6969 - * unmapped from other threads. 6970 - */ 6971 - if (!pmd_huge(*pmd)) 6972 - goto out; 6973 - pte = huge_ptep_get((pte_t *)pmd); 6964 + ptep = huge_pte_offset(mm, address, huge_page_size(h)); 6965 + if (!ptep) 6966 + return NULL; 6967 + 6968 + ptl = huge_pte_lock(h, mm, ptep); 6969 + pte = huge_ptep_get(ptep); 6974 6970 if (pte_present(pte)) { 6975 - page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); 6971 + page = pte_page(pte) + 6972 + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); 6976 6973 /* 6977 6974 * try_grab_page() should always succeed here, because: a) we 6978 6975 * hold the pmd (ptl) lock, and b) we've just checked that the ··· 6986 6987 } else { 6987 6988 if (is_hugetlb_entry_migration(pte)) { 6988 6989 spin_unlock(ptl); 6989 - __migration_entry_wait_huge((pte_t *)pmd, ptl); 6990 + __migration_entry_wait_huge(ptep, ptl); 6990 6991 goto retry; 6991 6992 } 6992 6993 /*