Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: close PageTail race

Commit bf6bddf1924e ("mm: introduce compaction and migration for
ballooned pages") introduces page_count(page) into memory compaction
which dereferences page->first_page if PageTail(page).

This results in a very rare NULL pointer dereference on the
aforementioned page_count(page). Indeed, anything that calls
compound_head(), including page_count(), is susceptible to racing with
prep_compound_page() and seeing a NULL or dangling page->first_page
pointer.

This patch uses Andrea's implementation of compound_trans_head() that
deals with such a race and makes it the default compound_head()
implementation. This includes a read memory barrier that ensures that,
if PageTail(page) is still true after the barrier, we return a head page
that is neither NULL nor dangling. The patch then adds a store memory
barrier to prep_compound_page() to ensure page->first_page is set before
PageTail(page) becomes visible.

This is the safest way to ensure we see the head page that we are
expecting; PageTail(page) is already in the unlikely() path, and the
memory barriers are unfortunately required.

Hugetlbfs is the exception: we don't enforce a store memory barrier
during init since no race is possible.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Holger Kiehl <Holger.Kiehl@dwd.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rafael Aquini <aquini@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

David Rientjes and committed by
Linus Torvalds
668f9abb aa15aa0e

+25 -55
+2 -2
drivers/block/aoe/aoecmd.c
··· 874 874 /* Non-zero page count for non-head members of 875 875 * compound pages is no longer allowed by the kernel. 876 876 */ 877 - page = compound_trans_head(bv.bv_page); 877 + page = compound_head(bv.bv_page); 878 878 atomic_inc(&page->_count); 879 879 } 880 880 } ··· 887 887 struct bvec_iter iter; 888 888 889 889 bio_for_each_segment(bv, bio, iter) { 890 - page = compound_trans_head(bv.bv_page); 890 + page = compound_head(bv.bv_page); 891 891 atomic_dec(&page->_count); 892 892 } 893 893 }
+2 -2
drivers/vfio/vfio_iommu_type1.c
··· 186 186 if (pfn_valid(pfn)) { 187 187 bool reserved; 188 188 struct page *tail = pfn_to_page(pfn); 189 - struct page *head = compound_trans_head(tail); 189 + struct page *head = compound_head(tail); 190 190 reserved = !!(PageReserved(head)); 191 191 if (head != tail) { 192 192 /* 193 193 * "head" is not a dangling pointer 194 - * (compound_trans_head takes care of that) 194 + * (compound_head takes care of that) 195 195 * but the hugepage may have been split 196 196 * from under us (and we may not hold a 197 197 * reference count on the head page so it can
+2 -3
fs/proc/page.c
··· 121 121 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon 122 122 * to make sure a given page is a thp, not a non-huge compound page. 123 123 */ 124 - else if (PageTransCompound(page) && 125 - (PageLRU(compound_trans_head(page)) || 126 - PageAnon(compound_trans_head(page)))) 124 + else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || 125 + PageAnon(compound_head(page)))) 127 126 u |= 1 << KPF_THP; 128 127 129 128 /*
-41
include/linux/huge_mm.h
··· 157 157 return HPAGE_PMD_NR; 158 158 return 1; 159 159 } 160 - /* 161 - * compound_trans_head() should be used instead of compound_head(), 162 - * whenever the "page" passed as parameter could be the tail of a 163 - * transparent hugepage that could be undergoing a 164 - * __split_huge_page_refcount(). The page structure layout often 165 - * changes across releases and it makes extensive use of unions. So if 166 - * the page structure layout will change in a way that 167 - * page->first_page gets clobbered by __split_huge_page_refcount, the 168 - * implementation making use of smp_rmb() will be required. 169 - * 170 - * Currently we define compound_trans_head as compound_head, because 171 - * page->private is in the same union with page->first_page, and 172 - * page->private isn't clobbered. However this also means we're 173 - * currently leaving dirt into the page->private field of anonymous 174 - * pages resulting from a THP split, instead of setting page->private 175 - * to zero like for every other page that has PG_private not set. But 176 - * anonymous pages don't use page->private so this is not a problem. 177 - */ 178 - #if 0 179 - /* This will be needed if page->private will be clobbered in split_huge_page */ 180 - static inline struct page *compound_trans_head(struct page *page) 181 - { 182 - if (PageTail(page)) { 183 - struct page *head; 184 - head = page->first_page; 185 - smp_rmb(); 186 - /* 187 - * head may be a dangling pointer. 188 - * __split_huge_page_refcount clears PageTail before 189 - * overwriting first_page, so if PageTail is still 190 - * there it means the head pointer isn't dangling. 
191 - */ 192 - if (PageTail(page)) 193 - return head; 194 - } 195 - return page; 196 - } 197 - #else 198 - #define compound_trans_head(page) compound_head(page) 199 - #endif 200 160 201 161 extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 202 162 unsigned long addr, pmd_t pmd, pmd_t *pmdp); ··· 186 226 do { } while (0) 187 227 #define split_huge_page_pmd_mm(__mm, __address, __pmd) \ 188 228 do { } while (0) 189 - #define compound_trans_head(page) compound_head(page) 190 229 static inline int hugepage_madvise(struct vm_area_struct *vma, 191 230 unsigned long *vm_flags, int advice) 192 231 {
+12 -2
include/linux/mm.h
··· 399 399 400 400 static inline struct page *compound_head(struct page *page) 401 401 { 402 - if (unlikely(PageTail(page))) 403 - return page->first_page; 402 + if (unlikely(PageTail(page))) { 403 + struct page *head = page->first_page; 404 + 405 + /* 406 + * page->first_page may be a dangling pointer to an old 407 + * compound page, so recheck that it is still a tail 408 + * page before returning. 409 + */ 410 + smp_rmb(); 411 + if (likely(PageTail(page))) 412 + return head; 413 + } 404 414 return page; 405 415 } 406 416
+1 -1
mm/ksm.c
··· 444 444 static struct page *page_trans_compound_anon(struct page *page) 445 445 { 446 446 if (PageTransCompound(page)) { 447 - struct page *head = compound_trans_head(page); 447 + struct page *head = compound_head(page); 448 448 /* 449 449 * head may actually be splitted and freed from under 450 450 * us but it's ok here.
+1 -1
mm/memory-failure.c
··· 1651 1651 { 1652 1652 int ret; 1653 1653 unsigned long pfn = page_to_pfn(page); 1654 - struct page *hpage = compound_trans_head(page); 1654 + struct page *hpage = compound_head(page); 1655 1655 1656 1656 if (PageHWPoison(page)) { 1657 1657 pr_info("soft offline: %#lx page already poisoned\n", pfn);
+3 -1
mm/page_alloc.c
··· 369 369 __SetPageHead(page); 370 370 for (i = 1; i < nr_pages; i++) { 371 371 struct page *p = page + i; 372 - __SetPageTail(p); 373 372 set_page_count(p, 0); 374 373 p->first_page = page; 374 + /* Make sure p->first_page is always valid for PageTail() */ 375 + smp_wmb(); 376 + __SetPageTail(p); 375 377 } 376 378 } 377 379
+2 -2
mm/swap.c
··· 98 98 } 99 99 100 100 /* __split_huge_page_refcount can run under us */ 101 - page_head = compound_trans_head(page); 101 + page_head = compound_head(page); 102 102 103 103 /* 104 104 * THP can not break up slab pages so avoid taking ··· 253 253 */ 254 254 unsigned long flags; 255 255 bool got; 256 - struct page *page_head = compound_trans_head(page); 256 + struct page *page_head = compound_head(page); 257 257 258 258 /* Ref to put_compound_page() comment. */ 259 259 if (!__compound_tail_refcounted(page_head)) {