x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()

If track_pfn_copy() fails, we already added the dst VMA to the maple
tree. As fork() fails, we'll clean up the maple tree, and stumble over
the dst VMA for which we neither performed any reservation nor copied
any page tables.

Consequently untrack_pfn() will see VM_PAT and try obtaining the
PAT information from the page table -- which fails because the page
table was not copied.

The easiest fix would be to simply clear the VM_PAT flag of the dst VMA
if track_pfn_copy() fails. However, the whole thing about "simply" clearing
the VM_PAT flag is shaky as well: if we passed track_pfn_copy()
and performed a reservation, but copying the page tables fails, we'll
simply clear the VM_PAT flag, not properly undoing the reservation ...
which is also wrong.

So let's fix it properly: set the VM_PAT flag only if the reservation
succeeded (leaving it clear initially), and undo the reservation if
anything goes wrong while copying the page tables: clearing the VM_PAT
flag after undoing the reservation.

Note that any copied page table entries will get zapped when the VMA will
get removed later, after copy_page_range() failed; as VM_PAT is not set
then, we won't try cleaning VM_PAT up once more and untrack_pfn() will be
happy. Note that leaving these page tables in place without a reservation
is not a problem, as we are aborting fork(); this process will never run.
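
In terms of copy_page_range(), the resulting protocol for VM_PFNMAP VMAs looks
as follows (condensed from the mm/memory.c hunk below; the page-table copying
loop is elided):

        if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
                /* Reserve the range and set VM_PAT on the dst VMA on success. */
                ret = track_pfn_copy(dst_vma, src_vma, &pfn);
                if (ret)
                        return ret;
        }

        /* ... copy page tables; on failure, only ret = -ENOMEM is set ... */

        if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
                /* Undo the reservation and clear VM_PAT on the dst VMA again. */
                untrack_pfn_copy(dst_vma, pfn);
        return ret;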

A reproducer can usually trigger this on the first try:

https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c

WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110
Modules linked in: ...
CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
RIP: 0010:get_pat_info+0xf6/0x110
...
Call Trace:
<TASK>
...
untrack_pfn+0x52/0x110
unmap_single_vma+0xa6/0xe0
unmap_vmas+0x105/0x1f0
exit_mmap+0xf6/0x460
__mmput+0x4b/0x120
copy_process+0x1bf6/0x2aa0
kernel_clone+0xab/0x440
__do_sys_clone+0x66/0x90
do_syscall_64+0x95/0x180

Likely this case was missed in:

d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")

... and instead of undoing the reservation we simply cleared the VM_PAT flag.

Keep the documentation of these functions in include/linux/pgtable.h; one
place is more than sufficient -- we should clean that up for the other
functions like track_pfn_remap/untrack_pfn separately.

Fixes: d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")
Fixes: 2ab640379a0a ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3")
Reported-by: xingwei lee <xrivendell7@gmail.com>
Reported-by: yuxin wang <wang1315768607@163.com>
Reported-by: Marius Fleischer <fleischermarius@gmail.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20250321112323.153741-1-david@redhat.com
Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/
Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/

4 files changed, 58 insertions(+), 37 deletions(-)

arch/x86/mm/pat/memtype.c (+28 -24)

···
                return -EINVAL;
        }

-/*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
- *
- * If the vma has a linear pfn mapping for the entire range, we get the prot
- * from pte and reserve the entire vma range with single reserve_pfn_range call.
- */
-int track_pfn_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *dst_vma,
+               struct vm_area_struct *src_vma, unsigned long *pfn)
 {
+       const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
        resource_size_t paddr;
-       unsigned long vma_size = vma->vm_end - vma->vm_start;
        pgprot_t pgprot;
+       int rc;

-       if (vma->vm_flags & VM_PAT) {
-               if (get_pat_info(vma, &paddr, &pgprot))
-                       return -EINVAL;
-               /* reserve the whole chunk covered by vma. */
-               return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
-       }
+       if (!(src_vma->vm_flags & VM_PAT))
+               return 0;

+       /*
+        * Duplicate the PAT information for the dst VMA based on the src
+        * VMA.
+        */
+       if (get_pat_info(src_vma, &paddr, &pgprot))
+               return -EINVAL;
+       rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+       if (rc)
+               return rc;
+
+       /* Reservation for the destination VMA succeeded. */
+       vm_flags_set(dst_vma, VM_PAT);
+       *pfn = PHYS_PFN(paddr);
        return 0;
 }

+void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
+{
+       untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
+       /*
+        * Reservation was freed, any copied page tables will get cleaned
+        * up later, but without getting PAT involved again.
+        */
+}
+
 /*
···
        }
 }

-/*
- * untrack_pfn_clear is called if the following situation fits:
- *
- * 1) while mremapping a pfnmap for a new region, with the old vma after
- *    its pfnmap page table has been removed. The new vma has a new pfnmap
- *    to the same pfn & cache type with VM_PAT set.
- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
- *    old vma.
- */
 void untrack_pfn_clear(struct vm_area_struct *vma)
 {
        vm_flags_clear(vma, VM_PAT);

include/linux/pgtable.h (+22 -6)

···
 }

 /*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
+ * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
+ * tables copied during copy_page_range(). On success, stores the pfn to be
+ * passed to untrack_pfn_copy().
  */
-static inline int track_pfn_copy(struct vm_area_struct *vma)
+static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
+               struct vm_area_struct *src_vma, unsigned long *pfn)
 {
        return 0;
+}
+
+/*
+ * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
+ * copy_page_range(), but after track_pfn_copy() was already called.
+ */
+static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+               unsigned long pfn)
+{
 }

 /*
···
 }

 /*
- * untrack_pfn_clear is called while mremapping a pfnmap for a new region
- * or fails to copy pgtable during duplicate vm area.
+ * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
+ *
+ * 1) During mremap() on the src VMA after the page tables were moved.
+ * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
  */
 static inline void untrack_pfn_clear(struct vm_area_struct *vma)
 {
···
                unsigned long size);
 extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                pfn_t pfn);
-extern int track_pfn_copy(struct vm_area_struct *vma);
+extern int track_pfn_copy(struct vm_area_struct *dst_vma,
+               struct vm_area_struct *src_vma, unsigned long *pfn);
+extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+               unsigned long pfn);
 extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                unsigned long size, bool mm_wr_locked);
 extern void untrack_pfn_clear(struct vm_area_struct *vma);

kernel/fork.c (+4)

···
        vma_numab_state_init(new);
        dup_anon_vma_name(orig, new);

+       /* track_pfn_copy() will later take care of copying internal state. */
+       if (unlikely(new->vm_flags & VM_PFNMAP))
+               untrack_pfn_clear(new);
+
        return new;
 }

mm/memory.c (+4 -7)

···
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
        pgd_t *src_pgd, *dst_pgd;
-       unsigned long next;
        unsigned long addr = src_vma->vm_start;
        unsigned long end = src_vma->vm_end;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct mmu_notifier_range range;
+       unsigned long next, pfn;
        bool is_cow;
        int ret;

···
                return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);

        if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
-               /*
-                * We do not free on error cases below as remove_vma
-                * gets called on error from higher level routine
-                */
-               ret = track_pfn_copy(src_vma);
+               ret = track_pfn_copy(dst_vma, src_vma, &pfn);
                if (ret)
                        return ret;
        }
···
                        continue;
                if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                            addr, next))) {
-                       untrack_pfn_clear(dst_vma);
                        ret = -ENOMEM;
                        break;
                }
···
                raw_write_seqcount_end(&src_mm->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        }
+       if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+               untrack_pfn_copy(dst_vma, pfn);
        return ret;
 }
