x86/mm/pat: Fix VM_PAT handling when fork() fails in copy_page_range()

If track_pfn_copy() fails, we already added the dst VMA to the maple
tree. As fork() fails, we'll clean up the maple tree and stumble over
the dst VMA, for which we neither performed any reservation nor copied
any page tables.

Consequently untrack_pfn() will see VM_PAT and try obtaining the
PAT information from the page table -- which fails because the page
table was not copied.
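
In condensed pseudo-code (not the literal kernel code, details elided),
the failing sequence is:

	dst_vma = vm_area_dup(src_vma);  /* vm_flags, incl. VM_PAT, are copied */
	/* dst_vma is linked into the new mm's maple tree */
	copy_page_range(dst_vma, src_vma)
	  -> track_pfn_copy(src_vma) fails   /* no reservation, no page tables */
	/* fork() aborts; the new mm is torn down: */
	__mmput() -> exit_mmap() -> unmap_vmas() -> unmap_single_vma(dst_vma)
	  -> untrack_pfn()                   /* sees VM_PAT on dst_vma */
	    -> get_pat_info()                /* reads empty page tables -> WARN */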

The easiest fix would be to simply clear the VM_PAT flag of the dst VMA
if track_pfn_copy() fails. However, the whole approach of "simply"
clearing the VM_PAT flag is shaky as well: if we passed track_pfn_copy()
and performed a reservation, but copying the page tables fails, we'll
simply clear the VM_PAT flag, not properly undoing the reservation ...
which is also wrong.

So let's fix it properly: set the VM_PAT flag only once the reservation
succeeded (leaving it clear initially), and undo the reservation if
anything goes wrong while copying the page tables, clearing the VM_PAT
flag only after undoing the reservation.
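
Condensed from the diff below (error paths and unrelated details
trimmed), the resulting flow in copy_page_range() is:

	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
		ret = track_pfn_copy(dst_vma, src_vma, &pfn);
		if (ret)
			return ret;
	}

	... copy the page tables, possibly failing with -ENOMEM ...

	if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
		untrack_pfn_copy(dst_vma, pfn);
	return ret;

track_pfn_copy() sets VM_PAT on the dst VMA only after
reserve_pfn_range() succeeded, and untrack_pfn_copy() drops that
reservation and clears VM_PAT again via untrack_pfn().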

Note that any copied page table entries will get zapped when the VMA gets
removed later, after copy_page_range() failed; as VM_PAT is not set at
that point, we won't try cleaning up VM_PAT once more and untrack_pfn()
will be happy. Note that leaving these page tables in place without a
reservation is not a problem, as we are aborting fork(); this process
will never run.

A reproducer usually triggers this on the first try:

https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/reproducers/pat_fork.c

WARNING: CPU: 26 PID: 11650 at arch/x86/mm/pat/memtype.c:983 get_pat_info+0xf6/0x110
Modules linked in: ...
CPU: 26 UID: 0 PID: 11650 Comm: repro3 Not tainted 6.12.0-rc5+ #92
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
RIP: 0010:get_pat_info+0xf6/0x110
...
Call Trace:
<TASK>
...
untrack_pfn+0x52/0x110
unmap_single_vma+0xa6/0xe0
unmap_vmas+0x105/0x1f0
exit_mmap+0xf6/0x460
__mmput+0x4b/0x120
copy_process+0x1bf6/0x2aa0
kernel_clone+0xab/0x440
__do_sys_clone+0x66/0x90
do_syscall_64+0x95/0x180

Likely this case was missed in:

d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")

... and instead of undoing the reservation we simply cleared the VM_PAT flag.

Keep the documentation of these functions in include/linux/pgtable.h;
one place is more than sufficient. We should clean that up for the other
functions, like track_pfn_remap/untrack_pfn, separately.

Fixes: d155df53f310 ("x86/mm/pat: clear VM_PAT if copy_p4d_range failed")
Fixes: 2ab640379a0a ("x86: PAT: hooks in generic vm code to help archs to track pfnmap regions - v3")
Reported-by: xingwei lee <xrivendell7@gmail.com>
Reported-by: yuxin wang <wang1315768607@163.com>
Reported-by: Marius Fleischer <fleischermarius@gmail.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/20250321112323.153741-1-david@redhat.com
Closes: https://lore.kernel.org/lkml/CABOYnLx_dnqzpCW99G81DmOr+2UzdmZMk=T3uxwNxwz+R1RAwg@mail.gmail.com/
Closes: https://lore.kernel.org/lkml/CAJg=8jwijTP5fre8woS4JVJQ8iUA6v+iNcsOgtj9Zfpc3obDOQ@mail.gmail.com/

---
 arch/x86/mm/pat/memtype.c | +28 -24
 include/linux/pgtable.h   | +22 -6
 kernel/fork.c             | +4
 mm/memory.c               | +4 -7
 4 files changed, 58 insertions(+), 37 deletions(-)

--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ ... @@
 		return -EINVAL;
 	}
 
-/*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
- *
- * If the vma has a linear pfn mapping for the entire range, we get the prot
- * from pte and reserve the entire vma range with single reserve_pfn_range call.
- */
-int track_pfn_copy(struct vm_area_struct *vma)
+int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn)
 {
+	const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
 	resource_size_t paddr;
-	unsigned long vma_size = vma->vm_end - vma->vm_start;
 	pgprot_t pgprot;
+	int rc;
 
-	if (vma->vm_flags & VM_PAT) {
-		if (get_pat_info(vma, &paddr, &pgprot))
-			return -EINVAL;
-		/* reserve the whole chunk covered by vma. */
-		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
-	}
+	if (!(src_vma->vm_flags & VM_PAT))
+		return 0;
 
+	/*
+	 * Duplicate the PAT information for the dst VMA based on the src
+	 * VMA.
+	 */
+	if (get_pat_info(src_vma, &paddr, &pgprot))
+		return -EINVAL;
+	rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+	if (rc)
+		return rc;
+
+	/* Reservation for the destination VMA succeeded. */
+	vm_flags_set(dst_vma, VM_PAT);
+	*pfn = PHYS_PFN(paddr);
 	return 0;
+}
+
+void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
+{
+	untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
+	/*
+	 * Reservation was freed, any copied page tables will get cleaned
+	 * up later, but without getting PAT involved again.
+	 */
 }
 
 /*
@@ ... @@
 	}
 }
 
-/*
- * untrack_pfn_clear is called if the following situation fits:
- *
- * 1) while mremapping a pfnmap for a new region, with the old vma after
- *    its pfnmap page table has been removed. The new vma has a new pfnmap
- *    to the same pfn & cache type with VM_PAT set.
- * 2) while duplicating vm area, the new vma fails to copy the pgtable from
- *    old vma.
- */
 void untrack_pfn_clear(struct vm_area_struct *vma)
 {
 	vm_flags_clear(vma, VM_PAT);

--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ ... @@
 }
 
 /*
- * track_pfn_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
+ * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
+ * tables copied during copy_page_range(). On success, stores the pfn to be
+ * passed to untrack_pfn_copy().
  */
-static inline int track_pfn_copy(struct vm_area_struct *vma)
+static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn)
 {
 	return 0;
+}
+
+/*
+ * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
+ * copy_page_range(), but after track_pfn_copy() was already called.
+ */
+static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+		unsigned long pfn)
+{
 }
 
 /*
@@ ... @@
 }
 
 /*
- * untrack_pfn_clear is called while mremapping a pfnmap for a new region
- * or fails to copy pgtable during duplicate vm area.
+ * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
+ *
+ * 1) During mremap() on the src VMA after the page tables were moved.
+ * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
  */
 static inline void untrack_pfn_clear(struct vm_area_struct *vma)
 {
@@ ... @@
 			unsigned long size);
 extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 			     pfn_t pfn);
-extern int track_pfn_copy(struct vm_area_struct *vma);
+extern int track_pfn_copy(struct vm_area_struct *dst_vma,
+		struct vm_area_struct *src_vma, unsigned long *pfn);
+extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
+		unsigned long pfn);
 extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 			unsigned long size, bool mm_wr_locked);
 extern void untrack_pfn_clear(struct vm_area_struct *vma);

--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ ... @@
 	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
 
+	/* track_pfn_copy() will later take care of copying internal state. */
+	if (unlikely(new->vm_flags & VM_PFNMAP))
+		untrack_pfn_clear(new);
+
 	return new;
 }

--- a/mm/memory.c
+++ b/mm/memory.c
@@ ... @@
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
 	pgd_t *src_pgd, *dst_pgd;
-	unsigned long next;
 	unsigned long addr = src_vma->vm_start;
 	unsigned long end = src_vma->vm_end;
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
 	struct mm_struct *src_mm = src_vma->vm_mm;
 	struct mmu_notifier_range range;
+	unsigned long next, pfn;
 	bool is_cow;
 	int ret;
 
@@ ... @@
 		return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
 
 	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
-		/*
-		 * We do not free on error cases below as remove_vma
-		 * gets called on error from higher level routine
-		 */
-		ret = track_pfn_copy(src_vma);
+		ret = track_pfn_copy(dst_vma, src_vma, &pfn);
 		if (ret)
 			return ret;
 	}
@@ ... @@
 			continue;
 		if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
 					    addr, next))) {
-			untrack_pfn_clear(dst_vma);
 			ret = -ENOMEM;
 			break;
 		}
@@ ... @@
 		raw_write_seqcount_end(&src_mm->write_protect_seq);
 		mmu_notifier_invalidate_range_end(&range);
 	}
+	if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+		untrack_pfn_copy(dst_vma, pfn);
 	return ret;
 }