Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: Enable vmalloc-huge with ptdump

Our goal is to move towards enabling vmalloc-huge by default on arm64 so
as to reduce TLB pressure. Therefore, we need a way to analyze the proportion
of block mappings in vmalloc space achievable on a production system; this
can be done through ptdump, but currently we disable vmalloc-huge if
CONFIG_PTDUMP_DEBUGFS is on. The reason is that lazy freeing of kernel
pagetables via vmap_try_huge_pxd() may race with ptdump, so ptdump
may dereference a bogus address.

To solve this, we need to synchronize ptdump_walk() and ptdump_check_wx()
with pud_free_pmd_page() and pmd_free_pte_page().

Since this race is very unlikely to happen in practice, we do not want to
penalize the vmalloc pagetable tearing path by taking the init_mm
mmap_lock. Therefore, we use static keys. ptdump_walk() and
ptdump_check_wx() are the pagetable walkers; they will enable the static
key - upon observing that, the vmalloc pagetable tearing path will get
patched in with an mmap_read_lock/unlock sequence. A combination of the
patched-in mmap_read_lock/unlock, the acquire semantics of
static_branch_inc(), and the barriers in __flush_tlb_kernel_pgtable()
ensures that ptdump will never get a hold on the address of a freed PMD
or PTE table.

We can verify the correctness of the algorithm via the following litmus
test (thanks to James Houghton and Will Deacon):

AArch64 ptdump
Variant=Ifetch
{
uint64_t pud=0xa110c;
uint64_t pmd;

0:X0=label:"P1:L0"; 0:X1=instr:"NOP"; 0:X2=lock; 0:X3=pud; 0:X4=pmd;
1:X1=0xdead; 1:X2=lock; 1:X3=pud; 1:X4=pmd;
}
P0 | P1 ;
(* static_key_enable *) | (* pud_free_pmd_page *) ;
STR W1, [X0] | LDR X9, [X3] ;
DC CVAU,X0 | STR XZR, [X3] ;
DSB ISH | DSB ISH ;
IC IVAU,X0 | ISB ;
DSB ISH | ;
ISB | (* static key *) ;
| L0: ;
(* mmap_lock *) | B out1 ;
Lwlock: | ;
MOV W7, #1 | (* mmap_lock *) ;
SWPA W7, W8, [X2] | Lrlock: ;
| MOV W7, #1 ;
| SWPA W7, W8, [X2] ;
(* walk pgtable *) | ;
LDR X9, [X3] | (* mmap_unlock *) ;
CBZ X9, out0 | STLR WZR, [X2] ;
EOR X10, X9, X9 | ;
LDR X11, [X4, X10] | out1: ;
| EOR X10, X9, X9 ;
out0: | STR X1, [X4, X10] ;

exists (0:X8=0 /\ 1:X8=0 /\ (* Lock acquisitions succeed *)
0:X9=0xa110c /\ (* P0 sees the valid PUD ...*)
0:X11=0xdead) (* ... but the freed PMD *)

For an approximate written proof of why this algorithm works, please read
the code comment in [1], which is now removed for the sake of simplicity.

mm-selftests pass. No issues were observed while running, in parallel,
test_vmalloc.sh (which stresses the vmalloc subsystem)
and cat /sys/kernel/debug/{kernel_page_tables, check_wx_pages} in a loop.

Link: https://lore.kernel.org/all/20250723161827.15802-1-dev.jain@arm.com/ [1]
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>

authored by

Dev Jain and committed by
Will Deacon
fa93b45f 3df6979d

+52 -13
+2
arch/arm64/include/asm/ptdump.h
··· 7 7 8 8 #include <linux/ptdump.h> 9 9 10 + DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); 11 + 10 12 #ifdef CONFIG_PTDUMP 11 13 12 14 #include <linux/mm_types.h>
+2 -7
arch/arm64/include/asm/vmalloc.h
··· 9 9 #define arch_vmap_pud_supported arch_vmap_pud_supported 10 10 static inline bool arch_vmap_pud_supported(pgprot_t prot) 11 11 { 12 - /* 13 - * SW table walks can't handle removal of intermediate entries. 14 - */ 15 - return pud_sect_supported() && 16 - !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); 12 + return pud_sect_supported(); 17 13 } 18 14 19 15 #define arch_vmap_pmd_supported arch_vmap_pmd_supported 20 16 static inline bool arch_vmap_pmd_supported(pgprot_t prot) 21 17 { 22 - /* See arch_vmap_pud_supported() */ 23 - return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); 18 + return true; 24 19 } 25 20 26 21 #define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
+39 -4
arch/arm64/mm/mmu.c
··· 56 56 TABLE_P4D, 57 57 }; 58 58 59 + DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); 60 + 59 61 u64 kimage_voffset __ro_after_init; 60 62 EXPORT_SYMBOL(kimage_voffset); 61 63 ··· 1667 1665 return 1; 1668 1666 } 1669 1667 1670 - int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) 1668 + static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr, 1669 + bool acquire_mmap_lock) 1671 1670 { 1672 1671 pte_t *table; 1673 1672 pmd_t pmd; ··· 1680 1677 return 1; 1681 1678 } 1682 1679 1680 + /* See comment in pud_free_pmd_page for static key logic */ 1683 1681 table = pte_offset_kernel(pmdp, addr); 1684 1682 pmd_clear(pmdp); 1685 1683 __flush_tlb_kernel_pgtable(addr); 1684 + if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) { 1685 + mmap_read_lock(&init_mm); 1686 + mmap_read_unlock(&init_mm); 1687 + } 1688 + 1686 1689 pte_free_kernel(NULL, table); 1687 1690 return 1; 1691 + } 1692 + 1693 + int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) 1694 + { 1695 + /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */ 1696 + return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true); 1688 1697 } 1689 1698 1690 1699 int pud_free_pmd_page(pud_t *pudp, unsigned long addr) ··· 1714 1699 } 1715 1700 1716 1701 table = pmd_offset(pudp, addr); 1702 + 1703 + /* 1704 + * Our objective is to prevent ptdump from reading a PMD table which has 1705 + * been freed. In this race, if pud_free_pmd_page observes the key on 1706 + * (which got flipped by ptdump) then the mmap lock sequence here will, 1707 + * as a result of the mmap write lock/unlock sequence in ptdump, give 1708 + * us the correct synchronization. If not, this means that ptdump has 1709 + * yet not started walking the pagetables - the sequence of barriers 1710 + * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will 1711 + * observe an empty PUD. 
1712 + */ 1713 + pud_clear(pudp); 1714 + __flush_tlb_kernel_pgtable(addr); 1715 + if (static_branch_unlikely(&arm64_ptdump_lock_key)) { 1716 + mmap_read_lock(&init_mm); 1717 + mmap_read_unlock(&init_mm); 1718 + } 1719 + 1717 1720 pmdp = table; 1718 1721 next = addr; 1719 1722 end = addr + PUD_SIZE; 1720 1723 do { 1721 1724 if (pmd_present(pmdp_get(pmdp))) 1722 - pmd_free_pte_page(pmdp, next); 1725 + /* 1726 + * PMD has been isolated, so ptdump won't see it. No 1727 + * need to acquire init_mm.mmap_lock. 1728 + */ 1729 + __pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false); 1723 1730 } while (pmdp++, next += PMD_SIZE, next != end); 1724 1731 1725 - pud_clear(pudp); 1726 - __flush_tlb_kernel_pgtable(addr); 1727 1732 pmd_free(NULL, table); 1728 1733 return 1; 1729 1734 }
+9 -2
arch/arm64/mm/ptdump.c
··· 283 283 note_page(pt_st, 0, -1, pte_val(pte_zero)); 284 284 } 285 285 286 + static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm) 287 + { 288 + static_branch_inc(&arm64_ptdump_lock_key); 289 + ptdump_walk_pgd(st, mm, NULL); 290 + static_branch_dec(&arm64_ptdump_lock_key); 291 + } 292 + 286 293 void ptdump_walk(struct seq_file *s, struct ptdump_info *info) 287 294 { 288 295 unsigned long end = ~0UL; ··· 318 311 } 319 312 }; 320 313 321 - ptdump_walk_pgd(&st.ptdump, info->mm, NULL); 314 + arm64_ptdump_walk_pgd(&st.ptdump, info->mm); 322 315 } 323 316 324 317 static void __init ptdump_initialize(void) ··· 360 353 } 361 354 }; 362 355 363 - ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); 356 + arm64_ptdump_walk_pgd(&st.ptdump, &init_mm); 364 357 365 358 if (st.wx_pages || st.uxn_pages) { 366 359 pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",