Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

x86/mm: Fix PTI for i386 some more

So it turns out that we have to do two passes of
pti_clone_entry_text(): once before initcalls, such that device and
late initcalls can use user-mode-helper / modprobe, and once after
free_initmem() / mark_readonly().

Now obviously mark_readonly() can cause PMD splits, and
pti_clone_pgtable() doesn't like that much.

Allow the late clone to split PMDs so that pagetables stay in sync.

[peterz: Changelog and comments]
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lkml.kernel.org/r/20240806184843.GX37996@noisy.programming.kicks-ass.net

authored by

Thomas Gleixner and committed by
Peter Zijlstra
c48b5a4c de9c2c66

+29 -16
+29 -16
arch/x86/mm/pti.c
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -241,7 +241,7 @@
  *
  * Returns a pointer to a PTE on success, or NULL on failure.
  */
-static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text)
 {
 	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 	pmd_t *pmd;
@@ -251,10 +251,15 @@
 	if (!pmd)
 		return NULL;
 
-	/* We can't do anything sensible if we hit a large mapping. */
+	/* Large PMD mapping found */
 	if (pmd_leaf(*pmd)) {
-		WARN_ON(1);
-		return NULL;
+		/* Clear the PMD if we hit a large mapping from the first round */
+		if (late_text) {
+			set_pmd(pmd, __pmd(0));
+		} else {
+			WARN_ON_ONCE(1);
+			return NULL;
+		}
 	}
 
 	if (pmd_none(*pmd)) {
@@ -288,7 +283,7 @@
 	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
 		return;
 
-	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false);
 	if (WARN_ON(!target_pte))
 		return;
 
@@ -306,7 +301,7 @@
 
 static void
 pti_clone_pgtable(unsigned long start, unsigned long end,
-		  enum pti_clone_level level)
+		  enum pti_clone_level level, bool late_text)
 {
 	unsigned long addr;
 
@@ -395,7 +390,7 @@
 			return;
 
 		/* Allocate PTE in the user page-table */
-		target_pte = pti_user_pagetable_walk_pte(addr);
+		target_pte = pti_user_pagetable_walk_pte(addr, late_text);
 		if (WARN_ON(!target_pte))
 			return;
 
@@ -457,7 +452,7 @@
 		phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
 		pte_t *target_pte;
 
-		target_pte = pti_user_pagetable_walk_pte(va);
+		target_pte = pti_user_pagetable_walk_pte(va, false);
 		if (WARN_ON(!target_pte))
 			return;
 
@@ -480,7 +475,7 @@
 	start = CPU_ENTRY_AREA_BASE;
 	end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
 
-	pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+	pti_clone_pgtable(start, end, PTI_CLONE_PMD, false);
 }
 #endif /* CONFIG_X86_64 */
 
@@ -497,11 +492,11 @@
 /*
  * Clone the populated PMDs of the entry text and force it RO.
  */
-static void pti_clone_entry_text(void)
+static void pti_clone_entry_text(bool late)
 {
 	pti_clone_pgtable((unsigned long) __entry_text_start,
 			  (unsigned long) __entry_text_end,
-			  PTI_LEVEL_KERNEL_IMAGE);
+			  PTI_LEVEL_KERNEL_IMAGE, late);
 }
 
 /*
@@ -576,7 +571,7 @@
 	 * pti_set_kernel_image_nonglobal() did to clear the
 	 * global bit.
 	 */
-	pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+	pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false);
 
 	/*
 	 * pti_clone_pgtable() will set the global bit in any PMDs
@@ -643,8 +638,15 @@
 
 	/* Undo all global bits from the init pagetables in head_64.S: */
 	pti_set_kernel_image_nonglobal();
+
 	/* Replace some of the global bits just for shared entry text: */
-	pti_clone_entry_text();
+	/*
+	 * This is very early in boot. Device and Late initcalls can do
+	 * modprobe before free_initmem() and mark_readonly(). This
+	 * pti_clone_entry_text() allows those user-mode-helpers to function,
+	 * but notably the text is still RW.
+	 */
+	pti_clone_entry_text(false);
 	pti_setup_espfix64();
 	pti_setup_vsyscall();
 }
@@ -668,10 +656,11 @@
 	if (!boot_cpu_has(X86_FEATURE_PTI))
 		return;
 	/*
-	 * We need to clone everything (again) that maps parts of the
-	 * kernel image.
+	 * This is after free_initmem() (all initcalls are done) and we've done
+	 * mark_readonly(). Text is now NX which might've split some PMDs
+	 * relative to the early clone.
 	 */
-	pti_clone_entry_text();
+	pti_clone_entry_text(true);
 	pti_clone_kernel_text();
 
 	debug_checkwx_user();