Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'x86_mm_for_v6.1_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Dave Hansen:
"There are some small things here, plus one big one.

The big one detected and refused to create W+X kernel mappings. This
caused a bit of trouble and it is entirely disabled on 32-bit due to
known unfixable EFI issues. It also oopsed on some systemd eBPF use,
which kept some users from booting.

The eBPF issue is fixed, but those troubles were caught relatively
recently which made me nervous that there are more lurking. The final
commit in here retains the warnings, but doesn't actually refuse to
create W+X mappings.

Summary:

- Detect insecure W+X mappings and warn about them, including a few
bug fixes and relaxing the enforcement

- Do a long-overdue defconfig update and enable W+X boot-time
detection

- Clean up _PAGE_PSE handling (follow-up on an earlier bug)

- Rename a change_page_attr function"

* tag 'x86_mm_for_v6.1_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mm: Ease W^X enforcement back to just a warning
x86/mm: Disable W^X detection and enforcement on 32-bit
x86/mm: Add prot_sethuge() helper to abstract out _PAGE_PSE handling
x86/mm/32: Fix W^X detection when page tables do not support NX
x86/defconfig: Enable CONFIG_DEBUG_WX=y
x86/defconfig: Refresh the defconfigs
x86/mm: Refuse W^X violations
x86/mm: Rename set_memory_present() to set_memory_p()

+64 -15
+4 -2
arch/x86/configs/i386_defconfig
··· 27 27 CONFIG_CGROUP_DEBUG=y 28 28 CONFIG_BLK_DEV_INITRD=y 29 29 CONFIG_KALLSYMS_ALL=y 30 - # CONFIG_COMPAT_BRK is not set 31 30 CONFIG_PROFILING=y 32 31 CONFIG_SMP=y 33 32 CONFIG_HYPERVISOR_GUEST=y ··· 43 44 CONFIG_HZ_1000=y 44 45 CONFIG_KEXEC=y 45 46 CONFIG_CRASH_DUMP=y 47 + # CONFIG_RETHUNK is not set 46 48 CONFIG_HIBERNATION=y 47 49 CONFIG_PM_DEBUG=y 48 50 CONFIG_PM_TRACE_RTC=y ··· 62 62 CONFIG_BLK_CGROUP_IOCOST=y 63 63 CONFIG_BLK_CGROUP_IOPRIO=y 64 64 CONFIG_BINFMT_MISC=y 65 + # CONFIG_COMPAT_BRK is not set 65 66 CONFIG_NET=y 66 67 CONFIG_PACKET=y 67 68 CONFIG_UNIX=y ··· 270 269 CONFIG_SECURITY_SELINUX_BOOTPARAM=y 271 270 CONFIG_SECURITY_SELINUX_DISABLE=y 272 271 CONFIG_PRINTK_TIME=y 272 + CONFIG_DEBUG_KERNEL=y 273 273 CONFIG_FRAME_WARN=1024 274 274 CONFIG_MAGIC_SYSRQ=y 275 - CONFIG_DEBUG_KERNEL=y 275 + CONFIG_DEBUG_WX=y 276 276 CONFIG_DEBUG_STACK_USAGE=y 277 277 # CONFIG_SCHED_DEBUG is not set 278 278 CONFIG_SCHEDSTATS=y
+3 -2
arch/x86/configs/x86_64_defconfig
··· 26 26 CONFIG_CGROUP_DEBUG=y 27 27 CONFIG_BLK_DEV_INITRD=y 28 28 CONFIG_KALLSYMS_ALL=y 29 - # CONFIG_COMPAT_BRK is not set 30 29 CONFIG_PROFILING=y 31 30 CONFIG_SMP=y 32 31 CONFIG_HYPERVISOR_GUEST=y ··· 61 62 CONFIG_BLK_CGROUP_IOCOST=y 62 63 CONFIG_BLK_CGROUP_IOPRIO=y 63 64 CONFIG_BINFMT_MISC=y 65 + # CONFIG_COMPAT_BRK is not set 64 66 CONFIG_NET=y 65 67 CONFIG_PACKET=y 66 68 CONFIG_UNIX=y ··· 267 267 CONFIG_SECURITY_SELINUX_BOOTPARAM=y 268 268 CONFIG_SECURITY_SELINUX_DISABLE=y 269 269 CONFIG_PRINTK_TIME=y 270 - CONFIG_MAGIC_SYSRQ=y 271 270 CONFIG_DEBUG_KERNEL=y 271 + CONFIG_MAGIC_SYSRQ=y 272 + CONFIG_DEBUG_WX=y 272 273 CONFIG_DEBUG_STACK_USAGE=y 273 274 # CONFIG_SCHED_DEBUG is not set 274 275 CONFIG_SCHEDSTATS=y
+10 -9
arch/x86/mm/init_64.c
··· 90 90 DEFINE_ENTRY(pmd, pmd, init) 91 91 DEFINE_ENTRY(pte, pte, init) 92 92 93 + static inline pgprot_t prot_sethuge(pgprot_t prot) 94 + { 95 + WARN_ON_ONCE(pgprot_val(prot) & _PAGE_PAT); 96 + 97 + return __pgprot(pgprot_val(prot) | _PAGE_PSE); 98 + } 93 99 94 100 /* 95 101 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the ··· 563 557 if (page_size_mask & (1<<PG_LEVEL_2M)) { 564 558 pages++; 565 559 spin_lock(&init_mm.page_table_lock); 566 - set_pte_init((pte_t *)pmd, 567 - pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, 568 - __pgprot(pgprot_val(prot) | _PAGE_PSE)), 560 + set_pmd_init(pmd, 561 + pfn_pmd(paddr >> PAGE_SHIFT, prot_sethuge(prot)), 569 562 init); 570 563 spin_unlock(&init_mm.page_table_lock); 571 564 paddr_last = paddr_next; ··· 649 644 if (page_size_mask & (1<<PG_LEVEL_1G)) { 650 645 pages++; 651 646 spin_lock(&init_mm.page_table_lock); 652 - 653 - prot = __pgprot(pgprot_val(prot) | _PAGE_PSE); 654 - 655 - set_pte_init((pte_t *)pud, 656 - pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, 657 - prot), 647 + set_pud_init(pud, 648 + pfn_pud(paddr >> PAGE_SHIFT, prot_sethuge(prot)), 658 649 init); 659 650 spin_unlock(&init_mm.page_table_lock); 660 651 paddr_last = paddr_next;
+47 -2
arch/x86/mm/pat/set_memory.c
··· 580 580 } 581 581 582 582 /* 583 + * Validate strict W^X semantics. 584 + */ 585 + static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start, 586 + unsigned long pfn, unsigned long npg) 587 + { 588 + unsigned long end; 589 + 590 + /* 591 + * 32-bit has some unfixable W+X issues, like EFI code 592 + * and writeable data being in the same page. Disable 593 + * detection and enforcement there. 594 + */ 595 + if (IS_ENABLED(CONFIG_X86_32)) 596 + return new; 597 + 598 + /* Only verify when NX is supported: */ 599 + if (!(__supported_pte_mask & _PAGE_NX)) 600 + return new; 601 + 602 + if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX))) 603 + return new; 604 + 605 + if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) 606 + return new; 607 + 608 + end = start + npg * PAGE_SIZE - 1; 609 + WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n", 610 + (unsigned long long)pgprot_val(old), 611 + (unsigned long long)pgprot_val(new), 612 + start, end, pfn); 613 + 614 + /* 615 + * For now, allow all permission change attempts by returning the 616 + * attempted permissions. This can 'return old' to actively 617 + * refuse the permission change at a later time. 618 + */ 619 + return new; 620 + } 621 + 622 + /* 583 623 * Lookup the page table entry for a virtual address in a specific pgd. 584 624 * Return a pointer to the entry and the level of the mapping. 585 625 */ ··· 924 884 */ 925 885 new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, 926 886 psize, CPA_DETECT); 887 + 888 + new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages); 927 889 928 890 /* 929 891 * If there is a conflict, split the large page. 
··· 1567 1525 1568 1526 if (level == PG_LEVEL_4K) { 1569 1527 pte_t new_pte; 1528 + pgprot_t old_prot = pte_pgprot(old_pte); 1570 1529 pgprot_t new_prot = pte_pgprot(old_pte); 1571 1530 unsigned long pfn = pte_pfn(old_pte); 1572 1531 ··· 1578 1535 /* Hand in lpsize = 0 to enforce the protection mechanism */ 1579 1536 new_prot = static_protections(new_prot, address, pfn, 1, 0, 1580 1537 CPA_PROTECT); 1538 + 1539 + new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1); 1581 1540 1582 1541 new_prot = pgprot_clear_protnone_bits(new_prot); 1583 1542 ··· 1989 1944 return rc; 1990 1945 } 1991 1946 1992 - static int set_memory_present(unsigned long *addr, int numpages) 1947 + static int set_memory_p(unsigned long *addr, int numpages) 1993 1948 { 1994 1949 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); 1995 1950 } ··· 1999 1954 { 2000 1955 unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); 2001 1956 2002 - return set_memory_present(&addr, 1); 1957 + return set_memory_p(&addr, 1); 2003 1958 } 2004 1959 EXPORT_SYMBOL_GPL(clear_mce_nospec); 2005 1960 #endif /* CONFIG_X86_64 */