Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/mm/64: Make 5-level paging support unconditional

Both Intel and AMD CPUs support 5-level paging, which is expected to
become more widely adopted in the future. All major x86 Linux
distributions have the feature enabled.

Remove CONFIG_X86_5LEVEL and the related #ifdeffery to make the code more readable.

Suggested-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20250516123306.3812286-4-kirill.shutemov@linux.intel.com

authored by

Kirill A. Shutemov and committed by
Ingo Molnar
7212b58d cba5d9b3

+10 -102
+3 -5
Documentation/arch/x86/cpuinfo.rst
··· 173 173 The kernel disabled support for it at compile-time 174 174 -------------------------------------------------- 175 175 176 - For example, if 5-level-paging is not enabled when building (i.e., 177 - CONFIG_X86_5LEVEL is not selected) the flag "la57" will not show up [#f1]_. 176 + For example, if Linear Address Masking (LAM) is not enabled when building (i.e., 177 + CONFIG_ADDRESS_MASKING is not selected) the flag "lam" will not show up. 178 178 Even though the feature will still be detected via CPUID, the kernel disables 179 - it by clearing via setup_clear_cpu_cap(X86_FEATURE_LA57). 179 + it by clearing via setup_clear_cpu_cap(X86_FEATURE_LAM). 180 180 181 181 The feature is disabled at boot-time 182 182 ------------------------------------ ··· 200 200 is disabled since they depend on XSAVE feature. Another example would be broken 201 201 CPUs and them missing microcode patches. Due to that, the kernel decides not to 202 202 enable a feature. 203 - 204 - .. [#f1] 5-level paging uses linear address of 57 bits.
-9
Documentation/arch/x86/x86_64/5level-paging.rst
··· 22 22 Virtual memory layout for 5-level paging is described in 23 23 Documentation/arch/x86/x86_64/mm.rst 24 24 25 - 26 - Enabling 5-level paging 27 - ======================= 28 - CONFIG_X86_5LEVEL=y enables the feature. 29 - 30 - Kernel with CONFIG_X86_5LEVEL=y still able to boot on 4-level hardware. 31 - In this case additional page table level -- p4d -- will be folded at 32 - runtime. 33 - 34 25 User-space and large virtual address space 35 26 ========================================== 36 27 On x86, 5-level paging enables 56-bit userspace virtual address space.
+1 -21
arch/x86/Kconfig
··· 427 427 428 428 config PGTABLE_LEVELS 429 429 int 430 - default 5 if X86_5LEVEL 431 - default 4 if X86_64 430 + default 5 if X86_64 432 431 default 3 if X86_PAE 433 432 default 2 434 433 ··· 1462 1463 larger swapspace support for non-overcommit purposes. It 1463 1464 has the cost of more pagetable lookup overhead, and also 1464 1465 consumes more pagetable space per process. 1465 - 1466 - config X86_5LEVEL 1467 - bool "Enable 5-level page tables support" 1468 - default y 1469 - depends on X86_64 1470 - help 1471 - 5-level paging enables access to larger address space: 1472 - up to 128 PiB of virtual address space and 4 PiB of 1473 - physical address space. 1474 - 1475 - It will be supported by future Intel CPUs. 1476 - 1477 - A kernel with the option enabled can be booted on machines that 1478 - support 4- or 5-level paging. 1479 - 1480 - See Documentation/arch/x86/x86_64/5level-paging.rst for more 1481 - information. 1482 - 1483 - Say N if unsure. 1484 1466 1485 1467 config X86_DIRECT_GBPAGES 1486 1468 def_bool y
-4
arch/x86/Kconfig.cpufeatures
··· 132 132 def_bool y 133 133 depends on !X86_INTEL_MEMORY_PROTECTION_KEYS 134 134 135 - config X86_DISABLED_FEATURE_LA57 136 - def_bool y 137 - depends on !X86_5LEVEL 138 - 139 135 config X86_DISABLED_FEATURE_PTI 140 136 def_bool y 141 137 depends on !MITIGATION_PAGE_TABLE_ISOLATION
+2 -9
arch/x86/boot/compressed/pgtable_64.c
··· 10 10 #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ 11 11 #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ 12 12 13 - #ifdef CONFIG_X86_5LEVEL 14 13 /* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */ 15 14 unsigned int __section(".data") __pgtable_l5_enabled; 16 15 unsigned int __section(".data") pgdir_shift = 39; 17 16 unsigned int __section(".data") ptrs_per_p4d = 1; 18 - #endif 19 17 20 18 /* Buffer to preserve trampoline memory */ 21 19 static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; ··· 112 114 * Check if LA57 is desired and supported. 113 115 * 114 116 * There are several parts to the check: 115 - * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y 116 117 * - if user asked to disable 5-level paging: no5lvl in cmdline 117 118 * - if the machine supports 5-level paging: 118 119 * + CPUID leaf 7 is supported 119 120 * + the leaf has the feature bit set 120 - * 121 - * That's substitute for boot_cpu_has() in early boot code. 122 121 */ 123 - if (IS_ENABLED(CONFIG_X86_5LEVEL) && 124 - !cmdline_find_option_bool("no5lvl") && 125 - native_cpuid_eax(0) >= 7 && 126 - (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { 122 + if (!cmdline_find_option_bool("no5lvl") && 123 + native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & BIT(16))) { 127 124 l5_required = true; 128 125 129 126 /* Initialize variables for 5-level paging */
-4
arch/x86/boot/header.S
··· 361 361 #endif 362 362 363 363 #ifdef CONFIG_X86_64 364 - #ifdef CONFIG_X86_5LEVEL 365 364 #define XLF56 (XLF_5LEVEL|XLF_5LEVEL_ENABLED) 366 - #else 367 - #define XLF56 XLF_5LEVEL 368 - #endif 369 365 #else 370 366 #define XLF56 0 371 367 #endif
+1 -4
arch/x86/boot/startup/map_kernel.c
··· 16 16 17 17 static inline bool check_la57_support(void) 18 18 { 19 - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) 20 - return false; 21 - 22 19 /* 23 20 * 5-level paging is detected and enabled at kernel decompression 24 21 * stage. Only check if it has been enabled there. ··· 126 129 pgd = rip_rel_ptr(early_top_pgt); 127 130 pgd[pgd_index(__START_KERNEL_map)] += load_delta; 128 131 129 - if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) { 132 + if (la57) { 130 133 p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt); 131 134 p4d[MAX_PTRS_PER_P4D - 1] += load_delta; 132 135
-2
arch/x86/entry/vsyscall/vsyscall_64.c
··· 341 341 pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); 342 342 set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); 343 343 p4d = p4d_offset(pgd, VSYSCALL_ADDR); 344 - #if CONFIG_PGTABLE_LEVELS >= 5 345 344 set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); 346 - #endif 347 345 pud = pud_offset(p4d, VSYSCALL_ADDR); 348 346 set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); 349 347 pmd = pmd_offset(pud, VSYSCALL_ADDR);
-2
arch/x86/include/asm/page_64.h
··· 62 62 void copy_page(void *to, void *from); 63 63 KCFI_REFERENCE(copy_page); 64 64 65 - #ifdef CONFIG_X86_5LEVEL 66 65 /* 67 66 * User space process size. This is the first address outside the user range. 68 67 * There are a few constraints that determine this: ··· 92 93 93 94 return ret; 94 95 } 95 - #endif /* CONFIG_X86_5LEVEL */ 96 96 97 97 #endif /* !__ASSEMBLER__ */ 98 98
-7
arch/x86/include/asm/page_64_types.h
··· 48 48 /* See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. */ 49 49 50 50 #define __PHYSICAL_MASK_SHIFT 52 51 - 52 - #ifdef CONFIG_X86_5LEVEL 53 51 #define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47) 54 - /* See task_size_max() in <asm/page_64.h> */ 55 - #else 56 - #define __VIRTUAL_MASK_SHIFT 47 57 - #define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) 58 - #endif 59 52 60 53 #define TASK_SIZE_MAX task_size_max() 61 54 #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
-2
arch/x86/include/asm/pgtable_64.h
··· 41 41 pr_err("%s:%d: bad pud %p(%016lx)\n", \ 42 42 __FILE__, __LINE__, &(e), pud_val(e)) 43 43 44 - #if CONFIG_PGTABLE_LEVELS >= 5 45 44 #define p4d_ERROR(e) \ 46 45 pr_err("%s:%d: bad p4d %p(%016lx)\n", \ 47 46 __FILE__, __LINE__, &(e), p4d_val(e)) 48 - #endif 49 47 50 48 #define pgd_ERROR(e) \ 51 49 pr_err("%s:%d: bad pgd %p(%016lx)\n", \
-18
arch/x86/include/asm/pgtable_64_types.h
··· 23 23 24 24 extern unsigned int __pgtable_l5_enabled; 25 25 26 - #ifdef CONFIG_X86_5LEVEL 27 26 #ifdef USE_EARLY_PGTABLE_L5 28 27 /* 29 28 * cpu_feature_enabled() is not available in early boot code. ··· 36 37 #define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57) 37 38 #endif /* USE_EARLY_PGTABLE_L5 */ 38 39 39 - #else 40 - #define pgtable_l5_enabled() 0 41 - #endif /* CONFIG_X86_5LEVEL */ 42 - 43 40 extern unsigned int pgdir_shift; 44 41 extern unsigned int ptrs_per_p4d; 45 42 46 43 #endif /* !__ASSEMBLER__ */ 47 - 48 - #ifdef CONFIG_X86_5LEVEL 49 44 50 45 /* 51 46 * PGDIR_SHIFT determines what a top-level page table entry can map ··· 57 64 #define P4D_MASK (~(P4D_SIZE - 1)) 58 65 59 66 #define MAX_POSSIBLE_PHYSMEM_BITS 52 60 - 61 - #else /* CONFIG_X86_5LEVEL */ 62 - 63 - /* 64 - * PGDIR_SHIFT determines what a top-level page table entry can map 65 - */ 66 - #define PGDIR_SHIFT 39 67 - #define PTRS_PER_PGD 512 68 - #define MAX_PTRS_PER_P4D 1 69 - 70 - #endif /* CONFIG_X86_5LEVEL */ 71 67 72 68 /* 73 69 * 3rd level page
+1 -1
arch/x86/kernel/alternative.c
··· 590 590 DPRINTK(ALT, "alt table %px, -> %px", start, end); 591 591 592 592 /* 593 - * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using 593 + * KASAN_SHADOW_START is defined using 594 594 * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. 595 595 * During the process, KASAN becomes confused seeing partial LA57 596 596 * conversion and triggers a false-positive out-of-bound report.
-2
arch/x86/kernel/head64.c
··· 51 51 SYM_PIC_ALIAS(next_early_pgt); 52 52 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); 53 53 54 - #ifdef CONFIG_X86_5LEVEL 55 54 unsigned int __pgtable_l5_enabled __ro_after_init; 56 55 unsigned int pgdir_shift __ro_after_init = 39; 57 56 EXPORT_SYMBOL(pgdir_shift); 58 57 unsigned int ptrs_per_p4d __ro_after_init = 1; 59 58 EXPORT_SYMBOL(ptrs_per_p4d); 60 - #endif 61 59 62 60 unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; 63 61 EXPORT_SYMBOL(page_offset_base);
-2
arch/x86/kernel/head_64.S
··· 649 649 SYM_DATA_END(init_top_pgt) 650 650 #endif 651 651 652 - #ifdef CONFIG_X86_5LEVEL 653 652 SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) 654 653 .fill 511,8,0 655 654 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 656 655 SYM_DATA_END(level4_kernel_pgt) 657 656 SYM_PIC_ALIAS(level4_kernel_pgt) 658 - #endif 659 657 660 658 SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) 661 659 .fill L3_START_KERNEL,8,0
-4
arch/x86/mm/init.c
··· 174 174 * randomization is enabled. 175 175 */ 176 176 177 - #ifndef CONFIG_X86_5LEVEL 178 - #define INIT_PGD_PAGE_TABLES 3 179 - #else 180 177 #define INIT_PGD_PAGE_TABLES 4 181 - #endif 182 178 183 179 #ifndef CONFIG_RANDOMIZE_MEMORY 184 180 #define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES)
+1 -1
arch/x86/mm/pgtable.c
··· 592 592 } 593 593 594 594 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP 595 - #ifdef CONFIG_X86_5LEVEL 595 + #if CONFIG_PGTABLE_LEVELS > 4 596 596 /** 597 597 * p4d_set_huge - Set up kernel P4D mapping 598 598 * @p4d: Pointer to the P4D entry
-4
arch/x86/xen/mmu_pv.c
··· 578 578 xen_mc_issue(XEN_LAZY_MMU); 579 579 } 580 580 581 - #if CONFIG_PGTABLE_LEVELS >= 5 582 581 __visible p4dval_t xen_p4d_val(p4d_t p4d) 583 582 { 584 583 return pte_mfn_to_pfn(p4d.p4d); ··· 591 592 return native_make_p4d(p4d); 592 593 } 593 594 PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d); 594 - #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ 595 595 596 596 static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, 597 597 void (*func)(struct mm_struct *mm, struct page *, ··· 2220 2222 .alloc_pud = xen_alloc_pmd_init, 2221 2223 .release_pud = xen_release_pmd_init, 2222 2224 2223 - #if CONFIG_PGTABLE_LEVELS >= 5 2224 2225 .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), 2225 2226 .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), 2226 - #endif 2227 2227 2228 2228 .enter_mmap = xen_enter_mmap, 2229 2229 .exit_mmap = xen_exit_mmap,
+1 -1
drivers/firmware/efi/libstub/x86-5lvl.c
··· 62 62 63 63 void efi_5level_switch(void) 64 64 { 65 - bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl; 65 + bool want_la57 = !efi_no5lvl; 66 66 bool have_la57 = native_read_cr4() & X86_CR4_LA57; 67 67 bool need_toggle = want_la57 ^ have_la57; 68 68 u64 *pgt = (void *)la57_toggle + PAGE_SIZE;