Merge branch 'for-next/mm' into for-next/core

+3 -2

Documentation/admin-guide/kernel-parameters.txt

··· 6405 6405 rodata= [KNL,EARLY] 6406 6406 on Mark read-only kernel memory as read-only (default). 6407 6407 off Leave read-only kernel memory writable for debugging. 6408 - full Mark read-only kernel memory and aliases as read-only 6409 - [arm64] 6408 + noalias Mark read-only kernel memory as read-only but retain 6409 + writable aliases in the direct map for regions outside 6410 + of the kernel image. [arm64] 6410 6411 6411 6412 rockchip.usb_uart 6412 6413 [EARLY]

-14

arch/arm64/Kconfig

··· 1700 1700 When taking an exception from user-space, a sequence of branches 1701 1701 or a firmware call overwrites the branch history. 1702 1702 1703 - config RODATA_FULL_DEFAULT_ENABLED 1704 - bool "Apply r/o permissions of VM areas also to their linear aliases" 1705 - default y 1706 - help 1707 - Apply read-only attributes of VM areas to the linear alias of 1708 - the backing pages as well. This prevents code or read-only data 1709 - from being modified (inadvertently or intentionally) via another 1710 - mapping of the same memory page. This additional enhancement can 1711 - be turned off at runtime by passing rodata=[off|on] (and turned on 1712 - with rodata=full if this option is set to 'n') 1713 - 1714 - This requires the linear region to be mapped down to pages, 1715 - which may adversely affect performance in some cases. 1716 - 1717 1703 config ARM64_SW_TTBR0_PAN 1718 1704 bool "Emulate Privileged Access Never using TTBR0_EL1 switching" 1719 1705 depends on !KCSAN

+2

arch/arm64/include/asm/cpufeature.h

··· 871 871 return cpus_have_final_cap(ARM64_HAS_PMUV3); 872 872 } 873 873 874 + bool cpu_supports_bbml2_noabort(void); 875 + 874 876 static inline bool system_supports_bbml2_noabort(void) 875 877 { 876 878 return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT);

+3

arch/arm64/include/asm/mmu.h

··· 78 78 pgprot_t prot, bool page_mappings_only); 79 79 extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot); 80 80 extern void mark_linear_text_alias_ro(void); 81 + extern int split_kernel_leaf_mapping(unsigned long start, unsigned long end); 82 + extern void init_idmap_kpti_bbml2_flag(void); 83 + extern void linear_map_maybe_split_to_ptes(void); 81 84 82 85 /* 83 86 * This check is triggered during the early boot before the cpufeature

+5

arch/arm64/include/asm/pgtable.h

··· 371 371 return __pmd(pmd_val(pmd) | PMD_SECT_CONT); 372 372 } 373 373 374 + static inline pmd_t pmd_mknoncont(pmd_t pmd) 375 + { 376 + return __pmd(pmd_val(pmd) & ~PMD_SECT_CONT); 377 + } 378 + 374 379 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP 375 380 static inline int pte_uffd_wp(pte_t pte) 376 381 {

+2

arch/arm64/include/asm/ptdump.h

··· 7 7 8 8 #include <linux/ptdump.h> 9 9 10 + DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); 11 + 10 12 #ifdef CONFIG_PTDUMP 11 13 12 14 #include <linux/mm_types.h>

+2 -2

arch/arm64/include/asm/setup.h

··· 21 21 if (!arg) 22 22 return false; 23 23 24 - if (!strcmp(arg, "full")) { 24 + if (!strcmp(arg, "on")) { 25 25 rodata_enabled = rodata_full = true; 26 26 return true; 27 27 } ··· 31 31 return true; 32 32 } 33 33 34 - if (!strcmp(arg, "on")) { 34 + if (!strcmp(arg, "noalias")) { 35 35 rodata_enabled = true; 36 36 rodata_full = false; 37 37 return true;

+2 -7

arch/arm64/include/asm/vmalloc.h

··· 9 9 #define arch_vmap_pud_supported arch_vmap_pud_supported 10 10 static inline bool arch_vmap_pud_supported(pgprot_t prot) 11 11 { 12 - /* 13 - * SW table walks can't handle removal of intermediate entries. 14 - */ 15 - return pud_sect_supported() && 16 - !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); 12 + return pud_sect_supported(); 17 13 } 18 14 19 15 #define arch_vmap_pmd_supported arch_vmap_pmd_supported 20 16 static inline bool arch_vmap_pmd_supported(pgprot_t prot) 21 17 { 22 - /* See arch_vmap_pud_supported() */ 23 - return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); 18 + return true; 24 19 } 25 20 26 21 #define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size

+9 -1

arch/arm64/kernel/cpufeature.c

··· 86 86 #include <asm/kvm_host.h> 87 87 #include <asm/mmu.h> 88 88 #include <asm/mmu_context.h> 89 + #include <asm/mmu.h> 89 90 #include <asm/mte.h> 90 91 #include <asm/hypervisor.h> 91 92 #include <asm/processor.h> ··· 2030 2029 if (arm64_use_ng_mappings) 2031 2030 return; 2032 2031 2032 + init_idmap_kpti_bbml2_flag(); 2033 2033 stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask); 2034 2034 } 2035 2035 ··· 2221 2219 return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE); 2222 2220 } 2223 2221 2224 - static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) 2222 + bool cpu_supports_bbml2_noabort(void) 2225 2223 { 2226 2224 /* 2227 2225 * We want to allow usage of BBML2 in as wide a range of kernel contexts ··· 2255 2253 */ 2256 2254 2257 2255 return true; 2256 + } 2257 + 2258 + static bool has_bbml2_noabort(const struct arm64_cpu_capabilities *caps, int scope) 2259 + { 2260 + return cpu_supports_bbml2_noabort(); 2258 2261 } 2259 2262 2260 2263 #ifdef CONFIG_ARM64_PAN ··· 3937 3930 { 3938 3931 setup_system_capabilities(); 3939 3932 3933 + linear_map_maybe_split_to_ptes(); 3940 3934 kpti_install_ng_mappings(); 3941 3935 3942 3936 sve_setup();

+27 -20

arch/arm64/kernel/pi/map_kernel.c

··· 18 18 19 19 extern const u8 __eh_frame_start[], __eh_frame_end[]; 20 20 21 - extern void idmap_cpu_replace_ttbr1(void *pgdir); 21 + extern void idmap_cpu_replace_ttbr1(phys_addr_t pgdir); 22 22 23 - static void __init map_segment(pgd_t *pg_dir, u64 *pgd, u64 va_offset, 23 + static void __init map_segment(pgd_t *pg_dir, phys_addr_t *pgd, u64 va_offset, 24 24 void *start, void *end, pgprot_t prot, 25 25 bool may_use_cont, int root_level) 26 26 { ··· 40 40 { 41 41 bool enable_scs = IS_ENABLED(CONFIG_UNWIND_PATCH_PAC_INTO_SCS); 42 42 bool twopass = IS_ENABLED(CONFIG_RELOCATABLE); 43 - u64 pgdp = (u64)init_pg_dir + PAGE_SIZE; 43 + phys_addr_t pgdp = (phys_addr_t)init_pg_dir + PAGE_SIZE; 44 44 pgprot_t text_prot = PAGE_KERNEL_ROX; 45 45 pgprot_t data_prot = PAGE_KERNEL; 46 46 pgprot_t prot; ··· 78 78 twopass |= enable_scs; 79 79 prot = twopass ? data_prot : text_prot; 80 80 81 + /* 82 + * [_text, _stext) isn't executed after boot and contains some 83 + * non-executable, unpredictable data, so map it non-executable. 
84 + */ 85 + map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot, 86 + false, root_level); 81 87 map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, 82 88 !twopass, root_level); 83 89 map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, ··· 96 90 true, root_level); 97 91 dsb(ishst); 98 92 99 - idmap_cpu_replace_ttbr1(init_pg_dir); 93 + idmap_cpu_replace_ttbr1((phys_addr_t)init_pg_dir); 100 94 101 95 if (twopass) { 102 96 if (IS_ENABLED(CONFIG_RELOCATABLE)) ··· 135 129 /* Copy the root page table to its final location */ 136 130 memcpy((void *)swapper_pg_dir + va_offset, init_pg_dir, PAGE_SIZE); 137 131 dsb(ishst); 138 - idmap_cpu_replace_ttbr1(swapper_pg_dir); 132 + idmap_cpu_replace_ttbr1((phys_addr_t)swapper_pg_dir); 139 133 } 140 134 141 - static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr) 135 + static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(phys_addr_t ttbr) 142 136 { 143 137 u64 sctlr = read_sysreg(sctlr_el1); 144 138 u64 tcr = read_sysreg(tcr_el1) | TCR_DS; ··· 178 172 */ 179 173 create_init_idmap(init_pg_dir, mask); 180 174 dsb(ishst); 181 - set_ttbr0_for_lpa2((u64)init_pg_dir); 175 + set_ttbr0_for_lpa2((phys_addr_t)init_pg_dir); 182 176 183 177 /* 184 178 * Recreate the initial ID map with the same granularity as before. 185 179 * Don't bother with the FDT, we no longer need it after this. 
186 180 */ 187 181 memset(init_idmap_pg_dir, 0, 188 - (u64)init_idmap_pg_end - (u64)init_idmap_pg_dir); 182 + (char *)init_idmap_pg_end - (char *)init_idmap_pg_dir); 189 183 190 184 create_init_idmap(init_idmap_pg_dir, mask); 191 185 dsb(ishst); 192 186 193 187 /* switch back to the updated initial ID map */ 194 - set_ttbr0_for_lpa2((u64)init_idmap_pg_dir); 188 + set_ttbr0_for_lpa2((phys_addr_t)init_idmap_pg_dir); 195 189 196 190 /* wipe the temporary ID map from memory */ 197 - memset(init_pg_dir, 0, (u64)init_pg_end - (u64)init_pg_dir); 191 + memset(init_pg_dir, 0, (char *)init_pg_end - (char *)init_pg_dir); 198 192 } 199 193 200 - static void __init map_fdt(u64 fdt) 194 + static void *__init map_fdt(phys_addr_t fdt) 201 195 { 202 196 static u8 ptes[INIT_IDMAP_FDT_SIZE] __initdata __aligned(PAGE_SIZE); 203 - u64 efdt = fdt + MAX_FDT_SIZE; 204 - u64 ptep = (u64)ptes; 197 + phys_addr_t efdt = fdt + MAX_FDT_SIZE; 198 + phys_addr_t ptep = (phys_addr_t)ptes; /* We're idmapped when called */ 205 199 206 200 /* 207 201 * Map up to MAX_FDT_SIZE bytes, but avoid overlap with ··· 211 205 fdt, PAGE_KERNEL, IDMAP_ROOT_LEVEL, 212 206 (pte_t *)init_idmap_pg_dir, false, 0); 213 207 dsb(ishst); 208 + 209 + return (void *)fdt; 214 210 } 215 211 216 212 /* ··· 238 230 return true; 239 231 } 240 232 241 - asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) 233 + asmlinkage void __init early_map_kernel(u64 boot_status, phys_addr_t fdt) 242 234 { 243 235 static char const chosen_str[] __initconst = "/chosen"; 244 236 u64 va_base, pa_base = (u64)&_text; ··· 246 238 int root_level = 4 - CONFIG_PGTABLE_LEVELS; 247 239 int va_bits = VA_BITS; 248 240 int chosen; 249 - 250 - map_fdt((u64)fdt); 241 + void *fdt_mapped = map_fdt(fdt); 251 242 252 243 /* Clear BSS and the initial page tables */ 253 - memset(__bss_start, 0, (u64)init_pg_end - (u64)__bss_start); 244 + memset(__bss_start, 0, (char *)init_pg_end - (char *)__bss_start); 254 245 255 246 /* Parse the command line for 
CPU feature overrides */ 256 - chosen = fdt_path_offset(fdt, chosen_str); 257 - init_feature_override(boot_status, fdt, chosen); 247 + chosen = fdt_path_offset(fdt_mapped, chosen_str); 248 + init_feature_override(boot_status, fdt_mapped, chosen); 258 249 259 250 if (IS_ENABLED(CONFIG_ARM64_64K_PAGES) && !cpu_has_lva()) { 260 251 va_bits = VA_BITS_MIN; ··· 273 266 * fill in the high bits from the seed. 274 267 */ 275 268 if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { 276 - u64 kaslr_seed = kaslr_early_init(fdt, chosen); 269 + u64 kaslr_seed = kaslr_early_init(fdt_mapped, chosen); 277 270 278 271 if (kaslr_seed && kaslr_requires_kpti()) 279 272 arm64_use_ng_mappings = ng_mappings_allowed();

+12 -8

arch/arm64/kernel/pi/map_range.c

··· 26 26 * @va_offset: Offset between a physical page and its current mapping 27 27 * in the VA space 28 28 */ 29 - void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, 30 - int level, pte_t *tbl, bool may_use_cont, u64 va_offset) 29 + void __init map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, 30 + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, 31 + u64 va_offset) 31 32 { 32 33 u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; 33 34 ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; ··· 88 87 } 89 88 } 90 89 91 - asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) 90 + asmlinkage phys_addr_t __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask) 92 91 { 93 - u64 ptep = (u64)pg_dir + PAGE_SIZE; 92 + phys_addr_t ptep = (phys_addr_t)pg_dir + PAGE_SIZE; /* MMU is off */ 94 93 pgprot_t text_prot = PAGE_KERNEL_ROX; 95 94 pgprot_t data_prot = PAGE_KERNEL; 96 95 97 96 pgprot_val(text_prot) &= ~clrmask; 98 97 pgprot_val(data_prot) &= ~clrmask; 99 98 100 - map_range(&ptep, (u64)_stext, (u64)__initdata_begin, (u64)_stext, 101 - text_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); 102 - map_range(&ptep, (u64)__initdata_begin, (u64)_end, (u64)__initdata_begin, 103 - data_prot, IDMAP_ROOT_LEVEL, (pte_t *)pg_dir, false, 0); 99 + /* MMU is off; pointer casts to phys_addr_t are safe */ 100 + map_range(&ptep, (u64)_stext, (u64)__initdata_begin, 101 + (phys_addr_t)_stext, text_prot, IDMAP_ROOT_LEVEL, 102 + (pte_t *)pg_dir, false, 0); 103 + map_range(&ptep, (u64)__initdata_begin, (u64)_end, 104 + (phys_addr_t)__initdata_begin, data_prot, IDMAP_ROOT_LEVEL, 105 + (pte_t *)pg_dir, false, 0); 104 106 105 107 return ptep; 106 108 }

+5 -4

arch/arm64/kernel/pi/pi.h

··· 29 29 void relocate_kernel(u64 offset); 30 30 int scs_patch(const u8 eh_frame[], int size); 31 31 32 - void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, 33 - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); 32 + void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, 33 + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, 34 + u64 va_offset); 34 35 35 - asmlinkage void early_map_kernel(u64 boot_status, void *fdt); 36 + asmlinkage void early_map_kernel(u64 boot_status, phys_addr_t fdt); 36 37 37 - asmlinkage u64 create_init_idmap(pgd_t *pgd, ptdesc_t clrmask); 38 + asmlinkage phys_addr_t create_init_idmap(pgd_t *pgd, ptdesc_t clrmask);

+2 -2

arch/arm64/kernel/setup.c

··· 214 214 unsigned long i = 0; 215 215 size_t res_size; 216 216 217 - kernel_code.start = __pa_symbol(_stext); 217 + kernel_code.start = __pa_symbol(_text); 218 218 kernel_code.end = __pa_symbol(__init_begin - 1); 219 219 kernel_data.start = __pa_symbol(_sdata); 220 220 kernel_data.end = __pa_symbol(_end - 1); ··· 280 280 281 281 void __init __no_sanitize_address setup_arch(char **cmdline_p) 282 282 { 283 - setup_initial_init_mm(_stext, _etext, _edata, _end); 283 + setup_initial_init_mm(_text, _etext, _edata, _end); 284 284 285 285 *cmdline_p = boot_command_line; 286 286

+4 -4

arch/arm64/mm/init.c

··· 243 243 */ 244 244 if (memory_limit != PHYS_ADDR_MAX) { 245 245 memblock_mem_limit_remove_map(memory_limit); 246 - memblock_add(__pa_symbol(_text), (u64)(_end - _text)); 246 + memblock_add(__pa_symbol(_text), (resource_size_t)(_end - _text)); 247 247 } 248 248 249 249 if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { ··· 252 252 * initrd to become inaccessible via the linear mapping. 253 253 * Otherwise, this is a no-op 254 254 */ 255 - u64 base = phys_initrd_start & PAGE_MASK; 256 - u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; 255 + phys_addr_t base = phys_initrd_start & PAGE_MASK; 256 + resource_size_t size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; 257 257 258 258 /* 259 259 * We can only add back the initrd memory if we don't end up ··· 279 279 * Register the kernel text, kernel data, initrd, and initial 280 280 * pagetables with memblock. 281 281 */ 282 - memblock_reserve(__pa_symbol(_stext), _end - _stext); 282 + memblock_reserve(__pa_symbol(_text), _end - _text); 283 283 if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { 284 284 /* the generic initrd code expects virtual addresses */ 285 285 initrd_start = __phys_to_virt(phys_initrd_start);

+465 -29

arch/arm64/mm/mmu.c

··· 27 27 #include <linux/kfence.h> 28 28 #include <linux/pkeys.h> 29 29 #include <linux/mm_inline.h> 30 + #include <linux/pagewalk.h> 31 + #include <linux/stop_machine.h> 30 32 31 33 #include <asm/barrier.h> 32 34 #include <asm/cputype.h> ··· 48 46 #define NO_BLOCK_MAPPINGS BIT(0) 49 47 #define NO_CONT_MAPPINGS BIT(1) 50 48 #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ 49 + 50 + DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); 51 51 52 52 u64 kimage_voffset __ro_after_init; 53 53 EXPORT_SYMBOL(kimage_voffset); ··· 478 474 int flags); 479 475 #endif 480 476 481 - static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, 477 + #define INVALID_PHYS_ADDR (-1ULL) 478 + 479 + static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, 482 480 enum pgtable_type pgtable_type) 483 481 { 484 482 /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ 485 - struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0); 483 + struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0); 486 484 phys_addr_t pa; 487 485 488 - BUG_ON(!ptdesc); 486 + if (!ptdesc) 487 + return INVALID_PHYS_ADDR; 488 + 489 489 pa = page_to_phys(ptdesc_page(ptdesc)); 490 490 491 491 switch (pgtable_type) { ··· 510 502 return pa; 511 503 } 512 504 505 + static phys_addr_t 506 + try_pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type, gfp_t gfp) 507 + { 508 + return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type); 509 + } 510 + 513 511 static phys_addr_t __maybe_unused 514 512 pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) 515 513 { 516 - return __pgd_pgtable_alloc(&init_mm, pgtable_type); 514 + phys_addr_t pa; 515 + 516 + pa = __pgd_pgtable_alloc(&init_mm, GFP_PGTABLE_KERNEL, pgtable_type); 517 + BUG_ON(pa == INVALID_PHYS_ADDR); 518 + return pa; 517 519 } 518 520 519 521 static phys_addr_t 520 522 pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) 521 523 { 522 - return __pgd_pgtable_alloc(NULL, 
pgtable_type); 524 + phys_addr_t pa; 525 + 526 + pa = __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type); 527 + BUG_ON(pa == INVALID_PHYS_ADDR); 528 + return pa; 529 + } 530 + 531 + static void split_contpte(pte_t *ptep) 532 + { 533 + int i; 534 + 535 + ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); 536 + for (i = 0; i < CONT_PTES; i++, ptep++) 537 + __set_pte(ptep, pte_mknoncont(__ptep_get(ptep))); 538 + } 539 + 540 + static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) 541 + { 542 + pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; 543 + unsigned long pfn = pmd_pfn(pmd); 544 + pgprot_t prot = pmd_pgprot(pmd); 545 + phys_addr_t pte_phys; 546 + pte_t *ptep; 547 + int i; 548 + 549 + pte_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PTE, gfp); 550 + if (pte_phys == INVALID_PHYS_ADDR) 551 + return -ENOMEM; 552 + ptep = (pte_t *)phys_to_virt(pte_phys); 553 + 554 + if (pgprot_val(prot) & PMD_SECT_PXN) 555 + tableprot |= PMD_TABLE_PXN; 556 + 557 + prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); 558 + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); 559 + if (to_cont) 560 + prot = __pgprot(pgprot_val(prot) | PTE_CONT); 561 + 562 + for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++) 563 + __set_pte(ptep, pfn_pte(pfn, prot)); 564 + 565 + /* 566 + * Ensure the pte entries are visible to the table walker by the time 567 + * the pmd entry that points to the ptes is visible. 
568 + */ 569 + dsb(ishst); 570 + __pmd_populate(pmdp, pte_phys, tableprot); 571 + 572 + return 0; 573 + } 574 + 575 + static void split_contpmd(pmd_t *pmdp) 576 + { 577 + int i; 578 + 579 + pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS); 580 + for (i = 0; i < CONT_PMDS; i++, pmdp++) 581 + set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp))); 582 + } 583 + 584 + static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) 585 + { 586 + pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; 587 + unsigned int step = PMD_SIZE >> PAGE_SHIFT; 588 + unsigned long pfn = pud_pfn(pud); 589 + pgprot_t prot = pud_pgprot(pud); 590 + phys_addr_t pmd_phys; 591 + pmd_t *pmdp; 592 + int i; 593 + 594 + pmd_phys = try_pgd_pgtable_alloc_init_mm(TABLE_PMD, gfp); 595 + if (pmd_phys == INVALID_PHYS_ADDR) 596 + return -ENOMEM; 597 + pmdp = (pmd_t *)phys_to_virt(pmd_phys); 598 + 599 + if (pgprot_val(prot) & PMD_SECT_PXN) 600 + tableprot |= PUD_TABLE_PXN; 601 + 602 + prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); 603 + prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); 604 + if (to_cont) 605 + prot = __pgprot(pgprot_val(prot) | PTE_CONT); 606 + 607 + for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step) 608 + set_pmd(pmdp, pfn_pmd(pfn, prot)); 609 + 610 + /* 611 + * Ensure the pmd entries are visible to the table walker by the time 612 + * the pud entry that points to the pmds is visible. 613 + */ 614 + dsb(ishst); 615 + __pud_populate(pudp, pmd_phys, tableprot); 616 + 617 + return 0; 618 + } 619 + 620 + static int split_kernel_leaf_mapping_locked(unsigned long addr) 621 + { 622 + pgd_t *pgdp, pgd; 623 + p4d_t *p4dp, p4d; 624 + pud_t *pudp, pud; 625 + pmd_t *pmdp, pmd; 626 + pte_t *ptep, pte; 627 + int ret = 0; 628 + 629 + /* 630 + * PGD: If addr is PGD aligned then addr already describes a leaf 631 + * boundary. If not present then there is nothing to split. 
632 + */ 633 + if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr) 634 + goto out; 635 + pgdp = pgd_offset_k(addr); 636 + pgd = pgdp_get(pgdp); 637 + if (!pgd_present(pgd)) 638 + goto out; 639 + 640 + /* 641 + * P4D: If addr is P4D aligned then addr already describes a leaf 642 + * boundary. If not present then there is nothing to split. 643 + */ 644 + if (ALIGN_DOWN(addr, P4D_SIZE) == addr) 645 + goto out; 646 + p4dp = p4d_offset(pgdp, addr); 647 + p4d = p4dp_get(p4dp); 648 + if (!p4d_present(p4d)) 649 + goto out; 650 + 651 + /* 652 + * PUD: If addr is PUD aligned then addr already describes a leaf 653 + * boundary. If not present then there is nothing to split. Otherwise, 654 + * if we have a pud leaf, split to contpmd. 655 + */ 656 + if (ALIGN_DOWN(addr, PUD_SIZE) == addr) 657 + goto out; 658 + pudp = pud_offset(p4dp, addr); 659 + pud = pudp_get(pudp); 660 + if (!pud_present(pud)) 661 + goto out; 662 + if (pud_leaf(pud)) { 663 + ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true); 664 + if (ret) 665 + goto out; 666 + } 667 + 668 + /* 669 + * CONTPMD: If addr is CONTPMD aligned then addr already describes a 670 + * leaf boundary. If not present then there is nothing to split. 671 + * Otherwise, if we have a contpmd leaf, split to pmd. 672 + */ 673 + if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr) 674 + goto out; 675 + pmdp = pmd_offset(pudp, addr); 676 + pmd = pmdp_get(pmdp); 677 + if (!pmd_present(pmd)) 678 + goto out; 679 + if (pmd_leaf(pmd)) { 680 + if (pmd_cont(pmd)) 681 + split_contpmd(pmdp); 682 + /* 683 + * PMD: If addr is PMD aligned then addr already describes a 684 + * leaf boundary. Otherwise, split to contpte. 685 + */ 686 + if (ALIGN_DOWN(addr, PMD_SIZE) == addr) 687 + goto out; 688 + ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true); 689 + if (ret) 690 + goto out; 691 + } 692 + 693 + /* 694 + * CONTPTE: If addr is CONTPTE aligned then addr already describes a 695 + * leaf boundary. If not present then there is nothing to split. 
696 + * Otherwise, if we have a contpte leaf, split to pte. 697 + */ 698 + if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr) 699 + goto out; 700 + ptep = pte_offset_kernel(pmdp, addr); 701 + pte = __ptep_get(ptep); 702 + if (!pte_present(pte)) 703 + goto out; 704 + if (pte_cont(pte)) 705 + split_contpte(ptep); 706 + 707 + out: 708 + return ret; 709 + } 710 + 711 + static DEFINE_MUTEX(pgtable_split_lock); 712 + 713 + int split_kernel_leaf_mapping(unsigned long start, unsigned long end) 714 + { 715 + int ret; 716 + 717 + /* 718 + * !BBML2_NOABORT systems should not be trying to change permissions on 719 + * anything that is not pte-mapped in the first place. Just return early 720 + * and let the permission change code raise a warning if not already 721 + * pte-mapped. 722 + */ 723 + if (!system_supports_bbml2_noabort()) 724 + return 0; 725 + 726 + /* 727 + * Ensure start and end are at least page-aligned since this is the 728 + * finest granularity we can split to. 729 + */ 730 + if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end)) 731 + return -EINVAL; 732 + 733 + mutex_lock(&pgtable_split_lock); 734 + arch_enter_lazy_mmu_mode(); 735 + 736 + /* 737 + * split_kernel_leaf_mapping_locked() may sleep; this is not a 738 + * problem for ARM64 since ARM64's lazy MMU implementation allows 739 + * sleeping. 740 + * 741 + * Optimize for the common case of splitting out a single page from a 742 + * larger mapping. Here we can just split on the "least aligned" of 743 + * start and end and this will guarantee that there must also be a split 744 + * on the more aligned address since both addresses must be in the 745 + * same contpte block and it must have been split to ptes. 746 + */ 747 + if (end - start == PAGE_SIZE) { 748 + start = __ffs(start) < __ffs(end) ? 
start : end; 749 + ret = split_kernel_leaf_mapping_locked(start); 750 + } else { 751 + ret = split_kernel_leaf_mapping_locked(start); 752 + if (!ret) 753 + ret = split_kernel_leaf_mapping_locked(end); 754 + } 755 + 756 + arch_leave_lazy_mmu_mode(); 757 + mutex_unlock(&pgtable_split_lock); 758 + return ret; 759 + } 760 + 761 + static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, 762 + unsigned long next, 763 + struct mm_walk *walk) 764 + { 765 + pud_t pud = pudp_get(pudp); 766 + int ret = 0; 767 + 768 + if (pud_leaf(pud)) 769 + ret = split_pud(pudp, pud, GFP_ATOMIC, false); 770 + 771 + return ret; 772 + } 773 + 774 + static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, 775 + unsigned long next, 776 + struct mm_walk *walk) 777 + { 778 + pmd_t pmd = pmdp_get(pmdp); 779 + int ret = 0; 780 + 781 + if (pmd_leaf(pmd)) { 782 + if (pmd_cont(pmd)) 783 + split_contpmd(pmdp); 784 + ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false); 785 + 786 + /* 787 + * We have split the pmd directly to ptes so there is no need to 788 + * visit each pte to check if they are contpte. 789 + */ 790 + walk->action = ACTION_CONTINUE; 791 + } 792 + 793 + return ret; 794 + } 795 + 796 + static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, 797 + unsigned long next, 798 + struct mm_walk *walk) 799 + { 800 + pte_t pte = __ptep_get(ptep); 801 + 802 + if (pte_cont(pte)) 803 + split_contpte(ptep); 804 + 805 + return 0; 806 + } 807 + 808 + static const struct mm_walk_ops split_to_ptes_ops __initconst = { 809 + .pud_entry = split_to_ptes_pud_entry, 810 + .pmd_entry = split_to_ptes_pmd_entry, 811 + .pte_entry = split_to_ptes_pte_entry, 812 + }; 813 + 814 + static bool linear_map_requires_bbml2 __initdata; 815 + 816 + u32 idmap_kpti_bbml2_flag; 817 + 818 + void __init init_idmap_kpti_bbml2_flag(void) 819 + { 820 + WRITE_ONCE(idmap_kpti_bbml2_flag, 1); 821 + /* Must be visible to other CPUs before stop_machine() is called. 
*/ 822 + smp_mb(); 823 + } 824 + 825 + static int __init linear_map_split_to_ptes(void *__unused) 826 + { 827 + /* 828 + * Repainting the linear map must be done by CPU0 (the boot CPU) because 829 + * that's the only CPU that we know supports BBML2. The other CPUs will 830 + * be held in a waiting area with the idmap active. 831 + */ 832 + if (!smp_processor_id()) { 833 + unsigned long lstart = _PAGE_OFFSET(vabits_actual); 834 + unsigned long lend = PAGE_END; 835 + unsigned long kstart = (unsigned long)lm_alias(_stext); 836 + unsigned long kend = (unsigned long)lm_alias(__init_begin); 837 + int ret; 838 + 839 + /* 840 + * Wait for all secondary CPUs to be put into the waiting area. 841 + */ 842 + smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus()); 843 + 844 + /* 845 + * Walk all of the linear map [lstart, lend), except the kernel 846 + * linear map alias [kstart, kend), and split all mappings to 847 + * PTE. The kernel alias remains static throughout runtime so 848 + * can continue to be safely mapped with large mappings. 849 + */ 850 + ret = walk_kernel_page_table_range_lockless(lstart, kstart, 851 + &split_to_ptes_ops, NULL, NULL); 852 + if (!ret) 853 + ret = walk_kernel_page_table_range_lockless(kend, lend, 854 + &split_to_ptes_ops, NULL, NULL); 855 + if (ret) 856 + panic("Failed to split linear map\n"); 857 + flush_tlb_kernel_range(lstart, lend); 858 + 859 + /* 860 + * Relies on dsb in flush_tlb_kernel_range() to avoid reordering 861 + * before any page table split operations. 862 + */ 863 + WRITE_ONCE(idmap_kpti_bbml2_flag, 0); 864 + } else { 865 + typedef void (wait_split_fn)(void); 866 + extern wait_split_fn wait_linear_map_split_to_ptes; 867 + wait_split_fn *wait_fn; 868 + 869 + wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes); 870 + 871 + /* 872 + * At least one secondary CPU doesn't support BBML2 so cannot 873 + * tolerate the size of the live mappings changing. 
So have the 874 + * secondary CPUs wait for the boot CPU to make the changes 875 + * with the idmap active and init_mm inactive. 876 + */ 877 + cpu_install_idmap(); 878 + wait_fn(); 879 + cpu_uninstall_idmap(); 880 + } 881 + 882 + return 0; 883 + } 884 + 885 + void __init linear_map_maybe_split_to_ptes(void) 886 + { 887 + if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) { 888 + init_idmap_kpti_bbml2_flag(); 889 + stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask); 890 + } 523 891 } 524 892 525 893 /* ··· 958 574 /* 959 575 * Remove the write permissions from the linear alias of .text/.rodata 960 576 */ 961 - update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), 962 - (unsigned long)__init_begin - (unsigned long)_stext, 577 + update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), 578 + (unsigned long)__init_begin - (unsigned long)_text, 963 579 PAGE_KERNEL_RO); 964 580 } 965 581 ··· 1017 633 1018 634 #endif /* CONFIG_KFENCE */ 1019 635 636 + static inline bool force_pte_mapping(void) 637 + { 638 + bool bbml2 = system_capabilities_finalized() ? 
639 + system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); 640 + 641 + return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() || 642 + is_realm_world())) || 643 + debug_pagealloc_enabled(); 644 + } 645 + 1020 646 static void __init map_mem(pgd_t *pgdp) 1021 647 { 1022 648 static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); 1023 - phys_addr_t kernel_start = __pa_symbol(_stext); 649 + phys_addr_t kernel_start = __pa_symbol(_text); 1024 650 phys_addr_t kernel_end = __pa_symbol(__init_begin); 1025 651 phys_addr_t start, end; 1026 652 phys_addr_t early_kfence_pool; ··· 1052 658 1053 659 early_kfence_pool = arm64_kfence_alloc_pool(); 1054 660 1055 - if (can_set_direct_map()) 661 + linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map(); 662 + 663 + if (force_pte_mapping()) 1056 664 flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; 1057 665 1058 666 /* ··· 1079 683 } 1080 684 1081 685 /* 1082 - * Map the linear alias of the [_stext, __init_begin) interval 686 + * Map the linear alias of the [_text, __init_begin) interval 1083 687 * as non-executable now, and remove the write permission in 1084 688 * mark_linear_text_alias_ro() below (which will be called after 1085 689 * alternative patching has completed). This makes the contents ··· 1106 710 WRITE_ONCE(rodata_is_rw, false); 1107 711 update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, 1108 712 section_size, PAGE_KERNEL_RO); 713 + /* mark the range between _text and _stext as read only. 
*/ 714 + update_mapping_prot(__pa_symbol(_text), (unsigned long)_text, 715 + (unsigned long)_stext - (unsigned long)_text, 716 + PAGE_KERNEL_RO); 1109 717 } 1110 718 1111 719 static void __init declare_vma(struct vm_struct *vma, ··· 1180 780 { 1181 781 static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT]; 1182 782 1183 - declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD); 783 + declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD); 1184 784 declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD); 1185 785 declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD); 1186 786 declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD); 1187 787 declare_vma(&vmlinux_seg[4], _data, _end, 0); 1188 788 } 1189 789 1190 - void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot, 1191 - int level, pte_t *tbl, bool may_use_cont, u64 va_offset); 790 + void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, 791 + pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, 792 + u64 va_offset); 1192 793 1193 794 static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, 1194 - kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; 795 + kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; 1195 796 1196 797 static void __init create_idmap(void) 1197 798 { 1198 - u64 start = __pa_symbol(__idmap_text_start); 1199 - u64 end = __pa_symbol(__idmap_text_end); 1200 - u64 ptep = __pa_symbol(idmap_ptes); 799 + phys_addr_t start = __pa_symbol(__idmap_text_start); 800 + phys_addr_t end = __pa_symbol(__idmap_text_end); 801 + phys_addr_t ptep = __pa_symbol(idmap_ptes); 1201 802 1202 803 __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX, 1203 804 IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, 1204 805 __phys_to_virt(ptep) - ptep); 1205 806 1206 - if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && 
!arm64_use_ng_mappings) { 1207 - extern u32 __idmap_kpti_flag; 1208 - u64 pa = __pa_symbol(&__idmap_kpti_flag); 807 + if (linear_map_requires_bbml2 || 808 + (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) { 809 + phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag); 1209 810 1210 811 /* 1211 812 * The KPTI G-to-nG conversion code needs a read-write mapping 1212 - * of its synchronization flag in the ID map. 813 + * of its synchronization flag in the ID map. This is also used 814 + * when splitting the linear map to ptes if a secondary CPU 815 + * doesn't support bbml2. 1213 816 */ 1214 - ptep = __pa_symbol(kpti_ptes); 817 + ptep = __pa_symbol(kpti_bbml2_ptes); 1215 818 __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL, 1216 819 IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, 1217 820 __phys_to_virt(ptep) - ptep); ··· 1664 1261 return 1; 1665 1262 } 1666 1263 1667 - int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) 1264 + static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr, 1265 + bool acquire_mmap_lock) 1668 1266 { 1669 1267 pte_t *table; 1670 1268 pmd_t pmd; ··· 1677 1273 return 1; 1678 1274 } 1679 1275 1276 + /* See comment in pud_free_pmd_page for static key logic */ 1680 1277 table = pte_offset_kernel(pmdp, addr); 1681 1278 pmd_clear(pmdp); 1682 1279 __flush_tlb_kernel_pgtable(addr); 1280 + if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) { 1281 + mmap_read_lock(&init_mm); 1282 + mmap_read_unlock(&init_mm); 1283 + } 1284 + 1683 1285 pte_free_kernel(NULL, table); 1684 1286 return 1; 1287 + } 1288 + 1289 + int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) 1290 + { 1291 + /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */ 1292 + return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true); 1685 1293 } 1686 1294 1687 1295 int pud_free_pmd_page(pud_t *pudp, unsigned long addr) ··· 1711 1295 } 1712 1296 1713 1297 table = pmd_offset(pudp, addr); 1298 + 1299 + /* 1300 
+ * Our objective is to prevent ptdump from reading a PMD table which has 1301 + * been freed. In this race, if pud_free_pmd_page observes the key on 1302 + * (which got flipped by ptdump) then the mmap lock sequence here will, 1303 + * as a result of the mmap write lock/unlock sequence in ptdump, give 1304 + * us the correct synchronization. If not, this means that ptdump has 1305 + * not yet started walking the pagetables - the sequence of barriers 1306 + * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will 1307 + * observe an empty PUD. 1308 + */ 1309 + pud_clear(pudp); 1310 + __flush_tlb_kernel_pgtable(addr); 1311 + if (static_branch_unlikely(&arm64_ptdump_lock_key)) { 1312 + mmap_read_lock(&init_mm); 1313 + mmap_read_unlock(&init_mm); 1314 + } 1315 + 1714 1316 pmdp = table; 1715 1317 next = addr; 1716 1318 end = addr + PUD_SIZE; 1717 1319 do { 1718 1320 if (pmd_present(pmdp_get(pmdp))) 1719 - pmd_free_pte_page(pmdp, next); 1321 + /* 1322 + * PMD has been isolated, so ptdump won't see it. No 1323 + * need to acquire init_mm.mmap_lock. 
1324 + */ 1325 + __pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false); 1720 1326 } while (pmdp++, next += PMD_SIZE, next != end); 1721 1327 1722 - pud_clear(pudp); 1723 - __flush_tlb_kernel_pgtable(addr); 1724 1328 pmd_free(NULL, table); 1725 1329 return 1; 1726 1330 } ··· 1760 1324 struct range arch_get_mappable_range(void) 1761 1325 { 1762 1326 struct range mhp_range; 1763 - u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); 1764 - u64 end_linear_pa = __pa(PAGE_END - 1); 1327 + phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); 1328 + phys_addr_t end_linear_pa = __pa(PAGE_END - 1); 1765 1329 1766 1330 if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { 1767 1331 /* ··· 1796 1360 1797 1361 VM_BUG_ON(!mhp_range_allowed(start, size, true)); 1798 1362 1799 - if (can_set_direct_map()) 1363 + if (force_pte_mapping()) 1800 1364 flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; 1801 1365 1802 1366 __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),

+95 -34

arch/arm64/mm/pageattr.c

··· 8 8 #include <linux/mem_encrypt.h> 9 9 #include <linux/sched.h> 10 10 #include <linux/vmalloc.h> 11 + #include <linux/pagewalk.h> 11 12 12 13 #include <asm/cacheflush.h> 13 14 #include <asm/pgtable-prot.h> ··· 21 20 pgprot_t clear_mask; 22 21 }; 23 22 24 - bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); 23 + static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk) 24 + { 25 + struct page_change_data *masks = walk->private; 26 + 27 + val &= ~(pgprot_val(masks->clear_mask)); 28 + val |= (pgprot_val(masks->set_mask)); 29 + 30 + return val; 31 + } 32 + 33 + static int pageattr_pud_entry(pud_t *pud, unsigned long addr, 34 + unsigned long next, struct mm_walk *walk) 35 + { 36 + pud_t val = pudp_get(pud); 37 + 38 + if (pud_sect(val)) { 39 + if (WARN_ON_ONCE((next - addr) != PUD_SIZE)) 40 + return -EINVAL; 41 + val = __pud(set_pageattr_masks(pud_val(val), walk)); 42 + set_pud(pud, val); 43 + walk->action = ACTION_CONTINUE; 44 + } 45 + 46 + return 0; 47 + } 48 + 49 + static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, 50 + unsigned long next, struct mm_walk *walk) 51 + { 52 + pmd_t val = pmdp_get(pmd); 53 + 54 + if (pmd_sect(val)) { 55 + if (WARN_ON_ONCE((next - addr) != PMD_SIZE)) 56 + return -EINVAL; 57 + val = __pmd(set_pageattr_masks(pmd_val(val), walk)); 58 + set_pmd(pmd, val); 59 + walk->action = ACTION_CONTINUE; 60 + } 61 + 62 + return 0; 63 + } 64 + 65 + static int pageattr_pte_entry(pte_t *pte, unsigned long addr, 66 + unsigned long next, struct mm_walk *walk) 67 + { 68 + pte_t val = __ptep_get(pte); 69 + 70 + val = __pte(set_pageattr_masks(pte_val(val), walk)); 71 + __set_pte(pte, val); 72 + 73 + return 0; 74 + } 75 + 76 + static const struct mm_walk_ops pageattr_ops = { 77 + .pud_entry = pageattr_pud_entry, 78 + .pmd_entry = pageattr_pmd_entry, 79 + .pte_entry = pageattr_pte_entry, 80 + }; 81 + 82 + bool rodata_full __ro_after_init = true; 25 83 26 84 bool can_set_direct_map(void) 27 85 { ··· 97 37 
arm64_kfence_can_set_direct_map() || is_realm_world(); 98 38 } 99 39 100 - static int change_page_range(pte_t *ptep, unsigned long addr, void *data) 101 - { 102 - struct page_change_data *cdata = data; 103 - pte_t pte = __ptep_get(ptep); 104 - 105 - pte = clear_pte_bit(pte, cdata->clear_mask); 106 - pte = set_pte_bit(pte, cdata->set_mask); 107 - 108 - __set_pte(ptep, pte); 109 - return 0; 110 - } 111 - 112 - /* 113 - * This function assumes that the range is mapped with PAGE_SIZE pages. 114 - */ 115 - static int __change_memory_common(unsigned long start, unsigned long size, 116 - pgprot_t set_mask, pgprot_t clear_mask) 40 + static int update_range_prot(unsigned long start, unsigned long size, 41 + pgprot_t set_mask, pgprot_t clear_mask) 117 42 { 118 43 struct page_change_data data; 119 44 int ret; ··· 106 61 data.set_mask = set_mask; 107 62 data.clear_mask = clear_mask; 108 63 109 - ret = apply_to_page_range(&init_mm, start, size, change_page_range, 110 - &data); 64 + ret = split_kernel_leaf_mapping(start, start + size); 65 + if (WARN_ON_ONCE(ret)) 66 + return ret; 67 + 68 + arch_enter_lazy_mmu_mode(); 69 + 70 + /* 71 + * The caller must ensure that the range we are operating on does not 72 + * partially overlap a block mapping, or a cont mapping. Any such case 73 + * must be eliminated by splitting the mapping. 
74 + */ 75 + ret = walk_kernel_page_table_range_lockless(start, start + size, 76 + &pageattr_ops, NULL, &data); 77 + arch_leave_lazy_mmu_mode(); 78 + 79 + return ret; 80 + } 81 + 82 + static int __change_memory_common(unsigned long start, unsigned long size, 83 + pgprot_t set_mask, pgprot_t clear_mask) 84 + { 85 + int ret; 86 + 87 + ret = update_range_prot(start, size, set_mask, clear_mask); 111 88 112 89 /* 113 90 * If the memory is being made valid without changing any other bits ··· 241 174 242 175 int set_direct_map_invalid_noflush(struct page *page) 243 176 { 244 - struct page_change_data data = { 245 - .set_mask = __pgprot(0), 246 - .clear_mask = __pgprot(PTE_VALID), 247 - }; 177 + pgprot_t clear_mask = __pgprot(PTE_VALID); 178 + pgprot_t set_mask = __pgprot(0); 248 179 249 180 if (!can_set_direct_map()) 250 181 return 0; 251 182 252 - return apply_to_page_range(&init_mm, 253 - (unsigned long)page_address(page), 254 - PAGE_SIZE, change_page_range, &data); 183 + return update_range_prot((unsigned long)page_address(page), 184 + PAGE_SIZE, set_mask, clear_mask); 255 185 } 256 186 257 187 int set_direct_map_default_noflush(struct page *page) 258 188 { 259 - struct page_change_data data = { 260 - .set_mask = __pgprot(PTE_VALID | PTE_WRITE), 261 - .clear_mask = __pgprot(PTE_RDONLY), 262 - }; 189 + pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE); 190 + pgprot_t clear_mask = __pgprot(PTE_RDONLY); 263 191 264 192 if (!can_set_direct_map()) 265 193 return 0; 266 194 267 - return apply_to_page_range(&init_mm, 268 - (unsigned long)page_address(page), 269 - PAGE_SIZE, change_page_range, &data); 195 + return update_range_prot((unsigned long)page_address(page), 196 + PAGE_SIZE, set_mask, clear_mask); 270 197 } 271 198 272 199 static int __set_memory_enc_dec(unsigned long addr,

+20 -7

arch/arm64/mm/proc.S

··· 245 245 * 246 246 * Called exactly once from stop_machine context by each CPU found during boot. 247 247 */ 248 - .pushsection ".data", "aw", %progbits 249 - SYM_DATA(__idmap_kpti_flag, .long 1) 250 - .popsection 251 - 252 248 SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings) 253 249 cpu .req w0 254 250 temp_pte .req x0 ··· 269 273 270 274 mov x5, x3 // preserve temp_pte arg 271 275 mrs swapper_ttb, ttbr1_el1 272 - adr_l flag_ptr, __idmap_kpti_flag 276 + adr_l flag_ptr, idmap_kpti_bbml2_flag 273 277 274 278 cbnz cpu, __idmap_kpti_secondary 275 279 ··· 412 416 __idmap_kpti_secondary: 413 417 /* Uninstall swapper before surgery begins */ 414 418 __idmap_cpu_set_reserved_ttbr1 x16, x17 419 + b scondary_cpu_wait 415 420 421 + .unreq swapper_ttb 422 + .unreq flag_ptr 423 + SYM_FUNC_END(idmap_kpti_install_ng_mappings) 424 + .popsection 425 + #endif 426 + 427 + .pushsection ".idmap.text", "a" 428 + SYM_TYPED_FUNC_START(wait_linear_map_split_to_ptes) 429 + /* Must be same registers as in idmap_kpti_install_ng_mappings */ 430 + swapper_ttb .req x3 431 + flag_ptr .req x4 432 + 433 + mrs swapper_ttb, ttbr1_el1 434 + adr_l flag_ptr, idmap_kpti_bbml2_flag 435 + __idmap_cpu_set_reserved_ttbr1 x16, x17 436 + 437 + scondary_cpu_wait: 416 438 /* Increment the flag to let the boot CPU know we're ready */ 417 439 1: ldxr w16, [flag_ptr] 418 440 add w16, w16, #1 ··· 450 436 451 437 .unreq swapper_ttb 452 438 .unreq flag_ptr 453 - SYM_FUNC_END(idmap_kpti_install_ng_mappings) 439 + SYM_FUNC_END(wait_linear_map_split_to_ptes) 454 440 .popsection 455 - #endif 456 441 457 442 /* 458 443 * __cpu_setup

+9 -2

arch/arm64/mm/ptdump.c

··· 283 283 note_page(pt_st, 0, -1, pte_val(pte_zero)); 284 284 } 285 285 286 + static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm) 287 + { 288 + static_branch_inc(&arm64_ptdump_lock_key); 289 + ptdump_walk_pgd(st, mm, NULL); 290 + static_branch_dec(&arm64_ptdump_lock_key); 291 + } 292 + 286 293 void ptdump_walk(struct seq_file *s, struct ptdump_info *info) 287 294 { 288 295 unsigned long end = ~0UL; ··· 318 311 } 319 312 }; 320 313 321 - ptdump_walk_pgd(&st.ptdump, info->mm, NULL); 314 + arm64_ptdump_walk_pgd(&st.ptdump, info->mm); 322 315 } 323 316 324 317 static void __init ptdump_initialize(void) ··· 360 353 } 361 354 }; 362 355 363 - ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); 356 + arm64_ptdump_walk_pgd(&st.ptdump, &init_mm); 364 357 365 358 if (st.wx_pages || st.uxn_pages) { 366 359 pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",

+3

include/linux/pagewalk.h

··· 134 134 int walk_kernel_page_table_range(unsigned long start, 135 135 unsigned long end, const struct mm_walk_ops *ops, 136 136 pgd_t *pgd, void *private); 137 + int walk_kernel_page_table_range_lockless(unsigned long start, 138 + unsigned long end, const struct mm_walk_ops *ops, 139 + pgd_t *pgd, void *private); 137 140 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, 138 141 unsigned long end, const struct mm_walk_ops *ops, 139 142 void *private);

+24 -12

mm/pagewalk.c

··· 606 606 int walk_kernel_page_table_range(unsigned long start, unsigned long end, 607 607 const struct mm_walk_ops *ops, pgd_t *pgd, void *private) 608 608 { 609 - struct mm_struct *mm = &init_mm; 609 + /* 610 + * Kernel intermediate page tables are usually not freed, so the mmap 611 + * read lock is sufficient. But there are some exceptions. 612 + * E.g. memory hot-remove. In which case, the mmap lock is insufficient 613 + * to prevent the intermediate kernel pages tables belonging to the 614 + * specified address range from being freed. The caller should take 615 + * other actions to prevent this race. 616 + */ 617 + mmap_assert_locked(&init_mm); 618 + 619 + return walk_kernel_page_table_range_lockless(start, end, ops, pgd, 620 + private); 621 + } 622 + 623 + /* 624 + * Use this function to walk the kernel page tables locklessly. It should be 625 + * guaranteed that the caller has exclusive access over the range they are 626 + * operating on - that there should be no concurrent access, for example, 627 + * changing permissions for vmalloc objects. 628 + */ 629 + int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end, 630 + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) 631 + { 610 632 struct mm_walk walk = { 611 633 .ops = ops, 612 - .mm = mm, 634 + .mm = &init_mm, 613 635 .pgd = pgd, 614 636 .private = private, 615 637 .no_vma = true ··· 641 619 return -EINVAL; 642 620 if (!check_ops_valid(ops)) 643 621 return -EINVAL; 644 - 645 - /* 646 - * Kernel intermediate page tables are usually not freed, so the mmap 647 - * read lock is sufficient. But there are some exceptions. 648 - * E.g. memory hot-remove. In which case, the mmap lock is insufficient 649 - * to prevent the intermediate kernel pages tables belonging to the 650 - * specified address range from being freed. The caller should take 651 - * other actions to prevent this race. 
652 - */ 653 - mmap_assert_locked(mm); 654 622 655 623 return walk_pgd_range(start, end, &walk); 656 624 }