Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

riscv: Move kernel mapping outside of linear mapping

This is a preparatory patch for relocatable kernel and sv48 support.

The kernel used to be linked at the PAGE_OFFSET address, so we could use
the linear mapping for the kernel mapping. However, the relocated kernel
base address will be different from PAGE_OFFSET and, since two different
virtual addresses in the linear mapping cannot point to the same physical
address, the kernel mapping needs to lie outside the linear mapping so that
we don't have to copy it at the same physical offset.

The kernel mapping is moved to the last 2GB of the address space. BPF is
now always placed after the kernel, and modules use the 2GB memory range
right before the kernel, so the BPF and module regions do not overlap. A
KASLR implementation will simply have to move the kernel within the last
2GB range and take care of leaving enough space for BPF.

In addition, by moving the kernel to the end of the address space, both
sv39 and sv48 kernels will be exactly the same without needing to be
relocated at runtime.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexandre Ghiti <alex@ghiti.fr>
[Palmer: Squash the STRICT_RWX fix, and a !MMU fix]
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>

authored by

Alexandre Ghiti and committed by
Palmer Dabbelt
2bfc6cd8 8a07ac39

+182 -36
+2 -1
arch/riscv/boot/loader.lds.S
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 3 3 #include <asm/page.h> 4 + #include <asm/pgtable.h> 4 5 5 6 OUTPUT_ARCH(riscv) 6 7 ENTRY(_start) 7 8 8 9 SECTIONS 9 10 { 10 - . = PAGE_OFFSET; 11 + . = KERNEL_LINK_ADDR; 11 12 12 13 .payload : { 13 14 *(.payload)
+24 -2
arch/riscv/include/asm/page.h
··· 90 90 91 91 #ifdef CONFIG_MMU 92 92 extern unsigned long va_pa_offset; 93 + #ifdef CONFIG_64BIT 94 + extern unsigned long va_kernel_pa_offset; 95 + #endif 93 96 extern unsigned long pfn_base; 94 97 #define ARCH_PFN_OFFSET (pfn_base) 95 98 #else 96 99 #define va_pa_offset 0 100 + #ifdef CONFIG_64BIT 101 + #define va_kernel_pa_offset 0 102 + #endif 97 103 #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) 98 104 #endif /* CONFIG_MMU */ 99 105 100 - #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) 101 - #define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) 106 + #ifdef CONFIG_64BIT 107 + extern unsigned long kernel_virt_addr; 108 + 109 + #define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + va_pa_offset)) 110 + #define kernel_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + va_kernel_pa_offset)) 111 + #define __pa_to_va_nodebug(x) linear_mapping_pa_to_va(x) 112 + 113 + #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) 114 + #define kernel_mapping_va_to_pa(x) ((unsigned long)(x) - va_kernel_pa_offset) 115 + #define __va_to_pa_nodebug(x) ({ \ 116 + unsigned long _x = x; \ 117 + (_x < kernel_virt_addr) ? \ 118 + linear_mapping_va_to_pa(_x) : kernel_mapping_va_to_pa(_x); \ 119 + }) 120 + #else 121 + #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) 122 + #define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) 123 + #endif 102 124 103 125 #ifdef CONFIG_DEBUG_VIRTUAL 104 126 extern phys_addr_t __virt_to_phys(unsigned long x);
+31 -8
arch/riscv/include/asm/pgtable.h
··· 11 11 12 12 #include <asm/pgtable-bits.h> 13 13 14 - #ifndef __ASSEMBLY__ 14 + #ifndef CONFIG_MMU 15 + #define KERNEL_LINK_ADDR PAGE_OFFSET 16 + #else 15 17 16 - /* Page Upper Directory not used in RISC-V */ 17 - #include <asm-generic/pgtable-nopud.h> 18 - #include <asm/page.h> 19 - #include <asm/tlbflush.h> 20 - #include <linux/mm_types.h> 18 + #define ADDRESS_SPACE_END (UL(-1)) 21 19 22 - #ifdef CONFIG_MMU 20 + #ifdef CONFIG_64BIT 21 + /* Leave 2GB for kernel and BPF at the end of the address space */ 22 + #define KERNEL_LINK_ADDR (ADDRESS_SPACE_END - SZ_2G + 1) 23 + #else 24 + #define KERNEL_LINK_ADDR PAGE_OFFSET 25 + #endif 23 26 24 27 #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) 25 28 #define VMALLOC_END (PAGE_OFFSET - 1) 26 29 #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) 27 30 28 31 #define BPF_JIT_REGION_SIZE (SZ_128M) 32 + #ifdef CONFIG_64BIT 33 + /* KASLR should leave at least 128MB for BPF after the kernel */ 34 + #define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) 35 + #define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) 36 + #else 29 37 #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) 30 38 #define BPF_JIT_REGION_END (VMALLOC_END) 39 + #endif 40 + 41 + /* Modules always live before the kernel */ 42 + #ifdef CONFIG_64BIT 43 + #define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G) 44 + #define MODULES_END (PFN_ALIGN((unsigned long)&_start)) 45 + #endif 31 46 32 47 /* 33 48 * Roughly size the vmemmap space to be large enough to fit enough ··· 72 57 #define FIXADDR_SIZE PGDIR_SIZE 73 58 #endif 74 59 #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) 75 - 76 60 #endif 61 + 62 + #ifndef __ASSEMBLY__ 63 + 64 + /* Page Upper Directory not used in RISC-V */ 65 + #include <asm-generic/pgtable-nopud.h> 66 + #include <asm/page.h> 67 + #include <asm/tlbflush.h> 68 + #include <linux/mm_types.h> 77 69 78 70 #ifdef CONFIG_64BIT 79 71 #include <asm/pgtable-64.h> ··· 506 484 507 485 #define 
kern_addr_valid(addr) (1) /* FIXME */ 508 486 487 + extern char _start[]; 509 488 extern void *dtb_early_va; 510 489 extern uintptr_t dtb_early_pa; 511 490 void setup_bootmem(void);
+1
arch/riscv/include/asm/set_memory.h
··· 17 17 int set_memory_nx(unsigned long addr, int numpages); 18 18 int set_memory_rw_nx(unsigned long addr, int numpages); 19 19 void protect_kernel_text_data(void); 20 + void protect_kernel_linear_mapping_text_rodata(void); 20 21 #else 21 22 static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } 22 23 static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; }
+2 -1
arch/riscv/kernel/head.S
··· 69 69 #ifdef CONFIG_MMU 70 70 relocate: 71 71 /* Relocate return address */ 72 - li a1, PAGE_OFFSET 72 + la a1, kernel_virt_addr 73 + REG_L a1, 0(a1) 73 74 la a2, _start 74 75 sub a1, a1, a2 75 76 add ra, ra, a1
+2 -4
arch/riscv/kernel/module.c
··· 408 408 } 409 409 410 410 #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) 411 - #define VMALLOC_MODULE_START \ 412 - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) 413 411 void *module_alloc(unsigned long size) 414 412 { 415 - return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, 416 - VMALLOC_END, GFP_KERNEL, 413 + return __vmalloc_node_range(size, 1, MODULES_VADDR, 414 + MODULES_END, GFP_KERNEL, 417 415 PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, 418 416 __builtin_return_address(0)); 419 417 }
+6 -1
arch/riscv/kernel/setup.c
··· 263 263 264 264 sbi_init(); 265 265 266 - if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) 266 + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { 267 267 protect_kernel_text_data(); 268 + #if defined(CONFIG_64BIT) && defined(CONFIG_MMU) 269 + protect_kernel_linear_mapping_text_rodata(); 270 + #endif 271 + } 272 + 268 273 #ifdef CONFIG_SWIOTLB 269 274 swiotlb_init(1); 270 275 #endif
+2 -1
arch/riscv/kernel/vmlinux.lds.S
··· 4 4 * Copyright (C) 2017 SiFive 5 5 */ 6 6 7 - #define LOAD_OFFSET PAGE_OFFSET 7 + #include <asm/pgtable.h> 8 + #define LOAD_OFFSET KERNEL_LINK_ADDR 8 9 #include <asm/vmlinux.lds.h> 9 10 #include <asm/page.h> 10 11 #include <asm/cache.h>
+13
arch/riscv/mm/fault.c
··· 231 231 return; 232 232 } 233 233 234 + #ifdef CONFIG_64BIT 235 + /* 236 + * Modules in 64bit kernels lie in their own virtual region which is not 237 + * in the vmalloc region, but dealing with page faults in this region 238 + * or the vmalloc region amounts to doing the same thing: checking that 239 + * the mapping exists in init_mm.pgd and updating user page table, so 240 + * just use vmalloc_fault. 241 + */ 242 + if (unlikely(addr >= MODULES_VADDR && addr < MODULES_END)) { 243 + vmalloc_fault(regs, code, addr); 244 + return; 245 + } 246 + #endif 234 247 /* Enable interrupts if they were enabled in the parent context. */ 235 248 if (likely(regs->status & SR_PIE)) 236 249 local_irq_enable();
+89 -17
arch/riscv/mm/init.c
··· 25 25 26 26 #include "../kernel/head.h" 27 27 28 + unsigned long kernel_virt_addr = KERNEL_LINK_ADDR; 29 + EXPORT_SYMBOL(kernel_virt_addr); 30 + 28 31 unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] 29 32 __page_aligned_bss; 30 33 EXPORT_SYMBOL(empty_zero_page); ··· 91 88 (unsigned long)VMALLOC_END); 92 89 print_mlm("lowmem", (unsigned long)PAGE_OFFSET, 93 90 (unsigned long)high_memory); 91 + #ifdef CONFIG_64BIT 92 + print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, 93 + (unsigned long)ADDRESS_SPACE_END); 94 + #endif 94 95 } 95 96 #else 96 97 static void print_vm_layout(void) { } ··· 123 116 /* The maximal physical memory size is -PAGE_OFFSET. */ 124 117 memblock_enforce_memory_limit(-PAGE_OFFSET); 125 118 126 - /* Reserve from the start of the kernel to the end of the kernel */ 127 - memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); 119 + /* 120 + * Reserve from the start of the kernel to the end of the kernel 121 + * and make sure we align the reservation on PMD_SIZE since we will 122 + * map the kernel in the linear mapping as read-only: we do not want 123 + * any allocation to happen between _end and the next pmd aligned page. 
124 + */ 125 + memblock_reserve(vmlinux_start, (vmlinux_end - vmlinux_start + PMD_SIZE - 1) & PMD_MASK); 128 126 129 127 /* 130 128 * memblock allocator is not aware of the fact that last 4K bytes of ··· 164 152 #ifdef CONFIG_MMU 165 153 static struct pt_alloc_ops pt_ops; 166 154 155 + /* Offset between linear mapping virtual address and kernel load address */ 167 156 unsigned long va_pa_offset; 168 157 EXPORT_SYMBOL(va_pa_offset); 158 + #ifdef CONFIG_64BIT 159 + /* Offset between kernel mapping virtual address and kernel load address */ 160 + unsigned long va_kernel_pa_offset; 161 + EXPORT_SYMBOL(va_kernel_pa_offset); 162 + #endif 169 163 unsigned long pfn_base; 170 164 EXPORT_SYMBOL(pfn_base); 171 165 ··· 275 257 276 258 static phys_addr_t __init alloc_pmd_early(uintptr_t va) 277 259 { 278 - BUG_ON((va - PAGE_OFFSET) >> PGDIR_SHIFT); 260 + BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT); 279 261 280 262 return (uintptr_t)early_pmd; 281 263 } ··· 390 372 #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." 
391 373 #endif 392 374 375 + uintptr_t load_pa, load_sz; 376 + 377 + static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) 378 + { 379 + uintptr_t va, end_va; 380 + 381 + end_va = kernel_virt_addr + load_sz; 382 + for (va = kernel_virt_addr; va < end_va; va += map_size) 383 + create_pgd_mapping(pgdir, va, 384 + load_pa + (va - kernel_virt_addr), 385 + map_size, PAGE_KERNEL_EXEC); 386 + } 387 + 393 388 asmlinkage void __init setup_vm(uintptr_t dtb_pa) 394 389 { 395 - uintptr_t va, pa, end_va; 396 - uintptr_t load_pa = (uintptr_t)(&_start); 397 - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; 390 + uintptr_t pa; 398 391 uintptr_t map_size; 399 392 #ifndef __PAGETABLE_PMD_FOLDED 400 393 pmd_t fix_bmap_spmd, fix_bmap_epmd; 401 394 #endif 395 + load_pa = (uintptr_t)(&_start); 396 + load_sz = (uintptr_t)(&_end) - load_pa; 402 397 403 398 va_pa_offset = PAGE_OFFSET - load_pa; 399 + #ifdef CONFIG_64BIT 400 + va_kernel_pa_offset = kernel_virt_addr - load_pa; 401 + #endif 402 + 404 403 pfn_base = PFN_DOWN(load_pa); 405 404 406 405 /* ··· 445 410 create_pmd_mapping(fixmap_pmd, FIXADDR_START, 446 411 (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); 447 412 /* Setup trampoline PGD and PMD */ 448 - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, 413 + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, 449 414 (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); 450 - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, 415 + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, 451 416 load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); 452 417 #else 453 418 /* Setup trampoline PGD */ 454 - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, 419 + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, 455 420 load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); 456 421 #endif 457 422 458 423 /* 459 - * Setup early PGD covering entire kernel which will allows 424 + * Setup early PGD covering entire kernel which will allow 460 425 * us to reach paging_init(). 
We map all memory banks later 461 426 * in setup_vm_final() below. 462 427 */ 463 - end_va = PAGE_OFFSET + load_sz; 464 - for (va = PAGE_OFFSET; va < end_va; va += map_size) 465 - create_pgd_mapping(early_pg_dir, va, 466 - load_pa + (va - PAGE_OFFSET), 467 - map_size, PAGE_KERNEL_EXEC); 428 + create_kernel_page_table(early_pg_dir, map_size); 468 429 469 430 #ifndef __PAGETABLE_PMD_FOLDED 470 431 /* Setup early PMD for DTB */ ··· 475 444 pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); 476 445 dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); 477 446 #else /* CONFIG_BUILTIN_DTB */ 447 + #ifdef CONFIG_64BIT 448 + /* 449 + * __va can't be used since it would return a linear mapping address 450 + * whereas dtb_early_va will be used before setup_vm_final installs 451 + * the linear mapping. 452 + */ 453 + dtb_early_va = kernel_mapping_pa_to_va(dtb_pa); 454 + #else 478 455 dtb_early_va = __va(dtb_pa); 456 + #endif /* CONFIG_64BIT */ 479 457 #endif /* CONFIG_BUILTIN_DTB */ 480 458 #else 481 459 #ifndef CONFIG_BUILTIN_DTB ··· 496 456 pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); 497 457 dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); 498 458 #else /* CONFIG_BUILTIN_DTB */ 459 + #ifdef CONFIG_64BIT 460 + dtb_early_va = kernel_mapping_pa_to_va(dtb_pa); 461 + #else 499 462 dtb_early_va = __va(dtb_pa); 463 + #endif /* CONFIG_64BIT */ 500 464 #endif /* CONFIG_BUILTIN_DTB */ 501 465 #endif 502 466 dtb_early_pa = dtb_pa; ··· 536 492 #endif 537 493 } 538 494 495 + #ifdef CONFIG_64BIT 496 + void protect_kernel_linear_mapping_text_rodata(void) 497 + { 498 + unsigned long text_start = (unsigned long)lm_alias(_start); 499 + unsigned long init_text_start = (unsigned long)lm_alias(__init_text_begin); 500 + unsigned long rodata_start = (unsigned long)lm_alias(__start_rodata); 501 + unsigned long data_start = (unsigned long)lm_alias(_data); 502 + 503 + set_memory_ro(text_start, (init_text_start - text_start) >> PAGE_SHIFT); 504 + set_memory_nx(text_start, 
(init_text_start - text_start) >> PAGE_SHIFT); 505 + 506 + set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); 507 + set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); 508 + } 509 + #endif 510 + 539 511 static void __init setup_vm_final(void) 540 512 { 541 513 uintptr_t va, map_size; ··· 573 513 __pa_symbol(fixmap_pgd_next), 574 514 PGDIR_SIZE, PAGE_TABLE); 575 515 576 - /* Map all memory banks */ 516 + /* Map all memory banks in the linear mapping */ 577 517 for_each_mem_range(i, &start, &end) { 578 518 if (start >= end) 579 519 break; ··· 585 525 for (pa = start; pa < end; pa += map_size) { 586 526 va = (uintptr_t)__va(pa); 587 527 create_pgd_mapping(swapper_pg_dir, va, pa, 588 - map_size, PAGE_KERNEL_EXEC); 528 + map_size, 529 + #ifdef CONFIG_64BIT 530 + PAGE_KERNEL 531 + #else 532 + PAGE_KERNEL_EXEC 533 + #endif 534 + ); 535 + 589 536 } 590 537 } 538 + 539 + #ifdef CONFIG_64BIT 540 + /* Map the kernel */ 541 + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); 542 + #endif 591 543 592 544 /* Clear fixmap PTE and PMD mappings */ 593 545 clear_fixmap(FIX_PTE);
+9
arch/riscv/mm/kasan_init.c
··· 171 171 phys_addr_t _start, _end; 172 172 u64 i; 173 173 174 + /* 175 + * Populate all kernel virtual address space with kasan_early_shadow_page 176 + * except for the linear mapping and the modules/kernel/BPF mapping. 177 + */ 174 178 kasan_populate_early_shadow((void *)KASAN_SHADOW_START, 175 179 (void *)kasan_mem_to_shadow((void *) 176 180 VMEMMAP_END)); ··· 187 183 (void *)kasan_mem_to_shadow((void *)VMALLOC_START), 188 184 (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); 189 185 186 + /* Populate the linear mapping */ 190 187 for_each_mem_range(i, &_start, &_end) { 191 188 void *start = (void *)__va(_start); 192 189 void *end = (void *)__va(_end); ··· 197 192 198 193 kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); 199 194 }; 195 + 196 + /* Populate kernel, BPF, modules mapping */ 197 + kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR), 198 + kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END)); 200 199 201 200 for (i = 0; i < PTRS_PER_PTE; i++) 202 201 set_pte(&kasan_early_shadow_pte[i],
+1 -1
arch/riscv/mm/physaddr.c
··· 23 23 24 24 phys_addr_t __phys_addr_symbol(unsigned long x) 25 25 { 26 - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; 26 + unsigned long kernel_start = (unsigned long)kernel_virt_addr; 27 27 unsigned long kernel_end = (unsigned long)_end; 28 28 29 29 /*