Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: move kernel image to base of vmalloc area

This moves the module area to right before the vmalloc area, and moves
the kernel image to the base of the vmalloc area. This is an intermediate
step towards implementing KASLR, which allows the kernel image to be
located anywhere in the vmalloc area.

Since other subsystems such as hibernate may still need to refer to the
kernel text or data segments via their linear addresses, both are mapped
in the linear region as well. The linear alias of the text region is
mapped read-only/non-executable to prevent inadvertent modification or
execution.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

authored by

Ard Biesheuvel and committed by
Catalin Marinas
f9040773 a0bf9776

+137 -68
+1 -1
arch/arm64/include/asm/kasan.h
··· 14 14 * KASAN_SHADOW_END: KASAN_SHADOW_START + 1/8 of kernel virtual addresses. 15 15 */ 16 16 #define KASAN_SHADOW_START (VA_START) 17 - #define KASAN_SHADOW_END (KASAN_SHADOW_START + (1UL << (VA_BITS - 3))) 17 + #define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) 18 18 19 19 /* 20 20 * This value is used to map an address to the corresponding shadow
+15 -6
arch/arm64/include/asm/memory.h
··· 45 45 * VA_START - the first kernel virtual address. 46 46 * TASK_SIZE - the maximum size of a user space task. 47 47 * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. 48 - * The module space lives between the addresses given by TASK_SIZE 49 - * and PAGE_OFFSET - it must be within 128MB of the kernel text. 50 48 */ 51 49 #define VA_BITS (CONFIG_ARM64_VA_BITS) 52 50 #define VA_START (UL(0xffffffffffffffff) << VA_BITS) 53 51 #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) 54 - #define KIMAGE_VADDR (PAGE_OFFSET) 55 - #define MODULES_END (KIMAGE_VADDR) 56 - #define MODULES_VADDR (MODULES_END - SZ_64M) 57 - #define PCI_IO_END (MODULES_VADDR - SZ_2M) 52 + #define KIMAGE_VADDR (MODULES_END) 53 + #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) 54 + #define MODULES_VADDR (VA_START + KASAN_SHADOW_SIZE) 55 + #define MODULES_VSIZE (SZ_64M) 56 + #define PCI_IO_END (PAGE_OFFSET - SZ_2M) 58 57 #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) 59 58 #define FIXADDR_TOP (PCI_IO_START - SZ_2M) 60 59 #define TASK_SIZE_64 (UL(1) << VA_BITS) ··· 69 70 #endif /* CONFIG_COMPAT */ 70 71 71 72 #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) 73 + 74 + /* 75 + * The size of the KASAN shadow region. This should be 1/8th of the 76 + * size of the entire kernel virtual address space. 77 + */ 78 + #ifdef CONFIG_KASAN 79 + #define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3)) 80 + #else 81 + #define KASAN_SHADOW_SIZE (0) 82 + #endif 72 83 73 84 /* 74 85 * Physical vs virtual RAM address space conversion. These are
+2 -8
arch/arm64/include/asm/pgtable.h
··· 36 36 * 37 37 * VMEMAP_SIZE: allows the whole VA space to be covered by a struct page array 38 38 * (rounded up to PUD_SIZE). 39 - * VMALLOC_START: beginning of the kernel VA space 39 + * VMALLOC_START: beginning of the kernel vmalloc space 40 40 * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, 41 41 * fixed mappings and modules 42 42 */ 43 43 #define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) 44 44 45 - #ifndef CONFIG_KASAN 46 - #define VMALLOC_START (VA_START) 47 - #else 48 - #include <asm/kasan.h> 49 - #define VMALLOC_START (KASAN_SHADOW_END + SZ_64K) 50 - #endif 51 - 45 + #define VMALLOC_START (MODULES_END) 52 46 #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) 53 47 54 48 #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K))
+6 -6
arch/arm64/mm/dump.c
··· 35 35 }; 36 36 37 37 enum address_markers_idx { 38 - VMALLOC_START_NR = 0, 38 + MODULES_START_NR = 0, 39 + MODULES_END_NR, 40 + VMALLOC_START_NR, 39 41 VMALLOC_END_NR, 40 42 #ifdef CONFIG_SPARSEMEM_VMEMMAP 41 43 VMEMMAP_START_NR, ··· 47 45 FIXADDR_END_NR, 48 46 PCI_START_NR, 49 47 PCI_END_NR, 50 - MODULES_START_NR, 51 - MODULES_END_NR, 52 48 KERNEL_SPACE_NR, 53 49 }; 54 50 55 51 static struct addr_marker address_markers[] = { 52 + { MODULES_VADDR, "Modules start" }, 53 + { MODULES_END, "Modules end" }, 56 54 { VMALLOC_START, "vmalloc() Area" }, 57 55 { VMALLOC_END, "vmalloc() End" }, 58 56 #ifdef CONFIG_SPARSEMEM_VMEMMAP ··· 63 61 { FIXADDR_TOP, "Fixmap end" }, 64 62 { PCI_IO_START, "PCI I/O start" }, 65 63 { PCI_IO_END, "PCI I/O end" }, 66 - { MODULES_VADDR, "Modules start" }, 67 - { MODULES_END, "Modules end" }, 68 - { PAGE_OFFSET, "Kernel Mapping" }, 64 + { PAGE_OFFSET, "Linear Mapping" }, 69 65 { -1, NULL }, 70 66 }; 71 67
+12 -11
arch/arm64/mm/init.c
··· 36 36 #include <linux/swiotlb.h> 37 37 38 38 #include <asm/fixmap.h> 39 + #include <asm/kasan.h> 39 40 #include <asm/memory.h> 40 41 #include <asm/sections.h> 41 42 #include <asm/setup.h> ··· 303 302 #ifdef CONFIG_KASAN 304 303 " kasan : 0x%16lx - 0x%16lx (%6ld GB)\n" 305 304 #endif 305 + " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" 306 306 " vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n" 307 + " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" 308 + " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" 309 + " .data : 0x%p" " - 0x%p" " (%6ld KB)\n" 307 310 #ifdef CONFIG_SPARSEMEM_VMEMMAP 308 311 " vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n" 309 312 " 0x%16lx - 0x%16lx (%6ld MB actual)\n" 310 313 #endif 311 314 " fixed : 0x%16lx - 0x%16lx (%6ld KB)\n" 312 315 " PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n" 313 - " modules : 0x%16lx - 0x%16lx (%6ld MB)\n" 314 - " memory : 0x%16lx - 0x%16lx (%6ld MB)\n" 315 - " .init : 0x%p" " - 0x%p" " (%6ld KB)\n" 316 - " .text : 0x%p" " - 0x%p" " (%6ld KB)\n" 317 - " .data : 0x%p" " - 0x%p" " (%6ld KB)\n", 316 + " memory : 0x%16lx - 0x%16lx (%6ld MB)\n", 318 317 #ifdef CONFIG_KASAN 319 318 MLG(KASAN_SHADOW_START, KASAN_SHADOW_END), 320 319 #endif 320 + MLM(MODULES_VADDR, MODULES_END), 321 321 MLG(VMALLOC_START, VMALLOC_END), 322 + MLK_ROUNDUP(__init_begin, __init_end), 323 + MLK_ROUNDUP(_text, _etext), 324 + MLK_ROUNDUP(_sdata, _edata), 322 325 #ifdef CONFIG_SPARSEMEM_VMEMMAP 323 326 MLG((unsigned long)vmemmap, 324 327 (unsigned long)vmemmap + VMEMMAP_SIZE), ··· 331 326 #endif 332 327 MLK(FIXADDR_START, FIXADDR_TOP), 333 328 MLM(PCI_IO_START, PCI_IO_END), 334 - MLM(MODULES_VADDR, MODULES_END), 335 - MLM(PAGE_OFFSET, (unsigned long)high_memory), 336 - MLK_ROUNDUP(__init_begin, __init_end), 337 - MLK_ROUNDUP(_text, _etext), 338 - MLK_ROUNDUP(_sdata, _edata)); 329 + MLM(PAGE_OFFSET, (unsigned long)high_memory)); 339 330 340 331 #undef MLK 341 332 #undef MLM ··· 359 358 360 359 void free_initmem(void) 361 360 { 362 - fixup_init(); 363 361 
free_initmem_default(0); 362 + fixup_init(); 364 363 } 365 364 366 365 #ifdef CONFIG_BLK_DEV_INITRD
+24 -3
arch/arm64/mm/kasan_init.c
··· 17 17 #include <linux/start_kernel.h> 18 18 19 19 #include <asm/mmu_context.h> 20 + #include <asm/kernel-pgtable.h> 20 21 #include <asm/page.h> 21 22 #include <asm/pgalloc.h> 22 23 #include <asm/pgtable.h> 24 + #include <asm/sections.h> 23 25 #include <asm/tlbflush.h> 24 26 25 27 static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); ··· 35 33 if (pmd_none(*pmd)) 36 34 pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); 37 35 38 - pte = pte_offset_kernel(pmd, addr); 36 + pte = pte_offset_kimg(pmd, addr); 39 37 do { 40 38 next = addr + PAGE_SIZE; 41 39 set_pte(pte, pfn_pte(virt_to_pfn(kasan_zero_page), ··· 53 51 if (pud_none(*pud)) 54 52 pud_populate(&init_mm, pud, kasan_zero_pmd); 55 53 56 - pmd = pmd_offset(pud, addr); 54 + pmd = pmd_offset_kimg(pud, addr); 57 55 do { 58 56 next = pmd_addr_end(addr, end); 59 57 kasan_early_pte_populate(pmd, addr, next); ··· 70 68 if (pgd_none(*pgd)) 71 69 pgd_populate(&init_mm, pgd, kasan_zero_pud); 72 70 73 - pud = pud_offset(pgd, addr); 71 + pud = pud_offset_kimg(pgd, addr); 74 72 do { 75 73 next = pud_addr_end(addr, end); 76 74 kasan_early_pmd_populate(pud, addr, next); ··· 128 126 129 127 void __init kasan_init(void) 130 128 { 129 + u64 kimg_shadow_start, kimg_shadow_end; 131 130 struct memblock_region *reg; 132 131 int i; 132 + 133 + kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); 134 + kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); 133 135 134 136 /* 135 137 * We are going to perform proper setup of shadow memory. 
··· 148 142 149 143 clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); 150 144 145 + vmemmap_populate(kimg_shadow_start, kimg_shadow_end, NUMA_NO_NODE); 146 + 147 + /* 148 + * vmemmap_populate() has populated the shadow region that covers the 149 + * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round 150 + * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent 151 + * kasan_populate_zero_shadow() from replacing the PMD block mappings 152 + * with PMD table mappings at the edges of the shadow region for the 153 + * kernel image. 154 + */ 155 + if (ARM64_SWAPPER_USES_SECTION_MAPS) 156 + kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); 157 + 151 158 kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, 152 159 kasan_mem_to_shadow((void *)MODULES_VADDR)); 160 + kasan_populate_zero_shadow((void *)kimg_shadow_end, 161 + kasan_mem_to_shadow((void *)PAGE_OFFSET)); 153 162 154 163 for_each_memblock(memory, reg) { 155 164 void *start = (void *)__phys_to_virt(reg->base);
+77 -33
arch/arm64/mm/mmu.c
··· 53 53 unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; 54 54 EXPORT_SYMBOL(empty_zero_page); 55 55 56 + static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; 57 + static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused; 58 + static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; 59 + 56 60 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 57 61 unsigned long size, pgprot_t vma_prot) 58 62 { ··· 384 380 385 381 static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) 386 382 { 387 - 388 383 unsigned long kernel_start = __pa(_stext); 389 - unsigned long kernel_end = __pa(_end); 384 + unsigned long kernel_end = __pa(_etext); 390 385 391 386 /* 392 - * The kernel itself is mapped at page granularity. Map all other 393 - * memory, making sure we don't overwrite the existing kernel mappings. 387 + * Take care not to create a writable alias for the 388 + * read-only text and rodata sections of the kernel image. 394 389 */ 395 390 396 - /* No overlap with the kernel. */ 391 + /* No overlap with the kernel text */ 397 392 if (end < kernel_start || start >= kernel_end) { 398 393 __create_pgd_mapping(pgd, start, __phys_to_virt(start), 399 394 end - start, PAGE_KERNEL, ··· 401 398 } 402 399 403 400 /* 404 - * This block overlaps the kernel mapping. Map the portion(s) which 405 - * don't overlap. 401 + * This block overlaps the kernel text mapping. 402 + * Map the portion(s) which don't overlap. 406 403 */ 407 404 if (start < kernel_start) 408 405 __create_pgd_mapping(pgd, start, ··· 414 411 __phys_to_virt(kernel_end), 415 412 end - kernel_end, PAGE_KERNEL, 416 413 early_pgtable_alloc); 414 + 415 + /* 416 + * Map the linear alias of the [_stext, _etext) interval as 417 + * read-only/non-executable. This makes the contents of the 418 + * region accessible to subsystems such as hibernate, but 419 + * protects it from inadvertent modification or execution. 
420 + */ 421 + __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), 422 + kernel_end - kernel_start, PAGE_KERNEL_RO, 423 + early_pgtable_alloc); 417 424 } 418 425 419 426 static void __init map_mem(pgd_t *pgd) ··· 444 431 } 445 432 } 446 433 447 - #ifdef CONFIG_DEBUG_RODATA 448 434 void mark_rodata_ro(void) 449 435 { 436 + if (!IS_ENABLED(CONFIG_DEBUG_RODATA)) 437 + return; 438 + 450 439 create_mapping_late(__pa(_stext), (unsigned long)_stext, 451 440 (unsigned long)_etext - (unsigned long)_stext, 452 441 PAGE_KERNEL_ROX); 453 - 454 442 } 455 - #endif 456 443 457 444 void fixup_init(void) 458 445 { 459 - create_mapping_late(__pa(__init_begin), (unsigned long)__init_begin, 460 - (unsigned long)__init_end - (unsigned long)__init_begin, 461 - PAGE_KERNEL); 446 + /* 447 + * Unmap the __init region but leave the VM area in place. This 448 + * prevents the region from being reused for kernel modules, which 449 + * is not supported by kallsyms. 450 + */ 451 + unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); 462 452 } 463 453 464 454 static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, 465 - pgprot_t prot) 455 + pgprot_t prot, struct vm_struct *vma) 466 456 { 467 457 phys_addr_t pa_start = __pa(va_start); 468 458 unsigned long size = va_end - va_start; ··· 475 459 476 460 __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, 477 461 early_pgtable_alloc); 462 + 463 + vma->addr = va_start; 464 + vma->phys_addr = pa_start; 465 + vma->size = size; 466 + vma->flags = VM_MAP; 467 + vma->caller = __builtin_return_address(0); 468 + 469 + vm_area_add_early(vma); 478 470 } 479 471 480 472 /* ··· 490 466 */ 491 467 static void __init map_kernel(pgd_t *pgd) 492 468 { 469 + static struct vm_struct vmlinux_text, vmlinux_init, vmlinux_data; 493 470 494 - map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); 495 - map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); 496 - 
map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); 471 + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); 472 + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, 473 + &vmlinux_init); 474 + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); 497 475 498 - /* 499 - * The fixmap falls in a separate pgd to the kernel, and doesn't live 500 - * in the carveout for the swapper_pg_dir. We can simply re-use the 501 - * existing dir for the fixmap. 502 - */ 503 - set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START)); 476 + if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { 477 + /* 478 + * The fixmap falls in a separate pgd to the kernel, and doesn't 479 + * live in the carveout for the swapper_pg_dir. We can simply 480 + * re-use the existing dir for the fixmap. 481 + */ 482 + set_pgd(pgd_offset_raw(pgd, FIXADDR_START), 483 + *pgd_offset_k(FIXADDR_START)); 484 + } else if (CONFIG_PGTABLE_LEVELS > 3) { 485 + /* 486 + * The fixmap shares its top level pgd entry with the kernel 487 + * mapping. This can really only occur when we are running 488 + * with 16k/4 levels, so we can simply reuse the pud level 489 + * entry instead. 
490 + */ 491 + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); 492 + set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START), 493 + __pud(__pa(bm_pmd) | PUD_TYPE_TABLE)); 494 + pud_clear_fixmap(); 495 + } else { 496 + BUG(); 497 + } 504 498 505 499 kasan_copy_shadow(pgd); 506 500 } ··· 644 602 } 645 603 #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 646 604 647 - static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; 648 - #if CONFIG_PGTABLE_LEVELS > 2 649 - static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; 650 - #endif 651 - #if CONFIG_PGTABLE_LEVELS > 3 652 - static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; 653 - #endif 654 - 655 605 static inline pud_t * fixmap_pud(unsigned long addr) 656 606 { 657 607 pgd_t *pgd = pgd_offset_k(addr); ··· 675 641 unsigned long addr = FIXADDR_START; 676 642 677 643 pgd = pgd_offset_k(addr); 678 - pgd_populate(&init_mm, pgd, bm_pud); 679 - pud = fixmap_pud(addr); 644 + if (CONFIG_PGTABLE_LEVELS > 3 && !pgd_none(*pgd)) { 645 + /* 646 + * We only end up here if the kernel mapping and the fixmap 647 + * share the top level pgd entry, which should only happen on 648 + * 16k/4 levels configurations. 649 + */ 650 + BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); 651 + pud = pud_offset_kimg(pgd, addr); 652 + } else { 653 + pgd_populate(&init_mm, pgd, bm_pud); 654 + pud = fixmap_pud(addr); 655 + } 680 656 pud_populate(&init_mm, pud, bm_pmd); 681 657 pmd = fixmap_pmd(addr); 682 658 pmd_populate_kernel(&init_mm, pmd, bm_pte);