Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: mm: create new fine-grained mappings at boot

At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point where we try to
apply fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).

Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.

Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.

To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.

This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

Authored by Mark Rutland and committed by Catalin Marinas
068a17a5 fca082bf

+110 -63
+3
arch/arm64/include/asm/kasan.h
··· 7 7 8 8 #include <linux/linkage.h> 9 9 #include <asm/memory.h> 10 + #include <asm/pgtable-types.h> 10 11 11 12 /* 12 13 * KASAN_SHADOW_START: beginning of the kernel virtual addresses. ··· 29 28 #define KASAN_SHADOW_OFFSET (KASAN_SHADOW_END - (1ULL << (64 - 3))) 30 29 31 30 void kasan_init(void); 31 + void kasan_copy_shadow(pgd_t *pgdir); 32 32 asmlinkage void kasan_early_init(void); 33 33 34 34 #else 35 35 static inline void kasan_init(void) { } 36 + static inline void kasan_copy_shadow(pgd_t *pgdir) { } 36 37 #endif 37 38 38 39 #endif
+15
arch/arm64/mm/kasan_init.c
··· 97 97 kasan_map_early_shadow(); 98 98 } 99 99 100 + /* 101 + * Copy the current shadow region into a new pgdir. 102 + */ 103 + void __init kasan_copy_shadow(pgd_t *pgdir) 104 + { 105 + pgd_t *pgd, *pgd_new, *pgd_end; 106 + 107 + pgd = pgd_offset_k(KASAN_SHADOW_START); 108 + pgd_end = pgd_offset_k(KASAN_SHADOW_END); 109 + pgd_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START); 110 + do { 111 + set_pgd(pgd_new, *pgd); 112 + } while (pgd++, pgd_new++, pgd != pgd_end); 113 + } 114 + 100 115 static void __init clear_pgds(unsigned long start, 101 116 unsigned long end) 102 117 {
+92 -63
arch/arm64/mm/mmu.c
··· 33 33 #include <asm/barrier.h> 34 34 #include <asm/cputype.h> 35 35 #include <asm/fixmap.h> 36 + #include <asm/kasan.h> 36 37 #include <asm/kernel-pgtable.h> 37 38 #include <asm/sections.h> 38 39 #include <asm/setup.h> ··· 345 344 late_pgtable_alloc); 346 345 } 347 346 348 - #ifdef CONFIG_DEBUG_RODATA 349 - static void __init __map_memblock(phys_addr_t start, phys_addr_t end) 347 + static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) 350 348 { 351 - /* 352 - * Set up the executable regions using the existing section mappings 353 - * for now. This will get more fine grained later once all memory 354 - * is mapped 355 - */ 356 - unsigned long kernel_x_start = round_down(__pa(_stext), SWAPPER_BLOCK_SIZE); 357 - unsigned long kernel_x_end = round_up(__pa(__init_end), SWAPPER_BLOCK_SIZE); 358 349 359 - if (end < kernel_x_start) { 360 - create_mapping(start, __phys_to_virt(start), 361 - end - start, PAGE_KERNEL); 362 - } else if (start >= kernel_x_end) { 363 - create_mapping(start, __phys_to_virt(start), 364 - end - start, PAGE_KERNEL); 365 - } else { 366 - if (start < kernel_x_start) 367 - create_mapping(start, __phys_to_virt(start), 368 - kernel_x_start - start, 369 - PAGE_KERNEL); 370 - create_mapping(kernel_x_start, 371 - __phys_to_virt(kernel_x_start), 372 - kernel_x_end - kernel_x_start, 373 - PAGE_KERNEL_EXEC); 374 - if (kernel_x_end < end) 375 - create_mapping(kernel_x_end, 376 - __phys_to_virt(kernel_x_end), 377 - end - kernel_x_end, 378 - PAGE_KERNEL); 350 + unsigned long kernel_start = __pa(_stext); 351 + unsigned long kernel_end = __pa(_end); 352 + 353 + /* 354 + * The kernel itself is mapped at page granularity. Map all other 355 + * memory, making sure we don't overwrite the existing kernel mappings. 356 + */ 357 + 358 + /* No overlap with the kernel. 
*/ 359 + if (end < kernel_start || start >= kernel_end) { 360 + __create_pgd_mapping(pgd, start, __phys_to_virt(start), 361 + end - start, PAGE_KERNEL, 362 + early_pgtable_alloc); 363 + return; 379 364 } 380 365 366 + /* 367 + * This block overlaps the kernel mapping. Map the portion(s) which 368 + * don't overlap. 369 + */ 370 + if (start < kernel_start) 371 + __create_pgd_mapping(pgd, start, 372 + __phys_to_virt(start), 373 + kernel_start - start, PAGE_KERNEL, 374 + early_pgtable_alloc); 375 + if (kernel_end < end) 376 + __create_pgd_mapping(pgd, kernel_end, 377 + __phys_to_virt(kernel_end), 378 + end - kernel_end, PAGE_KERNEL, 379 + early_pgtable_alloc); 381 380 } 382 - #else 383 - static void __init __map_memblock(phys_addr_t start, phys_addr_t end) 384 - { 385 - create_mapping(start, __phys_to_virt(start), end - start, 386 - PAGE_KERNEL_EXEC); 387 - } 388 - #endif 389 381 390 - static void __init map_mem(void) 382 + static void __init map_mem(pgd_t *pgd) 391 383 { 392 384 struct memblock_region *reg; 393 385 ··· 394 400 if (memblock_is_nomap(reg)) 395 401 continue; 396 402 397 - __map_memblock(start, end); 403 + __map_memblock(pgd, start, end); 398 404 } 399 - } 400 - 401 - static void __init fixup_executable(void) 402 - { 403 - #ifdef CONFIG_DEBUG_RODATA 404 - /* now that we are actually fully mapped, make the start/end more fine grained */ 405 - if (!IS_ALIGNED((unsigned long)_stext, SWAPPER_BLOCK_SIZE)) { 406 - unsigned long aligned_start = round_down(__pa(_stext), 407 - SWAPPER_BLOCK_SIZE); 408 - 409 - create_mapping(aligned_start, __phys_to_virt(aligned_start), 410 - __pa(_stext) - aligned_start, 411 - PAGE_KERNEL); 412 - } 413 - 414 - if (!IS_ALIGNED((unsigned long)__init_end, SWAPPER_BLOCK_SIZE)) { 415 - unsigned long aligned_end = round_up(__pa(__init_end), 416 - SWAPPER_BLOCK_SIZE); 417 - create_mapping(__pa(__init_end), (unsigned long)__init_end, 418 - aligned_end - __pa(__init_end), 419 - PAGE_KERNEL); 420 - } 421 - #endif 422 405 } 423 406 424 407 
#ifdef CONFIG_DEBUG_RODATA ··· 415 444 PAGE_KERNEL); 416 445 } 417 446 447 + static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end, 448 + pgprot_t prot) 449 + { 450 + phys_addr_t pa_start = __pa(va_start); 451 + unsigned long size = va_end - va_start; 452 + 453 + BUG_ON(!PAGE_ALIGNED(pa_start)); 454 + BUG_ON(!PAGE_ALIGNED(size)); 455 + 456 + __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, 457 + early_pgtable_alloc); 458 + } 459 + 460 + /* 461 + * Create fine-grained mappings for the kernel. 462 + */ 463 + static void __init map_kernel(pgd_t *pgd) 464 + { 465 + 466 + map_kernel_chunk(pgd, _stext, _etext, PAGE_KERNEL_EXEC); 467 + map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC); 468 + map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL); 469 + 470 + /* 471 + * The fixmap falls in a separate pgd to the kernel, and doesn't live 472 + * in the carveout for the swapper_pg_dir. We can simply re-use the 473 + * existing dir for the fixmap. 474 + */ 475 + set_pgd(pgd_offset_raw(pgd, FIXADDR_START), *pgd_offset_k(FIXADDR_START)); 476 + 477 + kasan_copy_shadow(pgd); 478 + } 479 + 418 480 /* 419 481 * paging_init() sets up the page tables, initialises the zone memory 420 482 * maps and sets up the zero page. 421 483 */ 422 484 void __init paging_init(void) 423 485 { 424 - map_mem(); 425 - fixup_executable(); 486 + phys_addr_t pgd_phys = early_pgtable_alloc(); 487 + pgd_t *pgd = pgd_set_fixmap(pgd_phys); 488 + 489 + map_kernel(pgd); 490 + map_mem(pgd); 491 + 492 + /* 493 + * We want to reuse the original swapper_pg_dir so we don't have to 494 + * communicate the new address to non-coherent secondaries in 495 + * secondary_entry, and so cpu_switch_mm can generate the address with 496 + * adrp+add rather than a load from some global variable. 497 + * 498 + * To do this we need to go via a temporary pgd. 
499 + */ 500 + cpu_replace_ttbr1(__va(pgd_phys)); 501 + memcpy(swapper_pg_dir, pgd, PAGE_SIZE); 502 + cpu_replace_ttbr1(swapper_pg_dir); 503 + 504 + pgd_clear_fixmap(); 505 + memblock_free(pgd_phys, PAGE_SIZE); 506 + 507 + /* 508 + * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd 509 + * allocated with it. 510 + */ 511 + memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE, 512 + SWAPPER_DIR_SIZE - PAGE_SIZE); 426 513 427 514 bootmem_init(); 428 515 }