Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: allow kernel Image to be loaded anywhere in physical memory

This relaxes the kernel Image placement requirements, so that it
may be placed at any 2 MB aligned offset in physical memory.

This is accomplished by ignoring PHYS_OFFSET when installing
memblocks, and accounting for the apparent virtual offset of
the kernel Image. As a result, virtual address references
below PAGE_OFFSET are correctly mapped onto physical references
into the kernel Image regardless of where it sits in memory.

Special care needs to be taken for dealing with memory limits passed
via mem=, since the generic implementation clips memory top down, which
may clip the kernel image itself if it is loaded high up in memory. To
deal with this case, we simply add back the memory covering the kernel
image, which may result in more memory being retained than was passed
as a mem= parameter.

Since mem= should not be considered a production feature, a panic notifier
handler is installed that dumps the memory limit at panic time if one was
set.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>

authored by

Ard Biesheuvel and committed by
Catalin Marinas
a7f8de16 a89dea58

+119 -39
+13 -7
Documentation/arm64/booting.txt
··· 109 109 1 - 4K 110 110 2 - 16K 111 111 3 - 64K 112 - Bits 3-63: Reserved. 112 + Bit 3: Kernel physical placement 113 + 0 - 2MB aligned base should be as close as possible 114 + to the base of DRAM, since memory below it is not 115 + accessible via the linear mapping 116 + 1 - 2MB aligned base may be anywhere in physical 117 + memory 118 + Bits 4-63: Reserved. 113 119 114 120 - When image_size is zero, a bootloader should attempt to keep as much 115 121 memory as possible free for use by the kernel immediately after the ··· 123 117 depending on selected features, and is effectively unbound. 124 118 125 119 The Image must be placed text_offset bytes from a 2MB aligned base 126 - address near the start of usable system RAM and called there. Memory 127 - below that base address is currently unusable by Linux, and therefore it 128 - is strongly recommended that this location is the start of system RAM. 129 - The region between the 2 MB aligned base address and the start of the 130 - image has no special significance to the kernel, and may be used for 131 - other purposes. 120 + address anywhere in usable system RAM and called there. The region 121 + between the 2 MB aligned base address and the start of the image has no 122 + special significance to the kernel, and may be used for other purposes. 132 123 At least image_size bytes from the start of the image must be free for 133 124 use by the kernel. 125 + NOTE: versions prior to v4.6 cannot make use of memory below the 126 + physical offset of the Image so it is recommended that the Image be 127 + placed as close as possible to the start of system RAM. 134 128 135 129 Any memory described to the kernel (even that below the start of the 136 130 image) which is not marked as reserved from the kernel (e.g., with a
+6
arch/arm64/include/asm/boot.h
··· 11 11 #define MIN_FDT_ALIGN 8 12 12 #define MAX_FDT_SIZE SZ_2M 13 13 14 + /* 15 + * arm64 requires the kernel image to be placed 16 + * TEXT_OFFSET bytes beyond a 2 MB aligned base 17 + */ 18 + #define MIN_KIMG_ALIGN SZ_2M 19 + 14 20 #endif
+12
arch/arm64/include/asm/kernel-pgtable.h
··· 79 79 #define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS) 80 80 #endif 81 81 82 + /* 83 + * To make optimal use of block mappings when laying out the linear 84 + * mapping, round down the base of physical memory to a size that can 85 + * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE 86 + * (64k granule), or a multiple that can be mapped using contiguous bits 87 + * in the page tables: 32 * PMD_SIZE (16k granule) 88 + */ 89 + #ifdef CONFIG_ARM64_64K_PAGES 90 + #define ARM64_MEMSTART_ALIGN SZ_512M 91 + #else 92 + #define ARM64_MEMSTART_ALIGN SZ_1G 93 + #endif 82 94 83 95 #endif /* __ASM_KERNEL_PGTABLE_H */
+1 -16
arch/arm64/include/asm/kvm_asm.h
··· 26 26 #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 27 27 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) 28 28 29 - #define kvm_ksym_ref(sym) ((void *)&sym + kvm_ksym_shift) 29 + #define kvm_ksym_ref(sym) phys_to_virt((u64)&sym - kimage_voffset) 30 30 31 31 #ifndef __ASSEMBLY__ 32 - #if __GNUC__ > 4 33 - #define kvm_ksym_shift (PAGE_OFFSET - KIMAGE_VADDR) 34 - #else 35 - /* 36 - * GCC versions 4.9 and older will fold the constant below into the addend of 37 - * the reference to 'sym' above if kvm_ksym_shift is declared static or if the 38 - * constant is used directly. However, since we use the small code model for 39 - * the core kernel, the reference to 'sym' will be emitted as a adrp/add pair, 40 - * with a +/- 4 GB range, resulting in linker relocation errors if the shift 41 - * is sufficiently large. So prevent the compiler from folding the shift into 42 - * the addend, by making the shift a variable with external linkage. 43 - */ 44 - __weak u64 kvm_ksym_shift = PAGE_OFFSET - KIMAGE_VADDR; 45 - #endif 46 - 47 32 struct kvm; 48 33 struct kvm_vcpu; 49 34
+10 -8
arch/arm64/include/asm/memory.h
··· 24 24 #include <linux/compiler.h> 25 25 #include <linux/const.h> 26 26 #include <linux/types.h> 27 + #include <asm/bug.h> 27 28 #include <asm/sizes.h> 28 29 29 30 /* ··· 89 88 #define __virt_to_phys(x) ({ \ 90 89 phys_addr_t __x = (phys_addr_t)(x); \ 91 90 __x >= PAGE_OFFSET ? (__x - PAGE_OFFSET + PHYS_OFFSET) : \ 92 - (__x - KIMAGE_VADDR + PHYS_OFFSET); }) 91 + (__x - kimage_voffset); }) 93 92 94 93 #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) 95 - #define __phys_to_kimg(x) ((unsigned long)((x) - PHYS_OFFSET + KIMAGE_VADDR)) 94 + #define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset)) 96 95 97 96 /* 98 97 * Convert a page to/from a physical address ··· 134 133 135 134 extern phys_addr_t memstart_addr; 136 135 /* PHYS_OFFSET - the physical address of the start of memory. */ 137 - #define PHYS_OFFSET ({ memstart_addr; }) 136 + #define PHYS_OFFSET ({ BUG_ON(memstart_addr & 1); memstart_addr; }) 137 + 138 + /* the offset between the kernel virtual and physical mappings */ 139 + extern u64 kimage_voffset; 138 140 139 141 /* 140 - * The maximum physical address that the linear direct mapping 141 - * of system RAM can cover. (PAGE_OFFSET can be interpreted as 142 - * a 2's complement signed quantity and negated to derive the 143 - * maximum size of the linear mapping.) 142 + * Allow all memory at the discovery stage. We will clip it later. 144 143 */ 145 - #define MAX_MEMBLOCK_ADDR ({ memstart_addr - PAGE_OFFSET - 1; }) 144 + #define MIN_MEMBLOCK_ADDR 0 145 + #define MAX_MEMBLOCK_ADDR U64_MAX 146 146 147 147 /* 148 148 * PFNs are used to describe any physical page; this means
+5 -1
arch/arm64/kernel/head.S
··· 428 428 and x4, x4, #~(THREAD_SIZE - 1) 429 429 msr sp_el0, x4 // Save thread_info 430 430 str_l x21, __fdt_pointer, x5 // Save FDT pointer 431 - str_l x24, memstart_addr, x6 // Save PHYS_OFFSET 431 + 432 + ldr x4, =KIMAGE_VADDR // Save the offset between 433 + sub x4, x4, x24 // the kernel virtual and 434 + str_l x4, kimage_voffset, x5 // physical mappings 435 + 432 436 mov x29, #0 433 437 #ifdef CONFIG_KASAN 434 438 bl kasan_early_init
+8 -5
arch/arm64/kernel/image.h
··· 42 42 #endif 43 43 44 44 #ifdef CONFIG_CPU_BIG_ENDIAN 45 - #define __HEAD_FLAG_BE 1 45 + #define __HEAD_FLAG_BE 1 46 46 #else 47 - #define __HEAD_FLAG_BE 0 47 + #define __HEAD_FLAG_BE 0 48 48 #endif 49 49 50 - #define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) 50 + #define __HEAD_FLAG_PAGE_SIZE ((PAGE_SHIFT - 10) / 2) 51 51 52 - #define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ 53 - (__HEAD_FLAG_PAGE_SIZE << 1)) 52 + #define __HEAD_FLAG_PHYS_BASE 1 53 + 54 + #define __HEAD_FLAGS ((__HEAD_FLAG_BE << 0) | \ 55 + (__HEAD_FLAG_PAGE_SIZE << 1) | \ 56 + (__HEAD_FLAG_PHYS_BASE << 3)) 54 57 55 58 /* 56 59 * These will output as part of the Image header, which should be little-endian
+61 -2
arch/arm64/mm/init.c
··· 35 35 #include <linux/efi.h> 36 36 #include <linux/swiotlb.h> 37 37 38 + #include <asm/boot.h> 38 39 #include <asm/fixmap.h> 39 40 #include <asm/kasan.h> 41 + #include <asm/kernel-pgtable.h> 40 42 #include <asm/memory.h> 41 43 #include <asm/sections.h> 42 44 #include <asm/setup.h> ··· 48 46 49 47 #include "mm.h" 50 48 51 - phys_addr_t memstart_addr __read_mostly = 0; 49 + /* 50 + * We need to be able to catch inadvertent references to memstart_addr 51 + * that occur (potentially in generic code) before arm64_memblock_init() 52 + * executes, which assigns it its actual value. So use a default value 53 + * that cannot be mistaken for a real physical address. 54 + */ 55 + phys_addr_t memstart_addr __read_mostly = ~0ULL; 52 56 phys_addr_t arm64_dma_phys_limit __read_mostly; 53 57 54 58 #ifdef CONFIG_BLK_DEV_INITRD ··· 168 160 169 161 void __init arm64_memblock_init(void) 170 162 { 171 - memblock_enforce_memory_limit(memory_limit); 163 + const s64 linear_region_size = -(s64)PAGE_OFFSET; 164 + 165 + /* 166 + * Select a suitable value for the base of physical memory. 167 + */ 168 + memstart_addr = round_down(memblock_start_of_DRAM(), 169 + ARM64_MEMSTART_ALIGN); 170 + 171 + /* 172 + * Remove the memory that we will not be able to cover with the 173 + * linear mapping. Take care not to clip the kernel which may be 174 + * high in memory. 175 + */ 176 + memblock_remove(max(memstart_addr + linear_region_size, __pa(_end)), 177 + ULLONG_MAX); 178 + if (memblock_end_of_DRAM() > linear_region_size) 179 + memblock_remove(0, memblock_end_of_DRAM() - linear_region_size); 180 + 181 + /* 182 + * Apply the memory limit if it was set. Since the kernel may be loaded 183 + * high up in memory, add back the kernel region that must be accessible 184 + * via the linear mapping. 
185 + */ 186 + if (memory_limit != (phys_addr_t)ULLONG_MAX) { 187 + memblock_enforce_memory_limit(memory_limit); 188 + memblock_add(__pa(_text), (u64)(_end - _text)); 189 + } 172 190 173 191 /* 174 192 * Register the kernel text, kernel data, initrd, and initial ··· 420 386 421 387 __setup("keepinitrd", keepinitrd_setup); 422 388 #endif 389 + 390 + /* 391 + * Dump out memory limit information on panic. 392 + */ 393 + static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p) 394 + { 395 + if (memory_limit != (phys_addr_t)ULLONG_MAX) { 396 + pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20); 397 + } else { 398 + pr_emerg("Memory Limit: none\n"); 399 + } 400 + return 0; 401 + } 402 + 403 + static struct notifier_block mem_limit_notifier = { 404 + .notifier_call = dump_mem_limit, 405 + }; 406 + 407 + static int __init register_mem_limit_dumper(void) 408 + { 409 + atomic_notifier_chain_register(&panic_notifier_list, 410 + &mem_limit_notifier); 411 + return 0; 412 + } 413 + __initcall(register_mem_limit_dumper);
+3
arch/arm64/mm/mmu.c
··· 46 46 47 47 u64 idmap_t0sz = TCR_T0SZ(VA_BITS); 48 48 49 + u64 kimage_voffset __read_mostly; 50 + EXPORT_SYMBOL(kimage_voffset); 51 + 49 52 /* 50 53 * Empty_zero_page is a special page that is used for zero-initialized data 51 54 * and COW.