Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/efistub: Avoid legacy decompressor when doing EFI boot

The bare metal decompressor code was never really intended to run in a
hosted environment such as the EFI boot services, and does a few things
that are becoming problematic in the context of EFI boot now that the
logo requirements are getting tighter: EFI executables will no longer be
allowed to consist of a single executable section that is mapped with
read, write and execute permissions if they are intended for use in a
context where Secure Boot is enabled (and where Microsoft's set of
certificates is used, i.e., every x86 PC built to run Windows).

To avoid stepping on reserved memory before having inspected the E820
tables, and to ensure the correct placement when running a kernel build
that is non-relocatable, the bare metal decompressor moves its own
executable image to the end of the allocation that was reserved for it,
in order to perform the decompression in place. This means the region in
question requires both write and execute permissions, which either need
to be given upfront (which EFI will no longer permit), or need to be
applied on demand using the existing page fault handling framework.

However, the physical placement of the kernel is usually randomized
anyway, and even if it isn't, a dedicated decompression output buffer
can be allocated anywhere in memory using EFI APIs when still running in
the boot services, given that EFI support already implies a relocatable
kernel. This means that decompression in place is never necessary, nor
is moving the compressed image from one end to the other.

Since EFI already maps all of memory 1:1, it is also unnecessary to
create new page tables or handle page faults when decompressing the
kernel. That means there is also no need to replace the special
exception handlers for SEV. Generally, there is little need to do
any of the things that the decompressor does beyond

- initialize SEV encryption, if needed,
- perform the 4/5 level paging switch, if needed,
- decompress the kernel
- relocate the kernel

So do all of this from the EFI stub code, and avoid the bare metal
decompressor altogether.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20230807162720.545787-24-ardb@kernel.org

Authored by Ard Biesheuvel and committed by Borislav Petkov (AMD).
Commit: a1b87d54 (parent/related: 31c77a50)

+84 -191
+5
arch/x86/boot/compressed/Makefile
··· 74 74 ifeq ($(CONFIG_LD_IS_BFD),y) 75 75 LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments) 76 76 endif 77 + ifeq ($(CONFIG_EFI_STUB),y) 78 + # ensure that the static EFI stub library will be pulled in, even if it is 79 + # never referenced explicitly from the startup code 80 + LDFLAGS_vmlinux += -u efi_pe_entry 81 + endif 77 82 LDFLAGS_vmlinux += -T 78 83 79 84 hostprogs := mkpiggy
-55
arch/x86/boot/compressed/efi_mixed.S
··· 269 269 jmp startup_32 270 270 SYM_FUNC_END(efi32_entry) 271 271 272 - #define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) 273 - #define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) 274 - #define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) 275 - 276 272 /* 277 273 * efi_status_t efi32_pe_entry(efi_handle_t image_handle, 278 274 * efi_system_table_32_t *sys_table) ··· 276 280 SYM_FUNC_START(efi32_pe_entry) 277 281 pushl %ebp 278 282 movl %esp, %ebp 279 - pushl %eax // dummy push to allocate loaded_image 280 - 281 283 pushl %ebx // save callee-save registers 282 284 pushl %edi 283 285 ··· 284 290 movl $0x80000003, %eax // EFI_UNSUPPORTED 285 291 jnz 2f 286 292 287 - call 1f 288 - 1: pop %ebx 289 - 290 - /* Get the loaded image protocol pointer from the image handle */ 291 - leal -4(%ebp), %eax 292 - pushl %eax // &loaded_image 293 - leal (loaded_image_proto - 1b)(%ebx), %eax 294 - pushl %eax // pass the GUID address 295 - pushl 8(%ebp) // pass the image handle 296 - 297 - /* 298 - * Note the alignment of the stack frame. 
299 - * sys_table 300 - * handle <-- 16-byte aligned on entry by ABI 301 - * return address 302 - * frame pointer 303 - * loaded_image <-- local variable 304 - * saved %ebx <-- 16-byte aligned here 305 - * saved %edi 306 - * &loaded_image 307 - * &loaded_image_proto 308 - * handle <-- 16-byte aligned for call to handle_protocol 309 - */ 310 - 311 - movl 12(%ebp), %eax // sys_table 312 - movl ST32_boottime(%eax), %eax // sys_table->boottime 313 - call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol 314 - addl $12, %esp // restore argument space 315 - testl %eax, %eax 316 - jnz 2f 317 - 318 293 movl 8(%ebp), %ecx // image_handle 319 294 movl 12(%ebp), %edx // sys_table 320 - movl -4(%ebp), %esi // loaded_image 321 - movl LI32_image_base(%esi), %esi // loaded_image->image_base 322 - leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32 323 - /* 324 - * We need to set the image_offset variable here since startup_32() will 325 - * use it before we get to the 64-bit efi_pe_entry() in C code. 326 - */ 327 - subl %esi, %ebp // calculate image_offset 328 - movl %ebp, (image_offset - 1b)(%ebx) // save image_offset 329 295 xorl %esi, %esi 330 296 jmp efi32_entry // pass %ecx, %edx, %esi 331 297 // no other registers remain live ··· 303 349 jmp efi_handover_entry 304 350 SYM_FUNC_END(efi64_stub_entry) 305 351 #endif 306 - 307 - .section ".rodata" 308 - /* EFI loaded image protocol GUID */ 309 - .balign 4 310 - SYM_DATA_START_LOCAL(loaded_image_proto) 311 - .long 0x5b1b31a1 312 - .word 0x9562, 0x11d2 313 - .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b 314 - SYM_DATA_END(loaded_image_proto) 315 352 316 353 .data 317 354 .balign 8
-13
arch/x86/boot/compressed/head_32.S
··· 84 84 85 85 #ifdef CONFIG_RELOCATABLE 86 86 leal startup_32@GOTOFF(%edx), %ebx 87 - 88 - #ifdef CONFIG_EFI_STUB 89 - /* 90 - * If we were loaded via the EFI LoadImage service, startup_32() will be at an 91 - * offset to the start of the space allocated for the image. efi_pe_entry() will 92 - * set up image_offset to tell us where the image actually starts, so that we 93 - * can use the full available buffer. 94 - * image_offset = startup_32 - image_base 95 - * Otherwise image_offset will be zero and has no effect on the calculations. 96 - */ 97 - subl image_offset@GOTOFF(%edx), %ebx 98 - #endif 99 - 100 87 movl BP_kernel_alignment(%esi), %eax 101 88 decl %eax 102 89 addl %eax, %ebx
-27
arch/x86/boot/compressed/head_64.S
··· 146 146 147 147 #ifdef CONFIG_RELOCATABLE 148 148 movl %ebp, %ebx 149 - 150 - #ifdef CONFIG_EFI_STUB 151 - /* 152 - * If we were loaded via the EFI LoadImage service, startup_32 will be at an 153 - * offset to the start of the space allocated for the image. efi_pe_entry will 154 - * set up image_offset to tell us where the image actually starts, so that we 155 - * can use the full available buffer. 156 - * image_offset = startup_32 - image_base 157 - * Otherwise image_offset will be zero and has no effect on the calculations. 158 - */ 159 - subl rva(image_offset)(%ebp), %ebx 160 - #endif 161 - 162 149 movl BP_kernel_alignment(%esi), %eax 163 150 decl %eax 164 151 addl %eax, %ebx ··· 322 335 /* Start with the delta to where the kernel will run at. */ 323 336 #ifdef CONFIG_RELOCATABLE 324 337 leaq startup_32(%rip) /* - $startup_32 */, %rbp 325 - 326 - #ifdef CONFIG_EFI_STUB 327 - /* 328 - * If we were loaded via the EFI LoadImage service, startup_32 will be at an 329 - * offset to the start of the space allocated for the image. efi_pe_entry will 330 - * set up image_offset to tell us where the image actually starts, so that we 331 - * can use the full available buffer. 332 - * image_offset = startup_32 - image_base 333 - * Otherwise image_offset will be zero and has no effect on the calculations. 334 - */ 335 - movl image_offset(%rip), %eax 336 - subq %rax, %rbp 337 - #endif 338 - 339 338 movl BP_kernel_alignment(%rsi), %eax 340 339 decl %eax 341 340 addq %rax, %rbp
+5 -2
arch/x86/include/asm/efi.h
··· 90 90 } 91 91 92 92 #ifdef CONFIG_X86_32 93 + #define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1) 94 + 93 95 #define arch_efi_call_virt_setup() \ 94 96 ({ \ 95 97 efi_fpu_begin(); \ ··· 105 103 }) 106 104 107 105 #else /* !CONFIG_X86_32 */ 108 - 109 - #define EFI_LOADER_SIGNATURE "EL64" 106 + #define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT 110 107 111 108 extern asmlinkage u64 __efi_call(void *fp, ...); 112 109 ··· 218 217 /* arch specific definitions used by the stub code */ 219 218 220 219 #ifdef CONFIG_EFI_MIXED 220 + 221 + #define EFI_ALLOC_LIMIT (efi_is_64bit() ? ULONG_MAX : U32_MAX) 221 222 222 223 #define ARCH_HAS_EFISTUB_WRAPPERS 223 224
+2
arch/x86/include/asm/sev.h
··· 164 164 __sev_es_nmi_complete(); 165 165 } 166 166 extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); 167 + extern void sev_enable(struct boot_params *bp); 167 168 168 169 static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) 169 170 { ··· 219 218 static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } 220 219 static inline void sev_es_nmi_complete(void) { } 221 220 static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } 221 + static inline void sev_enable(struct boot_params *bp) { } 222 222 static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } 223 223 static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } 224 224 static inline void setup_ghcb(void) { }
+72 -94
drivers/firmware/efi/libstub/x86-stub.c
··· 15 15 #include <asm/setup.h> 16 16 #include <asm/desc.h> 17 17 #include <asm/boot.h> 18 + #include <asm/kaslr.h> 18 19 #include <asm/sev.h> 19 20 20 21 #include "efistub.h" 21 22 #include "x86-stub.h" 22 23 23 - /* Maximum physical address for 64-bit kernel with 4-level paging */ 24 - #define MAXMEM_X86_64_4LEVEL (1ull << 46) 25 - 26 24 const efi_system_table_t *efi_system_table; 27 25 const efi_dxe_services_table_t *efi_dxe_table; 28 - u32 image_offset __section(".data"); 29 26 static efi_loaded_image_t *image = NULL; 30 27 static efi_memory_attribute_protocol_t *memattr; 31 28 ··· 284 287 } 285 288 } 286 289 287 - extern const u8 startup_32[], startup_64[]; 288 - 289 - static void 290 - setup_memory_protection(unsigned long image_base, unsigned long image_size) 291 - { 292 - #ifdef CONFIG_64BIT 293 - if (image_base != (unsigned long)startup_32) 294 - efi_adjust_memory_range_protection(image_base, image_size); 295 - #else 296 - /* 297 - * Clear protection flags on a whole range of possible 298 - * addresses used for KASLR. We don't need to do that 299 - * on x86_64, since KASLR/extraction is performed after 300 - * dedicated identity page tables are built and we only 301 - * need to remove possible protection on relocated image 302 - * itself disregarding further relocations. 
303 - */ 304 - efi_adjust_memory_range_protection(LOAD_PHYSICAL_ADDR, 305 - KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR); 306 - #endif 307 - } 308 - 309 290 static void setup_unaccepted_memory(void) 310 291 { 311 292 efi_guid_t mem_acceptance_proto = OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID; ··· 309 334 310 335 static const efi_char16_t apple[] = L"Apple"; 311 336 312 - static void setup_quirks(struct boot_params *boot_params, 313 - unsigned long image_base, 314 - unsigned long image_size) 337 + static void setup_quirks(struct boot_params *boot_params) 315 338 { 316 339 efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) 317 340 efi_table_attr(efi_system_table, fw_vendor); ··· 318 345 if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) 319 346 retrieve_apple_device_properties(boot_params); 320 347 } 321 - 322 - if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES)) 323 - setup_memory_protection(image_base, image_size); 324 348 } 325 349 326 350 /* ··· 470 500 } 471 501 472 502 image_base = efi_table_attr(image, image_base); 473 - image_offset = (void *)startup_32 - image_base; 474 503 475 504 status = efi_allocate_pages(sizeof(struct boot_params), 476 505 (unsigned long *)&boot_params, ULONG_MAX); ··· 773 804 return false; 774 805 } 775 806 807 + static void efi_get_seed(void *seed, int size) 808 + { 809 + efi_get_random_bytes(size, seed); 810 + 811 + /* 812 + * This only updates seed[0] when running on 32-bit, but in that case, 813 + * seed[1] is not used anyway, as there is no virtual KASLR on 32-bit. 
814 + */ 815 + *(unsigned long *)seed ^= kaslr_get_random_long("EFI"); 816 + } 817 + 818 + static void error(char *str) 819 + { 820 + efi_warn("Decompression failed: %s\n", str); 821 + } 822 + 823 + static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry) 824 + { 825 + unsigned long virt_addr = LOAD_PHYSICAL_ADDR; 826 + unsigned long addr, alloc_size, entry; 827 + efi_status_t status; 828 + u32 seed[2] = {}; 829 + 830 + /* determine the required size of the allocation */ 831 + alloc_size = ALIGN(max_t(unsigned long, output_len, kernel_total_size), 832 + MIN_KERNEL_ALIGN); 833 + 834 + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && !efi_nokaslr) { 835 + u64 range = KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR - kernel_total_size; 836 + 837 + efi_get_seed(seed, sizeof(seed)); 838 + 839 + virt_addr += (range * seed[1]) >> 32; 840 + virt_addr &= ~(CONFIG_PHYSICAL_ALIGN - 1); 841 + } 842 + 843 + status = efi_random_alloc(alloc_size, CONFIG_PHYSICAL_ALIGN, &addr, 844 + seed[0], EFI_LOADER_CODE, 845 + EFI_X86_KERNEL_ALLOC_LIMIT); 846 + if (status != EFI_SUCCESS) 847 + return status; 848 + 849 + entry = decompress_kernel((void *)addr, virt_addr, error); 850 + if (entry == ULONG_MAX) { 851 + efi_free(alloc_size, addr); 852 + return EFI_LOAD_ERROR; 853 + } 854 + 855 + *kernel_entry = addr + entry; 856 + 857 + efi_adjust_memory_range_protection(addr, kernel_total_size); 858 + 859 + return EFI_SUCCESS; 860 + } 861 + 776 862 static void __noreturn enter_kernel(unsigned long kernel_addr, 777 863 struct boot_params *boot_params) 778 864 { ··· 847 823 struct boot_params *boot_params) 848 824 { 849 825 efi_guid_t guid = EFI_MEMORY_ATTRIBUTE_PROTOCOL_GUID; 850 - unsigned long bzimage_addr = (unsigned long)startup_32; 851 - unsigned long buffer_start, buffer_end; 852 826 struct setup_header *hdr = &boot_params->hdr; 853 827 const struct linux_efi_initrd *initrd = NULL; 828 + unsigned long kernel_entry; 854 829 efi_status_t status; 855 830 856 831 efi_system_table = sys_table_arg; 
··· 878 855 goto fail; 879 856 } 880 857 881 - /* 882 - * If the kernel isn't already loaded at a suitable address, 883 - * relocate it. 884 - * 885 - * It must be loaded above LOAD_PHYSICAL_ADDR. 886 - * 887 - * The maximum address for 64-bit is 1 << 46 for 4-level paging. This 888 - * is defined as the macro MAXMEM, but unfortunately that is not a 889 - * compile-time constant if 5-level paging is configured, so we instead 890 - * define our own macro for use here. 891 - * 892 - * For 32-bit, the maximum address is complicated to figure out, for 893 - * now use KERNEL_IMAGE_SIZE, which will be 512MiB, the same as what 894 - * KASLR uses. 895 - * 896 - * Also relocate it if image_offset is zero, i.e. the kernel wasn't 897 - * loaded by LoadImage, but rather by a bootloader that called the 898 - * handover entry. The reason we must always relocate in this case is 899 - * to handle the case of systemd-boot booting a unified kernel image, 900 - * which is a PE executable that contains the bzImage and an initrd as 901 - * COFF sections. The initrd section is placed after the bzImage 902 - * without ensuring that there are at least init_size bytes available 903 - * for the bzImage, and thus the compressed kernel's startup code may 904 - * overwrite the initrd unless it is moved out of the way. 
905 - */ 906 - 907 - buffer_start = ALIGN(bzimage_addr - image_offset, 908 - hdr->kernel_alignment); 909 - buffer_end = buffer_start + hdr->init_size; 910 - 911 - if ((buffer_start < LOAD_PHYSICAL_ADDR) || 912 - (IS_ENABLED(CONFIG_X86_32) && buffer_end > KERNEL_IMAGE_SIZE) || 913 - (IS_ENABLED(CONFIG_X86_64) && buffer_end > MAXMEM_X86_64_4LEVEL) || 914 - (image_offset == 0)) { 915 - extern char _bss[]; 916 - 917 - status = efi_relocate_kernel(&bzimage_addr, 918 - (unsigned long)_bss - bzimage_addr, 919 - hdr->init_size, 920 - hdr->pref_address, 921 - hdr->kernel_alignment, 922 - LOAD_PHYSICAL_ADDR); 923 - if (status != EFI_SUCCESS) { 924 - efi_err("efi_relocate_kernel() failed!\n"); 925 - goto fail; 926 - } 927 - /* 928 - * Now that we've copied the kernel elsewhere, we no longer 929 - * have a set up block before startup_32(), so reset image_offset 930 - * to zero in case it was set earlier. 931 - */ 932 - image_offset = 0; 933 - } 934 - 935 858 #ifdef CONFIG_CMDLINE_BOOL 936 859 status = efi_parse_options(CONFIG_CMDLINE); 937 860 if (status != EFI_SUCCESS) { ··· 893 924 efi_err("Failed to parse options\n"); 894 925 goto fail; 895 926 } 927 + } 928 + 929 + status = efi_decompress_kernel(&kernel_entry); 930 + if (status != EFI_SUCCESS) { 931 + efi_err("Failed to decompress kernel\n"); 932 + goto fail; 896 933 } 897 934 898 935 /* ··· 940 965 941 966 setup_efi_pci(boot_params); 942 967 943 - setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start); 968 + setup_quirks(boot_params); 944 969 945 970 setup_unaccepted_memory(); 946 971 ··· 950 975 goto fail; 951 976 } 952 977 978 + /* 979 + * Call the SEV init code while still running with the firmware's 980 + * GDT/IDT, so #VC exceptions will be handled by EFI. 
981 + */ 982 + sev_enable(boot_params); 983 + 953 984 efi_5level_switch(); 954 985 955 - if (IS_ENABLED(CONFIG_X86_64)) 956 - bzimage_addr += startup_64 - startup_32; 957 - 958 - enter_kernel(bzimage_addr, boot_params); 986 + enter_kernel(kernel_entry, boot_params); 959 987 fail: 960 988 efi_err("efi_stub_entry() failed!\n"); 961 989