Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-kdump-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 kdump updates from Thomas Gleixner:
"Yet more kexec/kdump updates:

- Properly support kexec when AMD's memory encryption (SME) is
enabled

- Pass reserved e820 ranges to the kexec kernel so both PCI and SME
can work"

* 'x86-kdump-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
fs/proc/vmcore: Enable dumping of encrypted memory when SEV was active
x86/kexec: Set the C-bit in the identity map page table when SEV is active
x86/kexec: Do not map kexec area as decrypted when SEV is active
x86/crash: Add e820 reserved ranges to kdump kernel's e820 table
x86/mm: Rework ioremap resource mapping determination
x86/e820, ioport: Add a new I/O resource descriptor IORES_DESC_RESERVED
x86/mm: Create a workarea in the kernel for SME early encryption
x86/mm: Identify the end of the kernel area to be reserved

+155 -37
+2
arch/x86/include/asm/sections.h
··· 13 13 extern char __end_rodata_hpage_align[]; 14 14 #endif 15 15 16 + extern char __end_of_kernel_reserve[]; 17 + 16 18 #endif /* _ASM_X86_SECTIONS_H */
+6
arch/x86/kernel/crash.c
··· 375 375 walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, 376 376 memmap_entry_callback); 377 377 378 + /* Add e820 reserved ranges */ 379 + cmd.type = E820_TYPE_RESERVED; 380 + flags = IORESOURCE_MEM; 381 + walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd, 382 + memmap_entry_callback); 383 + 378 384 /* Add crashk_low_res region */ 379 385 if (crashk_low_res.end) { 380 386 ei.addr = crashk_low_res.start;
+1 -1
arch/x86/kernel/e820.c
··· 1063 1063 case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE; 1064 1064 case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; 1065 1065 case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; 1066 + case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; 1066 1067 case E820_TYPE_RESERVED_KERN: /* Fall-through: */ 1067 1068 case E820_TYPE_RAM: /* Fall-through: */ 1068 1069 case E820_TYPE_UNUSABLE: /* Fall-through: */ 1069 - case E820_TYPE_RESERVED: /* Fall-through: */ 1070 1070 default: return IORES_DESC_NONE; 1071 1071 } 1072 1072 }
+28 -3
arch/x86/kernel/machine_kexec_64.c
··· 123 123 124 124 static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) 125 125 { 126 + pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; 127 + unsigned long vaddr, paddr; 128 + int result = -ENOMEM; 126 129 p4d_t *p4d; 127 130 pud_t *pud; 128 131 pmd_t *pmd; 129 132 pte_t *pte; 130 - unsigned long vaddr, paddr; 131 - int result = -ENOMEM; 132 133 133 134 vaddr = (unsigned long)relocate_kernel; 134 135 paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); ··· 166 165 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); 167 166 } 168 167 pte = pte_offset_kernel(pmd, vaddr); 169 - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); 168 + 169 + if (sev_active()) 170 + prot = PAGE_KERNEL_EXEC; 171 + 172 + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); 170 173 return 0; 171 174 err: 172 175 return result; ··· 206 201 207 202 level4p = (pgd_t *)__va(start_pgtable); 208 203 clear_page(level4p); 204 + 205 + if (sev_active()) { 206 + info.page_flag |= _PAGE_ENC; 207 + info.kernpg_flag |= _PAGE_ENC; 208 + } 209 209 210 210 if (direct_gbpages) 211 211 info.direct_gbpages = true; ··· 654 644 kexec_mark_crashkres(false); 655 645 } 656 646 647 + /* 648 + * During a traditional boot under SME, SME will encrypt the kernel, 649 + * so the SME kexec kernel also needs to be un-encrypted in order to 650 + * replicate a normal SME boot. 651 + * 652 + * During a traditional boot under SEV, the kernel has already been 653 + * loaded encrypted, so the SEV kexec kernel needs to be encrypted in 654 + * order to replicate a normal SEV boot. 
655 + */ 657 656 int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) 658 657 { 658 + if (sev_active()) 659 + return 0; 660 + 659 661 /* 660 662 * If SME is active we need to be sure that kexec pages are 661 663 * not encrypted because when we boot to the new kernel the ··· 678 656 679 657 void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) 680 658 { 659 + if (sev_active()) 660 + return; 661 + 681 662 /* 682 663 * If SME is active we need to reset the pages back to being 683 664 * an encrypted mapping before freeing them.
+7 -1
arch/x86/kernel/setup.c
··· 836 836 837 837 void __init setup_arch(char **cmdline_p) 838 838 { 839 + /* 840 + * Reserve the memory occupied by the kernel between _text and 841 + * __end_of_kernel_reserve symbols. Any kernel sections after the 842 + * __end_of_kernel_reserve symbol must be explicitly reserved with a 843 + * separate memblock_reserve() or they will be discarded. 844 + */ 839 845 memblock_reserve(__pa_symbol(_text), 840 - (unsigned long)__bss_stop - (unsigned long)_text); 846 + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); 841 847 842 848 /* 843 849 * Make sure page 0 is always reserved because on systems with
+33 -1
arch/x86/kernel/vmlinux.lds.S
··· 368 368 __bss_stop = .; 369 369 } 370 370 371 + /* 372 + * The memory occupied from _text to here, __end_of_kernel_reserve, is 373 + * automatically reserved in setup_arch(). Anything after here must be 374 + * explicitly reserved using memblock_reserve() or it will be discarded 375 + * and treated as available memory. 376 + */ 377 + __end_of_kernel_reserve = .; 378 + 371 379 . = ALIGN(PAGE_SIZE); 372 380 .brk : AT(ADDR(.brk) - LOAD_OFFSET) { 373 381 __brk_base = .; ··· 387 379 . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ 388 380 _end = .; 389 381 382 + #ifdef CONFIG_AMD_MEM_ENCRYPT 383 + /* 384 + * Early scratch/workarea section: Lives outside of the kernel proper 385 + * (_text - _end). 386 + * 387 + * Resides after _end because even though the .brk section is after 388 + * __end_of_kernel_reserve, the .brk section is later reserved as a 389 + * part of the kernel. Since it is located after __end_of_kernel_reserve 390 + * it will be discarded and become part of the available memory. As 391 + * such, it can only be used by very early boot code and must not be 392 + * needed afterwards. 393 + * 394 + * Currently used by SME for performing in-place encryption of the 395 + * kernel during boot. Resides on a 2MB boundary to simplify the 396 + * pagetable setup used for SME in-place encryption. 397 + */ 398 + . = ALIGN(HPAGE_SIZE); 399 + .init.scratch : AT(ADDR(.init.scratch) - LOAD_OFFSET) { 400 + __init_scratch_begin = .; 401 + *(.init.scratch) 402 + . = ALIGN(HPAGE_SIZE); 403 + __init_scratch_end = .; 404 + } 405 + #endif 406 + 390 407 STABS_DEBUG 391 408 DWARF_DEBUG 392 409 393 - /* Sections to be discarded */ 394 410 DISCARDS 395 411 /DISCARD/ : { 396 412 *(.eh_frame)
+45 -26
arch/x86/mm/ioremap.c
··· 28 28 29 29 #include "physaddr.h" 30 30 31 - struct ioremap_mem_flags { 32 - bool system_ram; 33 - bool desc_other; 31 + /* 32 + * Descriptor controlling ioremap() behavior. 33 + */ 34 + struct ioremap_desc { 35 + unsigned int flags; 34 36 }; 35 37 36 38 /* ··· 64 62 return err; 65 63 } 66 64 67 - static bool __ioremap_check_ram(struct resource *res) 65 + /* Does the range (or a subset of) contain normal RAM? */ 66 + static unsigned int __ioremap_check_ram(struct resource *res) 68 67 { 69 68 unsigned long start_pfn, stop_pfn; 70 69 unsigned long i; 71 70 72 71 if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) 73 - return false; 72 + return 0; 74 73 75 74 start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; 76 75 stop_pfn = (res->end + 1) >> PAGE_SHIFT; ··· 79 76 for (i = 0; i < (stop_pfn - start_pfn); ++i) 80 77 if (pfn_valid(start_pfn + i) && 81 78 !PageReserved(pfn_to_page(start_pfn + i))) 82 - return true; 79 + return IORES_MAP_SYSTEM_RAM; 83 80 } 84 81 85 - return false; 82 + return 0; 86 83 } 87 84 88 - static int __ioremap_check_desc_other(struct resource *res) 85 + /* 86 + * In a SEV guest, NONE and RESERVED should not be mapped encrypted because 87 + * there the whole memory is already encrypted. 
88 + */ 89 + static unsigned int __ioremap_check_encrypted(struct resource *res) 89 90 { 90 - return (res->desc != IORES_DESC_NONE); 91 + if (!sev_active()) 92 + return 0; 93 + 94 + switch (res->desc) { 95 + case IORES_DESC_NONE: 96 + case IORES_DESC_RESERVED: 97 + break; 98 + default: 99 + return IORES_MAP_ENCRYPTED; 100 + } 101 + 102 + return 0; 91 103 } 92 104 93 - static int __ioremap_res_check(struct resource *res, void *arg) 105 + static int __ioremap_collect_map_flags(struct resource *res, void *arg) 94 106 { 95 - struct ioremap_mem_flags *flags = arg; 107 + struct ioremap_desc *desc = arg; 96 108 97 - if (!flags->system_ram) 98 - flags->system_ram = __ioremap_check_ram(res); 109 + if (!(desc->flags & IORES_MAP_SYSTEM_RAM)) 110 + desc->flags |= __ioremap_check_ram(res); 99 111 100 - if (!flags->desc_other) 101 - flags->desc_other = __ioremap_check_desc_other(res); 112 + if (!(desc->flags & IORES_MAP_ENCRYPTED)) 113 + desc->flags |= __ioremap_check_encrypted(res); 102 114 103 - return flags->system_ram && flags->desc_other; 115 + return ((desc->flags & (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)) == 116 + (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)); 104 117 } 105 118 106 119 /* ··· 125 106 * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). 126 107 */ 127 108 static void __ioremap_check_mem(resource_size_t addr, unsigned long size, 128 - struct ioremap_mem_flags *flags) 109 + struct ioremap_desc *desc) 129 110 { 130 111 u64 start, end; 131 112 132 113 start = (u64)addr; 133 114 end = start + size - 1; 134 - memset(flags, 0, sizeof(*flags)); 115 + memset(desc, 0, sizeof(struct ioremap_desc)); 135 116 136 - walk_mem_res(start, end, flags, __ioremap_res_check); 117 + walk_mem_res(start, end, desc, __ioremap_collect_map_flags); 137 118 } 138 119 139 120 /* ··· 150 131 * have to convert them into an offset in a page-aligned mapping, but the 151 132 * caller shouldn't need to know that small detail. 
152 133 */ 153 - static void __iomem *__ioremap_caller(resource_size_t phys_addr, 154 - unsigned long size, enum page_cache_mode pcm, 155 - void *caller, bool encrypted) 134 + static void __iomem * 135 + __ioremap_caller(resource_size_t phys_addr, unsigned long size, 136 + enum page_cache_mode pcm, void *caller, bool encrypted) 156 137 { 157 138 unsigned long offset, vaddr; 158 139 resource_size_t last_addr; 159 140 const resource_size_t unaligned_phys_addr = phys_addr; 160 141 const unsigned long unaligned_size = size; 161 - struct ioremap_mem_flags mem_flags; 142 + struct ioremap_desc io_desc; 162 143 struct vm_struct *area; 163 144 enum page_cache_mode new_pcm; 164 145 pgprot_t prot; ··· 177 158 return NULL; 178 159 } 179 160 180 - __ioremap_check_mem(phys_addr, size, &mem_flags); 161 + __ioremap_check_mem(phys_addr, size, &io_desc); 181 162 182 163 /* 183 164 * Don't allow anybody to remap normal RAM that we're using.. 184 165 */ 185 - if (mem_flags.system_ram) { 166 + if (io_desc.flags & IORES_MAP_SYSTEM_RAM) { 186 167 WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", 187 168 &phys_addr, &last_addr); 188 169 return NULL; ··· 220 201 * resulting mapping. 221 202 */ 222 203 prot = PAGE_KERNEL_IO; 223 - if ((sev_active() && mem_flags.desc_other) || encrypted) 204 + if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted) 224 205 prot = pgprot_encrypted(prot); 225 206 226 207 switch (pcm) {
+20 -2
arch/x86/mm/mem_encrypt_identity.c
··· 70 70 unsigned long vaddr_end; 71 71 }; 72 72 73 + /* 74 + * This work area lives in the .init.scratch section, which lives outside of 75 + * the kernel proper. It is sized to hold the intermediate copy buffer and 76 + * more than enough pagetable pages. 77 + * 78 + * By using this section, the kernel can be encrypted in place and it 79 + * avoids any possibility of boot parameters or initramfs images being 80 + * placed such that the in-place encryption logic overwrites them. This 81 + * section is 2MB aligned to allow for simple pagetable setup using only 82 + * PMD entries (see vmlinux.lds.S). 83 + */ 84 + static char sme_workarea[2 * PMD_PAGE_SIZE] __section(.init.scratch); 85 + 73 86 static char sme_cmdline_arg[] __initdata = "mem_encrypt"; 74 87 static char sme_cmdline_on[] __initdata = "on"; 75 88 static char sme_cmdline_off[] __initdata = "off"; ··· 324 311 } 325 312 #endif 326 313 327 - /* Set the encryption workarea to be immediately after the kernel */ 328 - workarea_start = kernel_end; 314 + /* 315 + * We're running identity mapped, so we must obtain the address to the 316 + * SME encryption workarea using rip-relative addressing. 317 + */ 318 + asm ("lea sme_workarea(%%rip), %0" 319 + : "=r" (workarea_start) 320 + : "p" (sme_workarea)); 329 321 330 322 /* 331 323 * Calculate required number of workarea bytes needed:
+3 -3
fs/proc/vmcore.c
··· 166 166 */ 167 167 ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) 168 168 { 169 - return read_from_oldmem(buf, count, ppos, 0, false); 169 + return read_from_oldmem(buf, count, ppos, 0, sev_active()); 170 170 } 171 171 172 172 /* ··· 174 174 */ 175 175 ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) 176 176 { 177 - return read_from_oldmem(buf, count, ppos, 0, sme_active()); 177 + return read_from_oldmem(buf, count, ppos, 0, mem_encrypt_active()); 178 178 } 179 179 180 180 /* ··· 374 374 buflen); 375 375 start = m->paddr + *fpos - m->offset; 376 376 tmp = read_from_oldmem(buffer, tsz, &start, 377 - userbuf, sme_active()); 377 + userbuf, mem_encrypt_active()); 378 378 if (tmp < 0) 379 379 return tmp; 380 380 buflen -= tsz;
+10
include/linux/ioport.h
··· 12 12 #ifndef __ASSEMBLY__ 13 13 #include <linux/compiler.h> 14 14 #include <linux/types.h> 15 + #include <linux/bits.h> 15 16 /* 16 17 * Resources are tree-like, allowing 17 18 * nesting etc.. ··· 134 133 IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, 135 134 IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, 136 135 IORES_DESC_DEVICE_PUBLIC_MEMORY = 7, 136 + IORES_DESC_RESERVED = 8, 137 + }; 138 + 139 + /* 140 + * Flags controlling ioremap() behavior. 141 + */ 142 + enum { 143 + IORES_MAP_SYSTEM_RAM = BIT(0), 144 + IORES_MAP_ENCRYPTED = BIT(1), 137 145 }; 138 146 139 147 /* helpers to define resources */