Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'x86_cc_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 confidential computing update from Borislav Petkov:

- Add support for unaccepted memory as specified in the UEFI spec v2.9.

The gist of it all is that Intel TDX and AMD SEV-SNP confidential
computing guests define the notion of accepting memory before using
it, thus preventing a whole class of attacks against such guests,
such as memory replay.

There are a couple of strategies for how memory should be accepted -
the current implementation accepts memory on demand.

* tag 'x86_cc_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
virt: sevguest: Add CONFIG_CRYPTO dependency
x86/efi: Safely enable unaccepted memory in UEFI
x86/sev: Add SNP-specific unaccepted memory support
x86/sev: Use large PSC requests if applicable
x86/sev: Allow for use of the early boot GHCB for PSC requests
x86/sev: Put PSC struct on the stack in prep for unaccepted memory support
x86/sev: Fix calculation of end address based on number of pages
x86/tdx: Add unaccepted memory support
x86/tdx: Refactor try_accept_one()
x86/tdx: Make _tdx_hypercall() and __tdx_module_call() available in boot stub
efi/unaccepted: Avoid load_unaligned_zeropad() stepping into unaccepted memory
efi: Add unaccepted memory support
x86/boot/compressed: Handle unaccepted memory
efi/libstub: Implement support for unaccepted memory
efi/x86: Get full memory map in allocate_e820()
mm: Add support for unaccepted memory

+1448 -307
+4
arch/x86/Kconfig
··· 887 887 bool "Intel TDX (Trust Domain Extensions) - Guest Support" 888 888 depends on X86_64 && CPU_SUP_INTEL 889 889 depends on X86_X2APIC 890 + depends on EFI_STUB 890 891 select ARCH_HAS_CC_PLATFORM 891 892 select X86_MEM_ENCRYPT 892 893 select X86_MCE 894 + select UNACCEPTED_MEMORY 893 895 help 894 896 Support running as a guest under Intel TDX. Without this support, 895 897 the guest kernel can not boot or run under TDX. ··· 1546 1544 config AMD_MEM_ENCRYPT 1547 1545 bool "AMD Secure Memory Encryption (SME) support" 1548 1546 depends on X86_64 && CPU_SUP_AMD 1547 + depends on EFI_STUB 1549 1548 select DMA_COHERENT_POOL 1550 1549 select ARCH_USE_MEMREMAP_PROT 1551 1550 select INSTRUCTION_DECODER 1552 1551 select ARCH_HAS_CC_PLATFORM 1553 1552 select X86_MEM_ENCRYPT 1553 + select UNACCEPTED_MEMORY 1554 1554 help 1555 1555 Say yes to enable support for the encryption of system memory. 1556 1556 This requires an AMD processor that supports Secure Memory
+2 -1
arch/x86/boot/compressed/Makefile
··· 106 106 endif 107 107 108 108 vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o 109 - vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o 109 + vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o $(obj)/tdx-shared.o 110 + vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o 110 111 111 112 vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o 112 113 vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o
+10
arch/x86/boot/compressed/efi.h
··· 16 16 #define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) 17 17 #define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81) 18 18 #define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) 19 + #define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31) 19 20 20 21 #define EFI32_LOADER_SIGNATURE "EL32" 21 22 #define EFI64_LOADER_SIGNATURE "EL64" ··· 33 32 } efi_table_hdr_t; 34 33 35 34 #define EFI_CONVENTIONAL_MEMORY 7 35 + #define EFI_UNACCEPTED_MEMORY 15 36 36 37 37 #define EFI_MEMORY_MORE_RELIABLE \ 38 38 ((u64)0x0000000000010000ULL) /* higher reliability */ ··· 104 102 u64 tables; 105 103 u64 smbios; 106 104 u64 reserved[8]; 105 + }; 106 + 107 + struct efi_unaccepted_memory { 108 + u32 version; 109 + u32 unit_size; 110 + u64 phys_base; 111 + u64 size; 112 + unsigned long bitmap[]; 107 113 }; 108 114 109 115 static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right)
+19
arch/x86/boot/compressed/error.c
··· 22 22 while (1) 23 23 asm("hlt"); 24 24 } 25 + 26 + /* EFI libstub provides vsnprintf() */ 27 + #ifdef CONFIG_EFI_STUB 28 + void panic(const char *fmt, ...) 29 + { 30 + static char buf[1024]; 31 + va_list args; 32 + int len; 33 + 34 + va_start(args, fmt); 35 + len = vsnprintf(buf, sizeof(buf), fmt, args); 36 + va_end(args); 37 + 38 + if (len && buf[len - 1] == '\n') 39 + buf[len - 1] = '\0'; 40 + 41 + error(buf); 42 + } 43 + #endif
+1
arch/x86/boot/compressed/error.h
··· 6 6 7 7 void warn(char *m); 8 8 void error(char *m) __noreturn; 9 + void panic(const char *fmt, ...) __noreturn __cold; 9 10 10 11 #endif /* BOOT_COMPRESSED_ERROR_H */
+28 -12
arch/x86/boot/compressed/kaslr.c
··· 672 672 } 673 673 674 674 #ifdef CONFIG_EFI 675 + 676 + /* 677 + * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are 678 + * guaranteed to be free. 679 + * 680 + * Pick free memory more conservatively than the EFI spec allows: according to 681 + * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus 682 + * available to place the kernel image into, but in practice there's firmware 683 + * where using that memory leads to crashes. Buggy vendor EFI code registers 684 + * for an event that triggers on SetVirtualAddressMap(). The handler assumes 685 + * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which 686 + * is probably true for Windows. 687 + * 688 + * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap(). 689 + */ 690 + static inline bool memory_type_is_free(efi_memory_desc_t *md) 691 + { 692 + if (md->type == EFI_CONVENTIONAL_MEMORY) 693 + return true; 694 + 695 + if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && 696 + md->type == EFI_UNACCEPTED_MEMORY) 697 + return true; 698 + 699 + return false; 700 + } 701 + 675 702 /* 676 703 * Returns true if we processed the EFI memmap, which we prefer over the E820 677 704 * table if it is available. ··· 743 716 for (i = 0; i < nr_desc; i++) { 744 717 md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); 745 718 746 - /* 747 - * Here we are more conservative in picking free memory than 748 - * the EFI spec allows: 749 - * 750 - * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also 751 - * free memory and thus available to place the kernel image into, 752 - * but in practice there's firmware where using that memory leads 753 - * to crashes. 754 - * 755 - * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. 756 - */ 757 - if (md->type != EFI_CONVENTIONAL_MEMORY) 719 + if (!memory_type_is_free(md)) 758 720 continue; 759 721 760 722 if (efi_soft_reserve_enabled() &&
+86
arch/x86/boot/compressed/mem.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + 3 + #include "error.h" 4 + #include "misc.h" 5 + #include "tdx.h" 6 + #include "sev.h" 7 + #include <asm/shared/tdx.h> 8 + 9 + /* 10 + * accept_memory() and process_unaccepted_memory() called from EFI stub which 11 + * runs before decompresser and its early_tdx_detect(). 12 + * 13 + * Enumerate TDX directly from the early users. 14 + */ 15 + static bool early_is_tdx_guest(void) 16 + { 17 + static bool once; 18 + static bool is_tdx; 19 + 20 + if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST)) 21 + return false; 22 + 23 + if (!once) { 24 + u32 eax, sig[3]; 25 + 26 + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, 27 + &sig[0], &sig[2], &sig[1]); 28 + is_tdx = !memcmp(TDX_IDENT, sig, sizeof(sig)); 29 + once = true; 30 + } 31 + 32 + return is_tdx; 33 + } 34 + 35 + void arch_accept_memory(phys_addr_t start, phys_addr_t end) 36 + { 37 + /* Platform-specific memory-acceptance call goes here */ 38 + if (early_is_tdx_guest()) { 39 + if (!tdx_accept_memory(start, end)) 40 + panic("TDX: Failed to accept memory\n"); 41 + } else if (sev_snp_enabled()) { 42 + snp_accept_memory(start, end); 43 + } else { 44 + error("Cannot accept memory: unknown platform\n"); 45 + } 46 + } 47 + 48 + bool init_unaccepted_memory(void) 49 + { 50 + guid_t guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID; 51 + struct efi_unaccepted_memory *table; 52 + unsigned long cfg_table_pa; 53 + unsigned int cfg_table_len; 54 + enum efi_type et; 55 + int ret; 56 + 57 + et = efi_get_type(boot_params); 58 + if (et == EFI_TYPE_NONE) 59 + return false; 60 + 61 + ret = efi_get_conf_table(boot_params, &cfg_table_pa, &cfg_table_len); 62 + if (ret) { 63 + warn("EFI config table not found."); 64 + return false; 65 + } 66 + 67 + table = (void *)efi_find_vendor_table(boot_params, cfg_table_pa, 68 + cfg_table_len, guid); 69 + if (!table) 70 + return false; 71 + 72 + if (table->version != 1) 73 + error("Unknown version of unaccepted memory table\n"); 74 + 75 + /* 76 + * In many cases 
unaccepted_table is already set by EFI stub, but it 77 + * has to be initialized again to cover cases when the table is not 78 + * allocated by EFI stub or EFI stub copied the kernel image with 79 + * efi_relocate_kernel() before the variable is set. 80 + * 81 + * It must be initialized before the first usage of accept_memory(). 82 + */ 83 + unaccepted_table = table; 84 + 85 + return true; 86 + }
+6
arch/x86/boot/compressed/misc.c
··· 455 455 #endif 456 456 457 457 debug_putstr("\nDecompressing Linux... "); 458 + 459 + if (init_unaccepted_memory()) { 460 + debug_putstr("Accepting memory... "); 461 + accept_memory(__pa(output), __pa(output) + needed_size); 462 + } 463 + 458 464 __decompress(input_data, input_len, NULL, NULL, output, output_len, 459 465 NULL, error); 460 466 entry_offset = parse_elf(output);
+10
arch/x86/boot/compressed/misc.h
··· 247 247 } 248 248 #endif /* CONFIG_EFI */ 249 249 250 + #ifdef CONFIG_UNACCEPTED_MEMORY 251 + bool init_unaccepted_memory(void); 252 + #else 253 + static inline bool init_unaccepted_memory(void) { return false; } 254 + #endif 255 + 256 + /* Defined in EFI stub */ 257 + extern struct efi_unaccepted_memory *unaccepted_table; 258 + void accept_memory(phys_addr_t start, phys_addr_t end); 259 + 250 260 #endif /* BOOT_COMPRESSED_MISC_H */
+53 -1
arch/x86/boot/compressed/sev.c
··· 115 115 /* Include code for early handlers */ 116 116 #include "../../kernel/sev-shared.c" 117 117 118 - static inline bool sev_snp_enabled(void) 118 + bool sev_snp_enabled(void) 119 119 { 120 120 return sev_status & MSR_AMD64_SEV_SNP_ENABLED; 121 121 } ··· 179 179 snp_register_ghcb_early(__pa(&boot_ghcb_page)); 180 180 181 181 return true; 182 + } 183 + 184 + static phys_addr_t __snp_accept_memory(struct snp_psc_desc *desc, 185 + phys_addr_t pa, phys_addr_t pa_end) 186 + { 187 + struct psc_hdr *hdr; 188 + struct psc_entry *e; 189 + unsigned int i; 190 + 191 + hdr = &desc->hdr; 192 + memset(hdr, 0, sizeof(*hdr)); 193 + 194 + e = desc->entries; 195 + 196 + i = 0; 197 + while (pa < pa_end && i < VMGEXIT_PSC_MAX_ENTRY) { 198 + hdr->end_entry = i; 199 + 200 + e->gfn = pa >> PAGE_SHIFT; 201 + e->operation = SNP_PAGE_STATE_PRIVATE; 202 + if (IS_ALIGNED(pa, PMD_SIZE) && (pa_end - pa) >= PMD_SIZE) { 203 + e->pagesize = RMP_PG_SIZE_2M; 204 + pa += PMD_SIZE; 205 + } else { 206 + e->pagesize = RMP_PG_SIZE_4K; 207 + pa += PAGE_SIZE; 208 + } 209 + 210 + e++; 211 + i++; 212 + } 213 + 214 + if (vmgexit_psc(boot_ghcb, desc)) 215 + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); 216 + 217 + pvalidate_pages(desc); 218 + 219 + return pa; 220 + } 221 + 222 + void snp_accept_memory(phys_addr_t start, phys_addr_t end) 223 + { 224 + struct snp_psc_desc desc = {}; 225 + unsigned int i; 226 + phys_addr_t pa; 227 + 228 + if (!boot_ghcb && !early_setup_ghcb()) 229 + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); 230 + 231 + pa = start; 232 + while (pa < end) 233 + pa = __snp_accept_memory(&desc, pa, end); 182 234 } 183 235 184 236 void sev_es_shutdown_ghcb(void)
+23
arch/x86/boot/compressed/sev.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * AMD SEV header for early boot related functions. 4 + * 5 + * Author: Tom Lendacky <thomas.lendacky@amd.com> 6 + */ 7 + 8 + #ifndef BOOT_COMPRESSED_SEV_H 9 + #define BOOT_COMPRESSED_SEV_H 10 + 11 + #ifdef CONFIG_AMD_MEM_ENCRYPT 12 + 13 + bool sev_snp_enabled(void); 14 + void snp_accept_memory(phys_addr_t start, phys_addr_t end); 15 + 16 + #else 17 + 18 + static inline bool sev_snp_enabled(void) { return false; } 19 + static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } 20 + 21 + #endif 22 + 23 + #endif
+2
arch/x86/boot/compressed/tdx-shared.c
··· 1 + #include "error.h" 2 + #include "../../coco/tdx/tdx-shared.c"
+1 -1
arch/x86/coco/tdx/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 3 - obj-y += tdx.o tdcall.o 3 + obj-y += tdx.o tdx-shared.o tdcall.o
+71
arch/x86/coco/tdx/tdx-shared.c
··· 1 + #include <asm/tdx.h> 2 + #include <asm/pgtable.h> 3 + 4 + static unsigned long try_accept_one(phys_addr_t start, unsigned long len, 5 + enum pg_level pg_level) 6 + { 7 + unsigned long accept_size = page_level_size(pg_level); 8 + u64 tdcall_rcx; 9 + u8 page_size; 10 + 11 + if (!IS_ALIGNED(start, accept_size)) 12 + return 0; 13 + 14 + if (len < accept_size) 15 + return 0; 16 + 17 + /* 18 + * Pass the page physical address to the TDX module to accept the 19 + * pending, private page. 20 + * 21 + * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. 22 + */ 23 + switch (pg_level) { 24 + case PG_LEVEL_4K: 25 + page_size = 0; 26 + break; 27 + case PG_LEVEL_2M: 28 + page_size = 1; 29 + break; 30 + case PG_LEVEL_1G: 31 + page_size = 2; 32 + break; 33 + default: 34 + return 0; 35 + } 36 + 37 + tdcall_rcx = start | page_size; 38 + if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) 39 + return 0; 40 + 41 + return accept_size; 42 + } 43 + 44 + bool tdx_accept_memory(phys_addr_t start, phys_addr_t end) 45 + { 46 + /* 47 + * For shared->private conversion, accept the page using 48 + * TDX_ACCEPT_PAGE TDX module call. 49 + */ 50 + while (start < end) { 51 + unsigned long len = end - start; 52 + unsigned long accept_size; 53 + 54 + /* 55 + * Try larger accepts first. It gives chance to VMM to keep 56 + * 1G/2M Secure EPT entries where possible and speeds up 57 + * process by cutting number of hypercalls (if successful). 58 + */ 59 + 60 + accept_size = try_accept_one(start, len, PG_LEVEL_1G); 61 + if (!accept_size) 62 + accept_size = try_accept_one(start, len, PG_LEVEL_2M); 63 + if (!accept_size) 64 + accept_size = try_accept_one(start, len, PG_LEVEL_4K); 65 + if (!accept_size) 66 + return false; 67 + start += accept_size; 68 + } 69 + 70 + return true; 71 + }
+3 -99
arch/x86/coco/tdx/tdx.c
··· 14 14 #include <asm/insn-eval.h> 15 15 #include <asm/pgtable.h> 16 16 17 - /* TDX module Call Leaf IDs */ 18 - #define TDX_GET_INFO 1 19 - #define TDX_GET_VEINFO 3 20 - #define TDX_GET_REPORT 4 21 - #define TDX_ACCEPT_PAGE 6 22 - #define TDX_WR 8 23 - 24 - /* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ 25 - #define TDCS_NOTIFY_ENABLES 0x9100000000000010 26 - 27 - /* TDX hypercall Leaf IDs */ 28 - #define TDVMCALL_MAP_GPA 0x10001 29 - #define TDVMCALL_REPORT_FATAL_ERROR 0x10003 30 - 31 17 /* MMIO direction */ 32 18 #define EPT_READ 0 33 19 #define EPT_WRITE 1 ··· 36 50 #define TDCALL_INVALID_OPERAND 0xc0000100 37 51 38 52 #define TDREPORT_SUBTYPE_0 0 39 - 40 - /* 41 - * Wrapper for standard use of __tdx_hypercall with no output aside from 42 - * return code. 43 - */ 44 - static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) 45 - { 46 - struct tdx_hypercall_args args = { 47 - .r10 = TDX_HYPERCALL_STANDARD, 48 - .r11 = fn, 49 - .r12 = r12, 50 - .r13 = r13, 51 - .r14 = r14, 52 - .r15 = r15, 53 - }; 54 - 55 - return __tdx_hypercall(&args); 56 - } 57 53 58 54 /* Called from __tdx_hypercall() for unrecoverable failure */ 59 55 noinstr void __tdx_hypercall_failed(void) ··· 713 745 return true; 714 746 } 715 747 716 - static bool try_accept_one(phys_addr_t *start, unsigned long len, 717 - enum pg_level pg_level) 718 - { 719 - unsigned long accept_size = page_level_size(pg_level); 720 - u64 tdcall_rcx; 721 - u8 page_size; 722 - 723 - if (!IS_ALIGNED(*start, accept_size)) 724 - return false; 725 - 726 - if (len < accept_size) 727 - return false; 728 - 729 - /* 730 - * Pass the page physical address to the TDX module to accept the 731 - * pending, private page. 732 - * 733 - * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. 
734 - */ 735 - switch (pg_level) { 736 - case PG_LEVEL_4K: 737 - page_size = 0; 738 - break; 739 - case PG_LEVEL_2M: 740 - page_size = 1; 741 - break; 742 - case PG_LEVEL_1G: 743 - page_size = 2; 744 - break; 745 - default: 746 - return false; 747 - } 748 - 749 - tdcall_rcx = *start | page_size; 750 - if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) 751 - return false; 752 - 753 - *start += accept_size; 754 - return true; 755 - } 756 - 757 748 /* 758 749 * Inform the VMM of the guest's intent for this physical page: shared with 759 750 * the VMM or private to the guest. The VMM is expected to change its mapping ··· 737 810 if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0)) 738 811 return false; 739 812 740 - /* private->shared conversion requires only MapGPA call */ 741 - if (!enc) 742 - return true; 743 - 744 - /* 745 - * For shared->private conversion, accept the page using 746 - * TDX_ACCEPT_PAGE TDX module call. 747 - */ 748 - while (start < end) { 749 - unsigned long len = end - start; 750 - 751 - /* 752 - * Try larger accepts first. It gives chance to VMM to keep 753 - * 1G/2M SEPT entries where possible and speeds up process by 754 - * cutting number of hypercalls (if successful). 755 - */ 756 - 757 - if (try_accept_one(&start, len, PG_LEVEL_1G)) 758 - continue; 759 - 760 - if (try_accept_one(&start, len, PG_LEVEL_2M)) 761 - continue; 762 - 763 - if (!try_accept_one(&start, len, PG_LEVEL_4K)) 764 - return false; 765 - } 813 + /* shared->private conversion requires memory to be accepted before use */ 814 + if (enc) 815 + return tdx_accept_memory(start, end); 766 816 767 817 return true; 768 818 }
+2
arch/x86/include/asm/efi.h
··· 31 31 32 32 #define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF 33 33 34 + #define EFI_UNACCEPTED_UNIT_SIZE PMD_SIZE 35 + 34 36 /* 35 37 * The EFI services are called through variadic functions in many cases. These 36 38 * functions are implemented in assembler and support only a fixed number of
+7 -2
arch/x86/include/asm/sev-common.h
··· 106 106 #define GHCB_HV_FT_SNP BIT_ULL(0) 107 107 #define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1) 108 108 109 - /* SNP Page State Change NAE event */ 110 - #define VMGEXIT_PSC_MAX_ENTRY 253 109 + /* 110 + * SNP Page State Change NAE event 111 + * The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which 112 + * is a local stack variable in set_pages_state(). Do not increase this value 113 + * without evaluating the impact to stack usage. 114 + */ 115 + #define VMGEXIT_PSC_MAX_ENTRY 64 111 116 112 117 struct psc_hdr { 113 118 u16 cur_entry;
+15 -8
arch/x86/include/asm/sev.h
··· 80 80 extern void vc_boot_ghcb(void); 81 81 extern bool handle_vc_boot_ghcb(struct pt_regs *regs); 82 82 83 + /* PVALIDATE return codes */ 84 + #define PVALIDATE_FAIL_SIZEMISMATCH 6 85 + 83 86 /* Software defined (when rFlags.CF = 1) */ 84 87 #define PVALIDATE_FAIL_NOUPDATE 255 85 88 86 89 /* RMP page size */ 87 90 #define RMP_PG_SIZE_4K 0 91 + #define RMP_PG_SIZE_2M 1 88 92 89 93 #define RMPADJUST_VMSA_PAGE_BIT BIT(16) 90 94 ··· 196 192 197 193 void setup_ghcb(void); 198 194 void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, 199 - unsigned int npages); 195 + unsigned long npages); 200 196 void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, 201 - unsigned int npages); 197 + unsigned long npages); 202 198 void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); 203 - void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); 204 - void snp_set_memory_private(unsigned long vaddr, unsigned int npages); 199 + void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); 200 + void snp_set_memory_private(unsigned long vaddr, unsigned long npages); 205 201 void snp_set_wakeup_secondary_cpu(void); 206 202 bool snp_init(struct boot_params *bp); 207 203 void __init __noreturn snp_abort(void); 208 204 int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); 205 + void snp_accept_memory(phys_addr_t start, phys_addr_t end); 209 206 #else 210 207 static inline void sev_es_ist_enter(struct pt_regs *regs) { } 211 208 static inline void sev_es_ist_exit(void) { } ··· 217 212 static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } 218 213 static inline void setup_ghcb(void) { } 219 214 static inline void __init 220 - early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } 215 + early_snp_set_memory_private(unsigned long vaddr, unsigned 
long paddr, unsigned long npages) { } 221 216 static inline void __init 222 - early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } 217 + early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { } 223 218 static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } 224 - static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } 225 - static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } 219 + static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { } 220 + static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } 226 221 static inline void snp_set_wakeup_secondary_cpu(void) { } 227 222 static inline bool snp_init(struct boot_params *bp) { return false; } 228 223 static inline void snp_abort(void) { } ··· 230 225 { 231 226 return -ENOTTY; 232 227 } 228 + 229 + static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } 233 230 #endif 234 231 235 232 #endif
+53
arch/x86/include/asm/shared/tdx.h
··· 10 10 #define TDX_CPUID_LEAF_ID 0x21 11 11 #define TDX_IDENT "IntelTDX " 12 12 13 + /* TDX module Call Leaf IDs */ 14 + #define TDX_GET_INFO 1 15 + #define TDX_GET_VEINFO 3 16 + #define TDX_GET_REPORT 4 17 + #define TDX_ACCEPT_PAGE 6 18 + #define TDX_WR 8 19 + 20 + /* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ 21 + #define TDCS_NOTIFY_ENABLES 0x9100000000000010 22 + 23 + /* TDX hypercall Leaf IDs */ 24 + #define TDVMCALL_MAP_GPA 0x10001 25 + #define TDVMCALL_REPORT_FATAL_ERROR 0x10003 26 + 13 27 #ifndef __ASSEMBLY__ 14 28 15 29 /* ··· 51 37 u64 __tdx_hypercall(struct tdx_hypercall_args *args); 52 38 u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); 53 39 40 + /* 41 + * Wrapper for standard use of __tdx_hypercall with no output aside from 42 + * return code. 43 + */ 44 + static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) 45 + { 46 + struct tdx_hypercall_args args = { 47 + .r10 = TDX_HYPERCALL_STANDARD, 48 + .r11 = fn, 49 + .r12 = r12, 50 + .r13 = r13, 51 + .r14 = r14, 52 + .r15 = r15, 53 + }; 54 + 55 + return __tdx_hypercall(&args); 56 + } 57 + 58 + 54 59 /* Called from __tdx_hypercall() for unrecoverable failure */ 55 60 void __tdx_hypercall_failed(void); 61 + 62 + /* 63 + * Used in __tdx_module_call() to gather the output registers' values of the 64 + * TDCALL instruction when requesting services from the TDX module. This is a 65 + * software only structure and not part of the TDX module/VMM ABI 66 + */ 67 + struct tdx_module_output { 68 + u64 rcx; 69 + u64 rdx; 70 + u64 r8; 71 + u64 r9; 72 + u64 r10; 73 + u64 r11; 74 + }; 75 + 76 + /* Used to communicate with the TDX module */ 77 + u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, 78 + struct tdx_module_output *out); 79 + 80 + bool tdx_accept_memory(phys_addr_t start, phys_addr_t end); 56 81 57 82 #endif /* !__ASSEMBLY__ */ 58 83 #endif /* _ASM_X86_SHARED_TDX_H */
+2 -19
arch/x86/include/asm/tdx.h
··· 5 5 6 6 #include <linux/init.h> 7 7 #include <linux/bits.h> 8 + 9 + #include <asm/errno.h> 8 10 #include <asm/ptrace.h> 9 11 #include <asm/shared/tdx.h> 10 12 ··· 21 19 #define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) 22 20 23 21 #ifndef __ASSEMBLY__ 24 - 25 - /* 26 - * Used to gather the output registers values of the TDCALL and SEAMCALL 27 - * instructions when requesting services from the TDX module. 28 - * 29 - * This is a software only structure and not part of the TDX module/VMM ABI. 30 - */ 31 - struct tdx_module_output { 32 - u64 rcx; 33 - u64 rdx; 34 - u64 r8; 35 - u64 r9; 36 - u64 r10; 37 - u64 r11; 38 - }; 39 22 40 23 /* 41 24 * Used by the #VE exception handler to gather the #VE exception ··· 41 54 #ifdef CONFIG_INTEL_TDX_GUEST 42 55 43 56 void __init tdx_early_init(void); 44 - 45 - /* Used to communicate with the TDX module */ 46 - u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, 47 - struct tdx_module_output *out); 48 57 49 58 void tdx_get_ve_info(struct ve_info *ve); 50 59
+27
arch/x86/include/asm/unaccepted_memory.h
··· 1 + #ifndef _ASM_X86_UNACCEPTED_MEMORY_H 2 + #define _ASM_X86_UNACCEPTED_MEMORY_H 3 + 4 + #include <linux/efi.h> 5 + #include <asm/tdx.h> 6 + #include <asm/sev.h> 7 + 8 + static inline void arch_accept_memory(phys_addr_t start, phys_addr_t end) 9 + { 10 + /* Platform-specific memory-acceptance call goes here */ 11 + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { 12 + if (!tdx_accept_memory(start, end)) 13 + panic("TDX: Failed to accept memory\n"); 14 + } else if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { 15 + snp_accept_memory(start, end); 16 + } else { 17 + panic("Cannot accept memory: unknown platform\n"); 18 + } 19 + } 20 + 21 + static inline struct efi_unaccepted_memory *efi_get_unaccepted_table(void) 22 + { 23 + if (efi.unaccepted == EFI_INVALID_TABLE_ADDR) 24 + return NULL; 25 + return __va(efi.unaccepted); 26 + } 27 + #endif
+103
arch/x86/kernel/sev-shared.c
··· 12 12 #ifndef __BOOT_COMPRESSED 13 13 #define error(v) pr_err(v) 14 14 #define has_cpuflag(f) boot_cpu_has(f) 15 + #else 16 + #undef WARN 17 + #define WARN(condition, format...) (!!(condition)) 15 18 #endif 16 19 17 20 /* I/O parameters for CPUID-related helpers */ ··· 993 990 else if (fn->eax_in == 0x80000000) 994 991 cpuid_ext_range_max = fn->eax; 995 992 } 993 + } 994 + 995 + static void pvalidate_pages(struct snp_psc_desc *desc) 996 + { 997 + struct psc_entry *e; 998 + unsigned long vaddr; 999 + unsigned int size; 1000 + unsigned int i; 1001 + bool validate; 1002 + int rc; 1003 + 1004 + for (i = 0; i <= desc->hdr.end_entry; i++) { 1005 + e = &desc->entries[i]; 1006 + 1007 + vaddr = (unsigned long)pfn_to_kaddr(e->gfn); 1008 + size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; 1009 + validate = e->operation == SNP_PAGE_STATE_PRIVATE; 1010 + 1011 + rc = pvalidate(vaddr, size, validate); 1012 + if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { 1013 + unsigned long vaddr_end = vaddr + PMD_SIZE; 1014 + 1015 + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) { 1016 + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); 1017 + if (rc) 1018 + break; 1019 + } 1020 + } 1021 + 1022 + if (rc) { 1023 + WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc); 1024 + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); 1025 + } 1026 + } 1027 + } 1028 + 1029 + static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) 1030 + { 1031 + int cur_entry, end_entry, ret = 0; 1032 + struct snp_psc_desc *data; 1033 + struct es_em_ctxt ctxt; 1034 + 1035 + vc_ghcb_invalidate(ghcb); 1036 + 1037 + /* Copy the input desc into GHCB shared buffer */ 1038 + data = (struct snp_psc_desc *)ghcb->shared_buffer; 1039 + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); 1040 + 1041 + /* 1042 + * As per the GHCB specification, the hypervisor can resume the guest 1043 + * before processing all the entries. 
Check whether all the entries 1044 + * are processed. If not, then keep retrying. Note, the hypervisor 1045 + * will update the data memory directly to indicate the status, so 1046 + * reference the data->hdr everywhere. 1047 + * 1048 + * The strategy here is to wait for the hypervisor to change the page 1049 + * state in the RMP table before guest accesses the memory pages. If the 1050 + * page state change was not successful, then later memory access will 1051 + * result in a crash. 1052 + */ 1053 + cur_entry = data->hdr.cur_entry; 1054 + end_entry = data->hdr.end_entry; 1055 + 1056 + while (data->hdr.cur_entry <= data->hdr.end_entry) { 1057 + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); 1058 + 1059 + /* This will advance the shared buffer data points to. */ 1060 + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); 1061 + 1062 + /* 1063 + * Page State Change VMGEXIT can pass error code through 1064 + * exit_info_2. 1065 + */ 1066 + if (WARN(ret || ghcb->save.sw_exit_info_2, 1067 + "SNP: PSC failed ret=%d exit_info_2=%llx\n", 1068 + ret, ghcb->save.sw_exit_info_2)) { 1069 + ret = 1; 1070 + goto out; 1071 + } 1072 + 1073 + /* Verify that reserved bit is not set */ 1074 + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { 1075 + ret = 1; 1076 + goto out; 1077 + } 1078 + 1079 + /* 1080 + * Sanity check that entry processing is not going backwards. 1081 + * This will happen only if hypervisor is tricking us. 1082 + */ 1083 + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, 1084 + "SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", 1085 + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { 1086 + ret = 1; 1087 + goto out; 1088 + } 1089 + } 1090 + 1091 + out: 1092 + return ret; 996 1093 }
+108 -148
arch/x86/kernel/sev.c
··· 119 119 120 120 struct sev_config { 121 121 __u64 debug : 1, 122 - __reserved : 63; 122 + 123 + /* 124 + * A flag used by __set_pages_state() that indicates when the 125 + * per-CPU GHCB has been created and registered and thus can be 126 + * used by the BSP instead of the early boot GHCB. 127 + * 128 + * For APs, the per-CPU GHCB is created before they are started 129 + * and registered upon startup, so this flag can be used globally 130 + * for the BSP and APs. 131 + */ 132 + ghcbs_initialized : 1, 133 + 134 + __reserved : 62; 123 135 }; 124 136 125 137 static struct sev_config sev_cfg __read_mostly; ··· 657 645 return ret; 658 646 } 659 647 660 - static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) 661 - { 662 - unsigned long vaddr_end; 663 - int rc; 664 - 665 - vaddr = vaddr & PAGE_MASK; 666 - vaddr_end = vaddr + (npages << PAGE_SHIFT); 667 - 668 - while (vaddr < vaddr_end) { 669 - rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); 670 - if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) 671 - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); 672 - 673 - vaddr = vaddr + PAGE_SIZE; 674 - } 675 - } 676 - 677 - static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) 648 + static void early_set_pages_state(unsigned long vaddr, unsigned long paddr, 649 + unsigned long npages, enum psc_op op) 678 650 { 679 651 unsigned long paddr_end; 680 652 u64 val; 653 + int ret; 654 + 655 + vaddr = vaddr & PAGE_MASK; 681 656 682 657 paddr = paddr & PAGE_MASK; 683 658 paddr_end = paddr + (npages << PAGE_SHIFT); 684 659 685 660 while (paddr < paddr_end) { 661 + if (op == SNP_PAGE_STATE_SHARED) { 662 + /* Page validation must be rescinded before changing to shared */ 663 + ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false); 664 + if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) 665 + goto e_term; 666 + } 667 + 686 668 /* 687 669 * Use the MSR protocol because this 
function can be called before 688 670 * the GHCB is established. ··· 697 691 paddr, GHCB_MSR_PSC_RESP_VAL(val))) 698 692 goto e_term; 699 693 700 - paddr = paddr + PAGE_SIZE; 694 + if (op == SNP_PAGE_STATE_PRIVATE) { 695 + /* Page validation must be performed after changing to private */ 696 + ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true); 697 + if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) 698 + goto e_term; 699 + } 700 + 701 + vaddr += PAGE_SIZE; 702 + paddr += PAGE_SIZE; 701 703 } 702 704 703 705 return; ··· 715 701 } 716 702 717 703 void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, 718 - unsigned int npages) 704 + unsigned long npages) 719 705 { 720 706 /* 721 707 * This can be invoked in early boot while running identity mapped, so ··· 730 716 * Ask the hypervisor to mark the memory pages as private in the RMP 731 717 * table. 732 718 */ 733 - early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); 734 - 735 - /* Validate the memory pages after they've been added in the RMP table. */ 736 - pvalidate_pages(vaddr, npages, true); 719 + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); 737 720 } 738 721 739 722 void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, 740 - unsigned int npages) 723 + unsigned long npages) 741 724 { 742 725 /* 743 726 * This can be invoked in early boot while running identity mapped, so ··· 745 734 if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) 746 735 return; 747 736 748 - /* Invalidate the memory pages before they are marked shared in the RMP table. */ 749 - pvalidate_pages(vaddr, npages, false); 750 - 751 737 /* Ask hypervisor to mark the memory pages shared in the RMP table. 
*/ 752 - early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); 738 + early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); 753 739 } 754 740 755 741 void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) ··· 764 756 WARN(1, "invalid memory op %d\n", op); 765 757 } 766 758 767 - static int vmgexit_psc(struct snp_psc_desc *desc) 759 + static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, 760 + unsigned long vaddr_end, int op) 768 761 { 769 - int cur_entry, end_entry, ret = 0; 770 - struct snp_psc_desc *data; 771 762 struct ghcb_state state; 772 - struct es_em_ctxt ctxt; 773 - unsigned long flags; 774 - struct ghcb *ghcb; 775 - 776 - /* 777 - * __sev_get_ghcb() needs to run with IRQs disabled because it is using 778 - * a per-CPU GHCB. 779 - */ 780 - local_irq_save(flags); 781 - 782 - ghcb = __sev_get_ghcb(&state); 783 - if (!ghcb) { 784 - ret = 1; 785 - goto out_unlock; 786 - } 787 - 788 - /* Copy the input desc into GHCB shared buffer */ 789 - data = (struct snp_psc_desc *)ghcb->shared_buffer; 790 - memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); 791 - 792 - /* 793 - * As per the GHCB specification, the hypervisor can resume the guest 794 - * before processing all the entries. Check whether all the entries 795 - * are processed. If not, then keep retrying. Note, the hypervisor 796 - * will update the data memory directly to indicate the status, so 797 - * reference the data->hdr everywhere. 798 - * 799 - * The strategy here is to wait for the hypervisor to change the page 800 - * state in the RMP table before guest accesses the memory pages. If the 801 - * page state change was not successful, then later memory access will 802 - * result in a crash. 
803 - */ 804 - cur_entry = data->hdr.cur_entry; 805 - end_entry = data->hdr.end_entry; 806 - 807 - while (data->hdr.cur_entry <= data->hdr.end_entry) { 808 - ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); 809 - 810 - /* This will advance the shared buffer data points to. */ 811 - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); 812 - 813 - /* 814 - * Page State Change VMGEXIT can pass error code through 815 - * exit_info_2. 816 - */ 817 - if (WARN(ret || ghcb->save.sw_exit_info_2, 818 - "SNP: PSC failed ret=%d exit_info_2=%llx\n", 819 - ret, ghcb->save.sw_exit_info_2)) { 820 - ret = 1; 821 - goto out; 822 - } 823 - 824 - /* Verify that reserved bit is not set */ 825 - if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { 826 - ret = 1; 827 - goto out; 828 - } 829 - 830 - /* 831 - * Sanity check that entry processing is not going backwards. 832 - * This will happen only if hypervisor is tricking us. 833 - */ 834 - if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, 835 - "SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", 836 - end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { 837 - ret = 1; 838 - goto out; 839 - } 840 - } 841 - 842 - out: 843 - __sev_put_ghcb(&state); 844 - 845 - out_unlock: 846 - local_irq_restore(flags); 847 - 848 - return ret; 849 - } 850 - 851 - static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, 852 - unsigned long vaddr_end, int op) 853 - { 763 + bool use_large_entry; 854 764 struct psc_hdr *hdr; 855 765 struct psc_entry *e; 766 + unsigned long flags; 856 767 unsigned long pfn; 768 + struct ghcb *ghcb; 857 769 int i; 858 770 859 771 hdr = &data->hdr; ··· 782 854 memset(data, 0, sizeof(*data)); 783 855 i = 0; 784 856 785 - while (vaddr < vaddr_end) { 786 - if (is_vmalloc_addr((void *)vaddr)) 857 + while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) { 858 + hdr->end_entry = i; 859 + 860 + if 
(is_vmalloc_addr((void *)vaddr)) { 787 861 pfn = vmalloc_to_pfn((void *)vaddr); 788 - else 862 + use_large_entry = false; 863 + } else { 789 864 pfn = __pa(vaddr) >> PAGE_SHIFT; 865 + use_large_entry = true; 866 + } 790 867 791 868 e->gfn = pfn; 792 869 e->operation = op; 793 - hdr->end_entry = i; 794 870 795 - /* 796 - * Current SNP implementation doesn't keep track of the RMP page 797 - * size so use 4K for simplicity. 798 - */ 799 - e->pagesize = RMP_PG_SIZE_4K; 871 + if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) && 872 + (vaddr_end - vaddr) >= PMD_SIZE) { 873 + e->pagesize = RMP_PG_SIZE_2M; 874 + vaddr += PMD_SIZE; 875 + } else { 876 + e->pagesize = RMP_PG_SIZE_4K; 877 + vaddr += PAGE_SIZE; 878 + } 800 879 801 - vaddr = vaddr + PAGE_SIZE; 802 880 e++; 803 881 i++; 804 882 } 805 883 806 - if (vmgexit_psc(data)) 884 + /* Page validation must be rescinded before changing to shared */ 885 + if (op == SNP_PAGE_STATE_SHARED) 886 + pvalidate_pages(data); 887 + 888 + local_irq_save(flags); 889 + 890 + if (sev_cfg.ghcbs_initialized) 891 + ghcb = __sev_get_ghcb(&state); 892 + else 893 + ghcb = boot_ghcb; 894 + 895 + /* Invoke the hypervisor to perform the page state changes */ 896 + if (!ghcb || vmgexit_psc(ghcb, data)) 807 897 sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); 898 + 899 + if (sev_cfg.ghcbs_initialized) 900 + __sev_put_ghcb(&state); 901 + 902 + local_irq_restore(flags); 903 + 904 + /* Page validation must be performed after changing to private */ 905 + if (op == SNP_PAGE_STATE_PRIVATE) 906 + pvalidate_pages(data); 907 + 908 + return vaddr; 808 909 } 809 910 810 - static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) 911 + static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) 811 912 { 812 - unsigned long vaddr_end, next_vaddr; 813 - struct snp_psc_desc *desc; 913 + struct snp_psc_desc desc; 914 + unsigned long vaddr_end; 814 915 815 - desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); 816 - if 
(!desc) 817 - panic("SNP: failed to allocate memory for PSC descriptor\n"); 916 + /* Use the MSR protocol when a GHCB is not available. */ 917 + if (!boot_ghcb) 918 + return early_set_pages_state(vaddr, __pa(vaddr), npages, op); 818 919 819 920 vaddr = vaddr & PAGE_MASK; 820 921 vaddr_end = vaddr + (npages << PAGE_SHIFT); 821 922 822 - while (vaddr < vaddr_end) { 823 - /* Calculate the last vaddr that fits in one struct snp_psc_desc. */ 824 - next_vaddr = min_t(unsigned long, vaddr_end, 825 - (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); 826 - 827 - __set_pages_state(desc, vaddr, next_vaddr, op); 828 - 829 - vaddr = next_vaddr; 830 - } 831 - 832 - kfree(desc); 923 + while (vaddr < vaddr_end) 924 + vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op); 833 925 } 834 926 835 - void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) 927 + void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) 836 928 { 837 929 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 838 930 return; 839 931 840 - pvalidate_pages(vaddr, npages, false); 841 - 842 932 set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); 843 933 } 844 934 845 - void snp_set_memory_private(unsigned long vaddr, unsigned int npages) 935 + void snp_set_memory_private(unsigned long vaddr, unsigned long npages) 846 936 { 847 937 if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 848 938 return; 849 939 850 940 set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); 941 + } 851 942 852 - pvalidate_pages(vaddr, npages, true); 943 + void snp_accept_memory(phys_addr_t start, phys_addr_t end) 944 + { 945 + unsigned long vaddr; 946 + unsigned int npages; 947 + 948 + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 949 + return; 950 + 951 + vaddr = (unsigned long)__va(start); 952 + npages = (end - start) >> PAGE_SHIFT; 953 + 954 + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); 853 955 } 854 956 855 957 static int snp_set_vmsa(void *va, bool vmsa) ··· 1224 1266 if (initial_vc_handler == (unsigned 
long)kernel_exc_vmm_communication) { 1225 1267 if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) 1226 1268 snp_register_per_cpu_ghcb(); 1269 + 1270 + sev_cfg.ghcbs_initialized = true; 1227 1271 1228 1272 return; 1229 1273 }
+3
arch/x86/platform/efi/efi.c
··· 96 96 #ifdef CONFIG_EFI_COCO_SECRET 97 97 &efi.coco_secret, 98 98 #endif 99 + #ifdef CONFIG_UNACCEPTED_MEMORY 100 + &efi.unaccepted, 101 + #endif 99 102 }; 100 103 101 104 u64 efi_setup; /* efi setup_data physical address */
+7
drivers/base/node.c
··· 449 449 "Node %d FileHugePages: %8lu kB\n" 450 450 "Node %d FilePmdMapped: %8lu kB\n" 451 451 #endif 452 + #ifdef CONFIG_UNACCEPTED_MEMORY 453 + "Node %d Unaccepted: %8lu kB\n" 454 + #endif 452 455 , 453 456 nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), 454 457 nid, K(node_page_state(pgdat, NR_WRITEBACK)), ··· 480 477 nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), 481 478 nid, K(node_page_state(pgdat, NR_FILE_THPS)), 482 479 nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED)) 480 + #endif 481 + #ifdef CONFIG_UNACCEPTED_MEMORY 482 + , 483 + nid, K(sum_zone_node_page_state(nid, NR_UNACCEPTED)) 483 484 #endif 484 485 ); 485 486 len += hugetlb_report_node_meminfo(buf, len, nid);
+14
drivers/firmware/efi/Kconfig
··· 269 269 virt/coco/efi_secret module to access the secrets, which in turn 270 270 allows userspace programs to access the injected secrets. 271 271 272 + config UNACCEPTED_MEMORY 273 + bool 274 + depends on EFI_STUB 275 + help 276 + Some Virtual Machine platforms, such as Intel TDX, require 277 + some memory to be "accepted" by the guest before it can be used. 278 + This mechanism helps prevent malicious hosts from making changes 279 + to guest memory. 280 + 281 + UEFI specification v2.9 introduced EFI_UNACCEPTED_MEMORY memory type. 282 + 283 + This option adds support for unaccepted memory and makes such memory 284 + usable by the kernel. 285 + 272 286 config EFI_EMBEDDED_FIRMWARE 273 287 bool 274 288 select CRYPTO_LIB_SHA256
+1
drivers/firmware/efi/Makefile
··· 41 41 obj-$(CONFIG_EFI_EARLYCON) += earlycon.o 42 42 obj-$(CONFIG_UEFI_CPER_ARM) += cper-arm.o 43 43 obj-$(CONFIG_UEFI_CPER_X86) += cper-x86.o 44 + obj-$(CONFIG_UNACCEPTED_MEMORY) += unaccepted_memory.o
+26
drivers/firmware/efi/efi.c
··· 50 50 #ifdef CONFIG_EFI_COCO_SECRET 51 51 .coco_secret = EFI_INVALID_TABLE_ADDR, 52 52 #endif 53 + #ifdef CONFIG_UNACCEPTED_MEMORY 54 + .unaccepted = EFI_INVALID_TABLE_ADDR, 55 + #endif 53 56 }; 54 57 EXPORT_SYMBOL(efi); 55 58 ··· 587 584 #ifdef CONFIG_EFI_COCO_SECRET 588 585 {LINUX_EFI_COCO_SECRET_AREA_GUID, &efi.coco_secret, "CocoSecret" }, 589 586 #endif 587 + #ifdef CONFIG_UNACCEPTED_MEMORY 588 + {LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID, &efi.unaccepted, "Unaccepted" }, 589 + #endif 590 590 #ifdef CONFIG_EFI_GENERIC_STUB 591 591 {LINUX_EFI_SCREEN_INFO_TABLE_GUID, &screen_info_table }, 592 592 #endif ··· 744 738 } 745 739 } 746 740 741 + if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && 742 + efi.unaccepted != EFI_INVALID_TABLE_ADDR) { 743 + struct efi_unaccepted_memory *unaccepted; 744 + 745 + unaccepted = early_memremap(efi.unaccepted, sizeof(*unaccepted)); 746 + if (unaccepted) { 747 + unsigned long size; 748 + 749 + if (unaccepted->version == 1) { 750 + size = sizeof(*unaccepted) + unaccepted->size; 751 + memblock_reserve(efi.unaccepted, size); 752 + } else { 753 + efi.unaccepted = EFI_INVALID_TABLE_ADDR; 754 + } 755 + 756 + early_memunmap(unaccepted, sizeof(*unaccepted)); 757 + } 758 + } 759 + 747 760 return 0; 748 761 } 749 762 ··· 847 822 "MMIO Port", 848 823 "PAL Code", 849 824 "Persistent", 825 + "Unaccepted", 850 826 }; 851 827 852 828 char * __init efi_md_typeattr_format(char *buf, size_t size,
+2
drivers/firmware/efi/libstub/Makefile
··· 96 96 zboot-obj-$(CONFIG_RISCV) := lib-clz_ctz.o lib-ashldi3.o 97 97 lib-$(CONFIG_EFI_ZBOOT) += zboot.o $(zboot-obj-y) 98 98 99 + lib-$(CONFIG_UNACCEPTED_MEMORY) += unaccepted_memory.o bitmap.o find.o 100 + 99 101 extra-y := $(lib-y) 100 102 lib-y := $(patsubst %.o,%.stub.o,$(lib-y)) 101 103
+41
drivers/firmware/efi/libstub/bitmap.c
··· 1 + #include <linux/bitmap.h> 2 + 3 + void __bitmap_set(unsigned long *map, unsigned int start, int len) 4 + { 5 + unsigned long *p = map + BIT_WORD(start); 6 + const unsigned int size = start + len; 7 + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); 8 + unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); 9 + 10 + while (len - bits_to_set >= 0) { 11 + *p |= mask_to_set; 12 + len -= bits_to_set; 13 + bits_to_set = BITS_PER_LONG; 14 + mask_to_set = ~0UL; 15 + p++; 16 + } 17 + if (len) { 18 + mask_to_set &= BITMAP_LAST_WORD_MASK(size); 19 + *p |= mask_to_set; 20 + } 21 + } 22 + 23 + void __bitmap_clear(unsigned long *map, unsigned int start, int len) 24 + { 25 + unsigned long *p = map + BIT_WORD(start); 26 + const unsigned int size = start + len; 27 + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); 28 + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); 29 + 30 + while (len - bits_to_clear >= 0) { 31 + *p &= ~mask_to_clear; 32 + len -= bits_to_clear; 33 + bits_to_clear = BITS_PER_LONG; 34 + mask_to_clear = ~0UL; 35 + p++; 36 + } 37 + if (len) { 38 + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); 39 + *p &= ~mask_to_clear; 40 + } 41 + }
+6
drivers/firmware/efi/libstub/efistub.h
··· 1136 1136 asmlinkage efi_status_t __efiapi 1137 1137 efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab); 1138 1138 1139 + efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, 1140 + struct efi_boot_memmap *map); 1141 + void process_unaccepted_memory(u64 start, u64 end); 1142 + void accept_memory(phys_addr_t start, phys_addr_t end); 1143 + void arch_accept_memory(phys_addr_t start, phys_addr_t end); 1144 + 1139 1145 #endif
+43
drivers/firmware/efi/libstub/find.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include <linux/bitmap.h> 3 + #include <linux/math.h> 4 + #include <linux/minmax.h> 5 + 6 + /* 7 + * Common helper for find_next_bit() function family 8 + * @FETCH: The expression that fetches and pre-processes each word of bitmap(s) 9 + * @MUNGE: The expression that post-processes a word containing found bit (may be empty) 10 + * @size: The bitmap size in bits 11 + * @start: The bitnumber to start searching at 12 + */ 13 + #define FIND_NEXT_BIT(FETCH, MUNGE, size, start) \ 14 + ({ \ 15 + unsigned long mask, idx, tmp, sz = (size), __start = (start); \ 16 + \ 17 + if (unlikely(__start >= sz)) \ 18 + goto out; \ 19 + \ 20 + mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start)); \ 21 + idx = __start / BITS_PER_LONG; \ 22 + \ 23 + for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) { \ 24 + if ((idx + 1) * BITS_PER_LONG >= sz) \ 25 + goto out; \ 26 + idx++; \ 27 + } \ 28 + \ 29 + sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz); \ 30 + out: \ 31 + sz; \ 32 + }) 33 + 34 + unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) 35 + { 36 + return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start); 37 + } 38 + 39 + unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, 40 + unsigned long start) 41 + { 42 + return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); 43 + }
+222
drivers/firmware/efi/libstub/unaccepted_memory.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + 3 + #include <linux/efi.h> 4 + #include <asm/efi.h> 5 + #include "efistub.h" 6 + 7 + struct efi_unaccepted_memory *unaccepted_table; 8 + 9 + efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, 10 + struct efi_boot_memmap *map) 11 + { 12 + efi_guid_t unaccepted_table_guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID; 13 + u64 unaccepted_start = ULLONG_MAX, unaccepted_end = 0, bitmap_size; 14 + efi_status_t status; 15 + int i; 16 + 17 + /* Check if the table is already installed */ 18 + unaccepted_table = get_efi_config_table(unaccepted_table_guid); 19 + if (unaccepted_table) { 20 + if (unaccepted_table->version != 1) { 21 + efi_err("Unknown version of unaccepted memory table\n"); 22 + return EFI_UNSUPPORTED; 23 + } 24 + return EFI_SUCCESS; 25 + } 26 + 27 + /* Check if there's any unaccepted memory and find the max address */ 28 + for (i = 0; i < nr_desc; i++) { 29 + efi_memory_desc_t *d; 30 + unsigned long m = (unsigned long)map->map; 31 + 32 + d = efi_early_memdesc_ptr(m, map->desc_size, i); 33 + if (d->type != EFI_UNACCEPTED_MEMORY) 34 + continue; 35 + 36 + unaccepted_start = min(unaccepted_start, d->phys_addr); 37 + unaccepted_end = max(unaccepted_end, 38 + d->phys_addr + d->num_pages * PAGE_SIZE); 39 + } 40 + 41 + if (unaccepted_start == ULLONG_MAX) 42 + return EFI_SUCCESS; 43 + 44 + unaccepted_start = round_down(unaccepted_start, 45 + EFI_UNACCEPTED_UNIT_SIZE); 46 + unaccepted_end = round_up(unaccepted_end, EFI_UNACCEPTED_UNIT_SIZE); 47 + 48 + /* 49 + * If unaccepted memory is present, allocate a bitmap to track what 50 + * memory has to be accepted before access. 51 + * 52 + * One bit in the bitmap represents 2MiB in the address space: 53 + * A 4k bitmap can track 64GiB of physical address space. 54 + * 55 + * In the worst case scenario -- a huge hole in the middle of the 56 + * address space -- It needs 256MiB to handle 4PiB of the address 57 + * space. 
58 + * 59 + * The bitmap will be populated in setup_e820() according to the memory 60 + * map after efi_exit_boot_services(). 61 + */ 62 + bitmap_size = DIV_ROUND_UP(unaccepted_end - unaccepted_start, 63 + EFI_UNACCEPTED_UNIT_SIZE * BITS_PER_BYTE); 64 + 65 + status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, 66 + sizeof(*unaccepted_table) + bitmap_size, 67 + (void **)&unaccepted_table); 68 + if (status != EFI_SUCCESS) { 69 + efi_err("Failed to allocate unaccepted memory config table\n"); 70 + return status; 71 + } 72 + 73 + unaccepted_table->version = 1; 74 + unaccepted_table->unit_size = EFI_UNACCEPTED_UNIT_SIZE; 75 + unaccepted_table->phys_base = unaccepted_start; 76 + unaccepted_table->size = bitmap_size; 77 + memset(unaccepted_table->bitmap, 0, bitmap_size); 78 + 79 + status = efi_bs_call(install_configuration_table, 80 + &unaccepted_table_guid, unaccepted_table); 81 + if (status != EFI_SUCCESS) { 82 + efi_bs_call(free_pool, unaccepted_table); 83 + efi_err("Failed to install unaccepted memory config table!\n"); 84 + } 85 + 86 + return status; 87 + } 88 + 89 + /* 90 + * The accepted memory bitmap only works at unit_size granularity. Take 91 + * unaligned start/end addresses and either: 92 + * 1. Accepts the memory immediately and in its entirety 93 + * 2. Accepts unaligned parts, and marks *some* aligned part unaccepted 94 + * 95 + * The function will never reach the bitmap_set() with zero bits to set. 96 + */ 97 + void process_unaccepted_memory(u64 start, u64 end) 98 + { 99 + u64 unit_size = unaccepted_table->unit_size; 100 + u64 unit_mask = unaccepted_table->unit_size - 1; 101 + u64 bitmap_size = unaccepted_table->size; 102 + 103 + /* 104 + * Ensure that at least one bit will be set in the bitmap by 105 + * immediately accepting all regions under 2*unit_size. This is 106 + * imprecise and may immediately accept some areas that could 107 + * have been represented in the bitmap. 
But, results in simpler 108 + * code below 109 + * 110 + * Consider case like this (assuming unit_size == 2MB): 111 + * 112 + * | 4k | 2044k | 2048k | 113 + * ^ 0x0 ^ 2MB ^ 4MB 114 + * 115 + * Only the first 4k has been accepted. The 0MB->2MB region can not be 116 + * represented in the bitmap. The 2MB->4MB region can be represented in 117 + * the bitmap. But, the 0MB->4MB region is <2*unit_size and will be 118 + * immediately accepted in its entirety. 119 + */ 120 + if (end - start < 2 * unit_size) { 121 + arch_accept_memory(start, end); 122 + return; 123 + } 124 + 125 + /* 126 + * No matter how the start and end are aligned, at least one unaccepted 127 + * unit_size area will remain to be marked in the bitmap. 128 + */ 129 + 130 + /* Immediately accept a <unit_size piece at the start: */ 131 + if (start & unit_mask) { 132 + arch_accept_memory(start, round_up(start, unit_size)); 133 + start = round_up(start, unit_size); 134 + } 135 + 136 + /* Immediately accept a <unit_size piece at the end: */ 137 + if (end & unit_mask) { 138 + arch_accept_memory(round_down(end, unit_size), end); 139 + end = round_down(end, unit_size); 140 + } 141 + 142 + /* 143 + * Accept part of the range that before phys_base and cannot be recorded 144 + * into the bitmap. 
145 + */ 146 + if (start < unaccepted_table->phys_base) { 147 + arch_accept_memory(start, 148 + min(unaccepted_table->phys_base, end)); 149 + start = unaccepted_table->phys_base; 150 + } 151 + 152 + /* Nothing to record */ 153 + if (end < unaccepted_table->phys_base) 154 + return; 155 + 156 + /* Translate to offsets from the beginning of the bitmap */ 157 + start -= unaccepted_table->phys_base; 158 + end -= unaccepted_table->phys_base; 159 + 160 + /* Accept memory that doesn't fit into bitmap */ 161 + if (end > bitmap_size * unit_size * BITS_PER_BYTE) { 162 + unsigned long phys_start, phys_end; 163 + 164 + phys_start = bitmap_size * unit_size * BITS_PER_BYTE + 165 + unaccepted_table->phys_base; 166 + phys_end = end + unaccepted_table->phys_base; 167 + 168 + arch_accept_memory(phys_start, phys_end); 169 + end = bitmap_size * unit_size * BITS_PER_BYTE; 170 + } 171 + 172 + /* 173 + * 'start' and 'end' are now both unit_size-aligned. 174 + * Record the range as being unaccepted: 175 + */ 176 + bitmap_set(unaccepted_table->bitmap, 177 + start / unit_size, (end - start) / unit_size); 178 + } 179 + 180 + void accept_memory(phys_addr_t start, phys_addr_t end) 181 + { 182 + unsigned long range_start, range_end; 183 + unsigned long bitmap_size; 184 + u64 unit_size; 185 + 186 + if (!unaccepted_table) 187 + return; 188 + 189 + unit_size = unaccepted_table->unit_size; 190 + 191 + /* 192 + * Only care for the part of the range that is represented 193 + * in the bitmap. 
194 + */ 195 + if (start < unaccepted_table->phys_base) 196 + start = unaccepted_table->phys_base; 197 + if (end < unaccepted_table->phys_base) 198 + return; 199 + 200 + /* Translate to offsets from the beginning of the bitmap */ 201 + start -= unaccepted_table->phys_base; 202 + end -= unaccepted_table->phys_base; 203 + 204 + /* Make sure not to overrun the bitmap */ 205 + if (end > unaccepted_table->size * unit_size * BITS_PER_BYTE) 206 + end = unaccepted_table->size * unit_size * BITS_PER_BYTE; 207 + 208 + range_start = start / unit_size; 209 + bitmap_size = DIV_ROUND_UP(end, unit_size); 210 + 211 + for_each_set_bitrange_from(range_start, range_end, 212 + unaccepted_table->bitmap, bitmap_size) { 213 + unsigned long phys_start, phys_end; 214 + 215 + phys_start = range_start * unit_size + unaccepted_table->phys_base; 216 + phys_end = range_end * unit_size + unaccepted_table->phys_base; 217 + 218 + arch_accept_memory(phys_start, phys_end); 219 + bitmap_clear(unaccepted_table->bitmap, 220 + range_start, range_end - range_start); 221 + } 222 + }
+60 -15
drivers/firmware/efi/libstub/x86-stub.c
··· 26 26 u32 image_offset __section(".data"); 27 27 static efi_loaded_image_t *image = NULL; 28 28 29 + typedef union sev_memory_acceptance_protocol sev_memory_acceptance_protocol_t; 30 + union sev_memory_acceptance_protocol { 31 + struct { 32 + efi_status_t (__efiapi * allow_unaccepted_memory)( 33 + sev_memory_acceptance_protocol_t *); 34 + }; 35 + struct { 36 + u32 allow_unaccepted_memory; 37 + } mixed_mode; 38 + }; 39 + 29 40 static efi_status_t 30 41 preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) 31 42 { ··· 319 308 adjust_memory_range_protection(LOAD_PHYSICAL_ADDR, 320 309 KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR); 321 310 #endif 311 + } 312 + 313 + static void setup_unaccepted_memory(void) 314 + { 315 + efi_guid_t mem_acceptance_proto = OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID; 316 + sev_memory_acceptance_protocol_t *proto; 317 + efi_status_t status; 318 + 319 + if (!IS_ENABLED(CONFIG_UNACCEPTED_MEMORY)) 320 + return; 321 + 322 + /* 323 + * Enable unaccepted memory before calling exit boot services in order 324 + * for the UEFI to not accept all memory on EBS. 
325 + */ 326 + status = efi_bs_call(locate_protocol, &mem_acceptance_proto, NULL, 327 + (void **)&proto); 328 + if (status != EFI_SUCCESS) 329 + return; 330 + 331 + status = efi_call_proto(proto, allow_unaccepted_memory); 332 + if (status != EFI_SUCCESS) 333 + efi_err("Memory acceptance protocol failed\n"); 322 334 } 323 335 324 336 static const efi_char16_t apple[] = L"Apple"; ··· 647 613 e820_type = E820_TYPE_PMEM; 648 614 break; 649 615 616 + case EFI_UNACCEPTED_MEMORY: 617 + if (!IS_ENABLED(CONFIG_UNACCEPTED_MEMORY)) { 618 + efi_warn_once( 619 + "The system has unaccepted memory, but kernel does not support it\nConsider enabling CONFIG_UNACCEPTED_MEMORY\n"); 620 + continue; 621 + } 622 + e820_type = E820_TYPE_RAM; 623 + process_unaccepted_memory(d->phys_addr, 624 + d->phys_addr + PAGE_SIZE * d->num_pages); 625 + break; 650 626 default: 651 627 continue; 652 628 } ··· 725 681 struct setup_data **e820ext, 726 682 u32 *e820ext_size) 727 683 { 728 - unsigned long map_size, desc_size, map_key; 684 + struct efi_boot_memmap *map; 729 685 efi_status_t status; 730 - __u32 nr_desc, desc_version; 686 + __u32 nr_desc; 731 687 732 - /* Only need the size of the mem map and size of each mem descriptor */ 733 - map_size = 0; 734 - status = efi_bs_call(get_memory_map, &map_size, NULL, &map_key, 735 - &desc_size, &desc_version); 736 - if (status != EFI_BUFFER_TOO_SMALL) 737 - return (status != EFI_SUCCESS) ? 
status : EFI_UNSUPPORTED; 688 + status = efi_get_memory_map(&map, false); 689 + if (status != EFI_SUCCESS) 690 + return status; 738 691 739 - nr_desc = map_size / desc_size + EFI_MMAP_NR_SLACK_SLOTS; 740 - 741 - if (nr_desc > ARRAY_SIZE(params->e820_table)) { 742 - u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table); 692 + nr_desc = map->map_size / map->desc_size; 693 + if (nr_desc > ARRAY_SIZE(params->e820_table) - EFI_MMAP_NR_SLACK_SLOTS) { 694 + u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table) + 695 + EFI_MMAP_NR_SLACK_SLOTS; 743 696 744 697 status = alloc_e820ext(nr_e820ext, e820ext, e820ext_size); 745 - if (status != EFI_SUCCESS) 746 - return status; 747 698 } 748 699 749 - return EFI_SUCCESS; 700 + if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && status == EFI_SUCCESS) 701 + status = allocate_unaccepted_bitmap(nr_desc, map); 702 + 703 + efi_bs_call(free_pool, map); 704 + return status; 750 705 } 751 706 752 707 struct exit_boot_struct { ··· 941 898 setup_efi_pci(boot_params); 942 899 943 900 setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start); 901 + 902 + setup_unaccepted_memory(); 944 903 945 904 status = exit_boot(boot_params, handle); 946 905 if (status != EFI_SUCCESS) {
+147
drivers/firmware/efi/unaccepted_memory.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + 3 + #include <linux/efi.h> 4 + #include <linux/memblock.h> 5 + #include <linux/spinlock.h> 6 + #include <asm/unaccepted_memory.h> 7 + 8 + /* Protects unaccepted memory bitmap */ 9 + static DEFINE_SPINLOCK(unaccepted_memory_lock); 10 + 11 + /* 12 + * accept_memory() -- Consult bitmap and accept the memory if needed. 13 + * 14 + * Only memory that is explicitly marked as unaccepted in the bitmap requires 15 + * an action. All the remaining memory is implicitly accepted and doesn't need 16 + * acceptance. 17 + * 18 + * No need to accept: 19 + * - anything if the system has no unaccepted table; 20 + * - memory that is below phys_base; 21 + * - memory that is above the memory that addressable by the bitmap; 22 + */ 23 + void accept_memory(phys_addr_t start, phys_addr_t end) 24 + { 25 + struct efi_unaccepted_memory *unaccepted; 26 + unsigned long range_start, range_end; 27 + unsigned long flags; 28 + u64 unit_size; 29 + 30 + unaccepted = efi_get_unaccepted_table(); 31 + if (!unaccepted) 32 + return; 33 + 34 + unit_size = unaccepted->unit_size; 35 + 36 + /* 37 + * Only care for the part of the range that is represented 38 + * in the bitmap. 39 + */ 40 + if (start < unaccepted->phys_base) 41 + start = unaccepted->phys_base; 42 + if (end < unaccepted->phys_base) 43 + return; 44 + 45 + /* Translate to offsets from the beginning of the bitmap */ 46 + start -= unaccepted->phys_base; 47 + end -= unaccepted->phys_base; 48 + 49 + /* 50 + * load_unaligned_zeropad() can lead to unwanted loads across page 51 + * boundaries. The unwanted loads are typically harmless. But, they 52 + * might be made to totally unrelated or even unmapped memory. 53 + * load_unaligned_zeropad() relies on exception fixup (#PF, #GP and now 54 + * #VE) to recover from these unwanted loads. 55 + * 56 + * But, this approach does not work for unaccepted memory. 
For TDX, a 57 + * load from unaccepted memory will not lead to a recoverable exception 58 + * within the guest. The guest will exit to the VMM where the only 59 + * recourse is to terminate the guest. 60 + * 61 + * There are two parts to fix this issue and comprehensively avoid 62 + * access to unaccepted memory. Together these ensure that an extra 63 + * "guard" page is accepted in addition to the memory that needs to be 64 + * used: 65 + * 66 + * 1. Implicitly extend the range_contains_unaccepted_memory(start, end) 67 + * checks up to end+unit_size if 'end' is aligned on a unit_size 68 + * boundary. 69 + * 70 + * 2. Implicitly extend accept_memory(start, end) to end+unit_size if 71 + * 'end' is aligned on a unit_size boundary. (immediately following 72 + * this comment) 73 + */ 74 + if (!(end % unit_size)) 75 + end += unit_size; 76 + 77 + /* Make sure not to overrun the bitmap */ 78 + if (end > unaccepted->size * unit_size * BITS_PER_BYTE) 79 + end = unaccepted->size * unit_size * BITS_PER_BYTE; 80 + 81 + range_start = start / unit_size; 82 + 83 + spin_lock_irqsave(&unaccepted_memory_lock, flags); 84 + for_each_set_bitrange_from(range_start, range_end, unaccepted->bitmap, 85 + DIV_ROUND_UP(end, unit_size)) { 86 + unsigned long phys_start, phys_end; 87 + unsigned long len = range_end - range_start; 88 + 89 + phys_start = range_start * unit_size + unaccepted->phys_base; 90 + phys_end = range_end * unit_size + unaccepted->phys_base; 91 + 92 + arch_accept_memory(phys_start, phys_end); 93 + bitmap_clear(unaccepted->bitmap, range_start, len); 94 + } 95 + spin_unlock_irqrestore(&unaccepted_memory_lock, flags); 96 + } 97 + 98 + bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end) 99 + { 100 + struct efi_unaccepted_memory *unaccepted; 101 + unsigned long flags; 102 + bool ret = false; 103 + u64 unit_size; 104 + 105 + unaccepted = efi_get_unaccepted_table(); 106 + if (!unaccepted) 107 + return false; 108 + 109 + unit_size = unaccepted->unit_size; 110 + 
111 + /* 112 + * Only care for the part of the range that is represented 113 + * in the bitmap. 114 + */ 115 + if (start < unaccepted->phys_base) 116 + start = unaccepted->phys_base; 117 + if (end < unaccepted->phys_base) 118 + return false; 119 + 120 + /* Translate to offsets from the beginning of the bitmap */ 121 + start -= unaccepted->phys_base; 122 + end -= unaccepted->phys_base; 123 + 124 + /* 125 + * Also consider the unaccepted state of the *next* page. See fix #1 in 126 + * the comment on load_unaligned_zeropad() in accept_memory(). 127 + */ 128 + if (!(end % unit_size)) 129 + end += unit_size; 130 + 131 + /* Make sure not to overrun the bitmap */ 132 + if (end > unaccepted->size * unit_size * BITS_PER_BYTE) 133 + end = unaccepted->size * unit_size * BITS_PER_BYTE; 134 + 135 + spin_lock_irqsave(&unaccepted_memory_lock, flags); 136 + while (start < end) { 137 + if (test_bit(start / unit_size, unaccepted->bitmap)) { 138 + ret = true; 139 + break; 140 + } 141 + 142 + start += unit_size; 143 + } 144 + spin_unlock_irqrestore(&unaccepted_memory_lock, flags); 145 + 146 + return ret; 147 + }
+1
drivers/virt/coco/sev-guest/Kconfig
··· 2 2 tristate "AMD SEV Guest driver" 3 3 default m 4 4 depends on AMD_MEM_ENCRYPT 5 + select CRYPTO 5 6 select CRYPTO_AEAD2 6 7 select CRYPTO_GCM 7 8 help
+5
fs/proc/meminfo.c
··· 168 168 global_zone_page_state(NR_FREE_CMA_PAGES)); 169 169 #endif 170 170 171 + #ifdef CONFIG_UNACCEPTED_MEMORY 172 + show_val_kb(m, "Unaccepted: ", 173 + global_zone_page_state(NR_UNACCEPTED)); 174 + #endif 175 + 171 176 hugetlb_report_meminfo(m); 172 177 173 178 arch_report_meminfo(m);
+15 -1
include/linux/efi.h
··· 108 108 #define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12 109 109 #define EFI_PAL_CODE 13 110 110 #define EFI_PERSISTENT_MEMORY 14 111 - #define EFI_MAX_MEMORY_TYPE 15 111 + #define EFI_UNACCEPTED_MEMORY 15 112 + #define EFI_MAX_MEMORY_TYPE 16 112 113 113 114 /* Attribute values: */ 114 115 #define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ ··· 418 417 #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID EFI_GUID(0xc451ed2b, 0x9694, 0x45d3, 0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) 419 418 #define LINUX_EFI_COCO_SECRET_AREA_GUID EFI_GUID(0xadf956ad, 0xe98c, 0x484c, 0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47) 420 419 #define LINUX_EFI_BOOT_MEMMAP_GUID EFI_GUID(0x800f683f, 0xd08b, 0x423a, 0xa2, 0x93, 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) 420 + #define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31) 421 421 422 422 #define RISCV_EFI_BOOT_PROTOCOL_GUID EFI_GUID(0xccd15fec, 0x6f73, 0x4eec, 0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf) 423 423 ··· 436 434 /* OEM GUIDs */ 437 435 #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) 438 436 #define AMD_SEV_MEM_ENCRYPT_GUID EFI_GUID(0x0cf29b71, 0x9e51, 0x433a, 0xa3, 0xb7, 0x81, 0xf3, 0xab, 0x16, 0xb8, 0x75) 437 + 438 + /* OVMF protocol GUIDs */ 439 + #define OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID EFI_GUID(0xc5a010fe, 0x38a7, 0x4531, 0x8a, 0x4a, 0x05, 0x00, 0xd2, 0xfd, 0x16, 0x49) 439 440 440 441 typedef struct { 441 442 efi_guid_t guid; ··· 537 532 unsigned long map_key; 538 533 unsigned long buff_size; 539 534 efi_memory_desc_t map[]; 535 + }; 536 + 537 + struct efi_unaccepted_memory { 538 + u32 version; 539 + u32 unit_size; 540 + u64 phys_base; 541 + u64 size; 542 + unsigned long bitmap[]; 540 543 }; 541 544 542 545 /* ··· 649 636 unsigned long tpm_final_log; /* TPM2 Final Events Log table */ 650 637 unsigned long mokvar_table; /* MOK variable config table */ 651 638 
unsigned long coco_secret; /* Confidential computing secret table */ 639 + unsigned long unaccepted; /* Unaccepted memory table */ 652 640 653 641 efi_get_time_t *get_time; 654 642 efi_set_time_t *set_time;
+19
include/linux/mm.h
··· 3839 3839 } 3840 3840 #endif 3841 3841 3842 + #ifdef CONFIG_UNACCEPTED_MEMORY 3843 + 3844 + bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); 3845 + void accept_memory(phys_addr_t start, phys_addr_t end); 3846 + 3847 + #else 3848 + 3849 + static inline bool range_contains_unaccepted_memory(phys_addr_t start, 3850 + phys_addr_t end) 3851 + { 3852 + return false; 3853 + } 3854 + 3855 + static inline void accept_memory(phys_addr_t start, phys_addr_t end) 3856 + { 3857 + } 3858 + 3859 + #endif 3860 + 3842 3861 #endif /* _LINUX_MM_H */
+8
include/linux/mmzone.h
··· 143 143 NR_ZSPAGES, /* allocated in zsmalloc */ 144 144 #endif 145 145 NR_FREE_CMA_PAGES, 146 + #ifdef CONFIG_UNACCEPTED_MEMORY 147 + NR_UNACCEPTED, 148 + #endif 146 149 NR_VM_ZONE_STAT_ITEMS }; 147 150 148 151 enum node_stat_item { ··· 912 909 913 910 /* free areas of different sizes */ 914 911 struct free_area free_area[MAX_ORDER + 1]; 912 + 913 + #ifdef CONFIG_UNACCEPTED_MEMORY 914 + /* Pages to be accepted. All pages on the list are MAX_ORDER */ 915 + struct list_head unaccepted_pages; 916 + #endif 915 917 916 918 /* zone flags, see below */ 917 919 unsigned long flags;
+9
mm/memblock.c
··· 1436 1436 */ 1437 1437 kmemleak_alloc_phys(found, size, 0); 1438 1438 1439 + /* 1440 + * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP, 1441 + * require memory to be accepted before it can be used by the 1442 + * guest. 1443 + * 1444 + * Accept the memory of the allocated buffer. 1445 + */ 1446 + accept_memory(found, found + size); 1447 + 1439 1448 return found; 1440 1449 } 1441 1450
+7
mm/mm_init.c
··· 1375 1375 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 1376 1376 zone->free_area[order].nr_free = 0; 1377 1377 } 1378 + 1379 + #ifdef CONFIG_UNACCEPTED_MEMORY 1380 + INIT_LIST_HEAD(&zone->unaccepted_pages); 1381 + #endif 1378 1382 } 1379 1383 1380 1384 void __meminit init_currently_empty_zone(struct zone *zone, ··· 1963 1959 __free_pages_core(page, MAX_ORDER); 1964 1960 return; 1965 1961 } 1962 + 1963 + /* Accept chunks smaller than MAX_ORDER upfront */ 1964 + accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); 1966 1965 1967 1966 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1968 1967 if (pageblock_aligned(pfn))
+173
mm/page_alloc.c
··· 387 387 EXPORT_SYMBOL(nr_online_nodes); 388 388 #endif 389 389 390 + static bool page_contains_unaccepted(struct page *page, unsigned int order); 391 + static void accept_page(struct page *page, unsigned int order); 392 + static bool try_to_accept_memory(struct zone *zone, unsigned int order); 393 + static inline bool has_unaccepted_memory(void); 394 + static bool __free_unaccepted(struct page *page); 395 + 390 396 int page_group_by_mobility_disabled __read_mostly; 391 397 392 398 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT ··· 1486 1480 set_page_count(p, 0); 1487 1481 1488 1482 atomic_long_add(nr_pages, &page_zone(page)->managed_pages); 1483 + 1484 + if (page_contains_unaccepted(page, order)) { 1485 + if (order == MAX_ORDER && __free_unaccepted(page)) 1486 + return; 1487 + 1488 + accept_page(page, order); 1489 + } 1489 1490 1490 1491 /* 1491 1492 * Bypass PCP and place fresh pages right to the tail, primarily ··· 3172 3159 if (!(alloc_flags & ALLOC_CMA)) 3173 3160 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); 3174 3161 #endif 3162 + #ifdef CONFIG_UNACCEPTED_MEMORY 3163 + unusable_free += zone_page_state(z, NR_UNACCEPTED); 3164 + #endif 3175 3165 3176 3166 return unusable_free; 3177 3167 } ··· 3474 3458 gfp_mask)) { 3475 3459 int ret; 3476 3460 3461 + if (has_unaccepted_memory()) { 3462 + if (try_to_accept_memory(zone, order)) 3463 + goto try_this_zone; 3464 + } 3465 + 3477 3466 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 3478 3467 /* 3479 3468 * Watermark failed for this zone, but see if we can ··· 3531 3510 3532 3511 return page; 3533 3512 } else { 3513 + if (has_unaccepted_memory()) { 3514 + if (try_to_accept_memory(zone, order)) 3515 + goto try_this_zone; 3516 + } 3517 + 3534 3518 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 3535 3519 /* Try again if zone has deferred pages */ 3536 3520 if (deferred_pages_enabled()) { ··· 7241 7215 return false; 7242 7216 } 7243 7217 #endif /* CONFIG_ZONE_DMA */ 7218 + 7219 + #ifdef CONFIG_UNACCEPTED_MEMORY 7220 + 7221 + /* 
Counts number of zones with unaccepted pages. */ 7222 + static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); 7223 + 7224 + static bool lazy_accept = true; 7225 + 7226 + static int __init accept_memory_parse(char *p) 7227 + { 7228 + if (!strcmp(p, "lazy")) { 7229 + lazy_accept = true; 7230 + return 0; 7231 + } else if (!strcmp(p, "eager")) { 7232 + lazy_accept = false; 7233 + return 0; 7234 + } else { 7235 + return -EINVAL; 7236 + } 7237 + } 7238 + early_param("accept_memory", accept_memory_parse); 7239 + 7240 + static bool page_contains_unaccepted(struct page *page, unsigned int order) 7241 + { 7242 + phys_addr_t start = page_to_phys(page); 7243 + phys_addr_t end = start + (PAGE_SIZE << order); 7244 + 7245 + return range_contains_unaccepted_memory(start, end); 7246 + } 7247 + 7248 + static void accept_page(struct page *page, unsigned int order) 7249 + { 7250 + phys_addr_t start = page_to_phys(page); 7251 + 7252 + accept_memory(start, start + (PAGE_SIZE << order)); 7253 + } 7254 + 7255 + static bool try_to_accept_memory_one(struct zone *zone) 7256 + { 7257 + unsigned long flags; 7258 + struct page *page; 7259 + bool last; 7260 + 7261 + if (list_empty(&zone->unaccepted_pages)) 7262 + return false; 7263 + 7264 + spin_lock_irqsave(&zone->lock, flags); 7265 + page = list_first_entry_or_null(&zone->unaccepted_pages, 7266 + struct page, lru); 7267 + if (!page) { 7268 + spin_unlock_irqrestore(&zone->lock, flags); 7269 + return false; 7270 + } 7271 + 7272 + list_del(&page->lru); 7273 + last = list_empty(&zone->unaccepted_pages); 7274 + 7275 + __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); 7276 + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); 7277 + spin_unlock_irqrestore(&zone->lock, flags); 7278 + 7279 + accept_page(page, MAX_ORDER); 7280 + 7281 + __free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL); 7282 + 7283 + if (last) 7284 + static_branch_dec(&zones_with_unaccepted_pages); 7285 + 7286 + return true; 7287 + } 7288 + 7289 + 
static bool try_to_accept_memory(struct zone *zone, unsigned int order) 7290 + { 7291 + long to_accept; 7292 + int ret = false; 7293 + 7294 + /* How much to accept to get to high watermark? */ 7295 + to_accept = high_wmark_pages(zone) - 7296 + (zone_page_state(zone, NR_FREE_PAGES) - 7297 + __zone_watermark_unusable_free(zone, order, 0)); 7298 + 7299 + /* Accept at least one page */ 7300 + do { 7301 + if (!try_to_accept_memory_one(zone)) 7302 + break; 7303 + ret = true; 7304 + to_accept -= MAX_ORDER_NR_PAGES; 7305 + } while (to_accept > 0); 7306 + 7307 + return ret; 7308 + } 7309 + 7310 + static inline bool has_unaccepted_memory(void) 7311 + { 7312 + return static_branch_unlikely(&zones_with_unaccepted_pages); 7313 + } 7314 + 7315 + static bool __free_unaccepted(struct page *page) 7316 + { 7317 + struct zone *zone = page_zone(page); 7318 + unsigned long flags; 7319 + bool first = false; 7320 + 7321 + if (!lazy_accept) 7322 + return false; 7323 + 7324 + spin_lock_irqsave(&zone->lock, flags); 7325 + first = list_empty(&zone->unaccepted_pages); 7326 + list_add_tail(&page->lru, &zone->unaccepted_pages); 7327 + __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); 7328 + __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); 7329 + spin_unlock_irqrestore(&zone->lock, flags); 7330 + 7331 + if (first) 7332 + static_branch_inc(&zones_with_unaccepted_pages); 7333 + 7334 + return true; 7335 + } 7336 + 7337 + #else 7338 + 7339 + static bool page_contains_unaccepted(struct page *page, unsigned int order) 7340 + { 7341 + return false; 7342 + } 7343 + 7344 + static void accept_page(struct page *page, unsigned int order) 7345 + { 7346 + } 7347 + 7348 + static bool try_to_accept_memory(struct zone *zone, unsigned int order) 7349 + { 7350 + return false; 7351 + } 7352 + 7353 + static inline bool has_unaccepted_memory(void) 7354 + { 7355 + return false; 7356 + } 7357 + 7358 + static bool __free_unaccepted(struct page *page) 7359 + { 7360 + BUILD_BUG(); 7361 + 
return false; 7362 + } 7363 + 7364 + #endif /* CONFIG_UNACCEPTED_MEMORY */
+3
mm/vmstat.c
··· 1180 1180 "nr_zspages", 1181 1181 #endif 1182 1182 "nr_free_cma", 1183 + #ifdef CONFIG_UNACCEPTED_MEMORY 1184 + "nr_unaccepted", 1185 + #endif 1183 1186 1184 1187 /* enum numa_stat_item counters */ 1185 1188 #ifdef CONFIG_NUMA