Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/boot: Avoid #VE during boot for TDX platforms

There are a few MSRs and control register bits that the kernel
normally needs to modify during boot. But, TDX disallows
modification of these registers to help provide consistent security
guarantees. Fortunately, TDX ensures that these are all in the correct
state before the kernel loads, which means the kernel does not need to
modify them.

The conditions to avoid are:

* Any writes to the EFER MSR
* Clearing CR4.MCE

This theoretically makes the guest boot more fragile. If, for instance,
EFER was set up incorrectly and a WRMSR was performed, it would trigger
an early exception panic or, if it happens before early exception
handling is set up, a triple fault. However, this is likely to trip up the guest
BIOS long before control reaches the kernel. In any case, these kinds
of problems are unlikely to occur in production environments, and
developers have good debug tools to fix them quickly.

Change the common boot code to work on TDX and non-TDX systems.
This should have no functional effect on non-TDX systems.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20220405232939.73860-24-kirill.shutemov@linux.intel.com

authored by

Sean Christopherson and committed by
Dave Hansen
77a512e3 9cf30606

+58 -6
+1
arch/x86/Kconfig
··· 884 884 depends on X86_X2APIC 885 885 select ARCH_HAS_CC_PLATFORM 886 886 select DYNAMIC_PHYSICAL_MASK 887 + select X86_MCE 887 888 help 888 889 Support running as a guest under Intel TDX. Without this support, 889 890 the guest kernel can not boot or run under TDX.
+18 -2
arch/x86/boot/compressed/head_64.S
··· 642 642 movl $MSR_EFER, %ecx 643 643 rdmsr 644 644 btsl $_EFER_LME, %eax 645 + /* Avoid writing EFER if no change was made (for TDX guest) */ 646 + jc 1f 645 647 wrmsr 646 - popl %edx 648 + 1: popl %edx 647 649 popl %ecx 648 650 651 + #ifdef CONFIG_X86_MCE 652 + /* 653 + * Preserve CR4.MCE if the kernel will enable #MC support. 654 + * Clearing MCE may fault in some environments (that also force #MC 655 + * support). Any machine check that occurs before #MC support is fully 656 + * configured will crash the system regardless of the CR4.MCE value set 657 + * here. 658 + */ 659 + movl %cr4, %eax 660 + andl $X86_CR4_MCE, %eax 661 + #else 662 + movl $0, %eax 663 + #endif 664 + 649 665 /* Enable PAE and LA57 (if required) paging modes */ 650 - movl $X86_CR4_PAE, %eax 666 + orl $X86_CR4_PAE, %eax 651 667 testl %edx, %edx 652 668 jz 1f 653 669 orl $X86_CR4_LA57, %eax
+1 -1
arch/x86/boot/compressed/pgtable.h
··· 6 6 #define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 7 7 8 8 #define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE 9 - #define TRAMPOLINE_32BIT_CODE_SIZE 0x70 9 + #define TRAMPOLINE_32BIT_CODE_SIZE 0x80 10 10 11 11 #define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE 12 12
+26 -2
arch/x86/kernel/head_64.S
··· 142 142 addq $(init_top_pgt - __START_KERNEL_map), %rax 143 143 1: 144 144 145 + #ifdef CONFIG_X86_MCE 146 + /* 147 + * Preserve CR4.MCE if the kernel will enable #MC support. 148 + * Clearing MCE may fault in some environments (that also force #MC 149 + * support). Any machine check that occurs before #MC support is fully 150 + * configured will crash the system regardless of the CR4.MCE value set 151 + * here. 152 + */ 153 + movq %cr4, %rcx 154 + andl $X86_CR4_MCE, %ecx 155 + #else 156 + movl $0, %ecx 157 + #endif 158 + 145 159 /* Enable PAE mode, PGE and LA57 */ 146 - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx 160 + orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx 147 161 #ifdef CONFIG_X86_5LEVEL 148 162 testl $1, __pgtable_l5_enabled(%rip) 149 163 jz 1f ··· 263 249 /* Setup EFER (Extended Feature Enable Register) */ 264 250 movl $MSR_EFER, %ecx 265 251 rdmsr 252 + /* 253 + * Preserve current value of EFER for comparison and to skip 254 + * EFER writes if no change was made (for TDX guest) 255 + */ 256 + movl %eax, %edx 266 257 btsl $_EFER_SCE, %eax /* Enable System Call */ 267 258 btl $20,%edi /* No Execute supported? */ 268 259 jnc 1f 269 260 btsl $_EFER_NX, %eax 270 261 btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) 271 - 1: wrmsr /* Make changes effective */ 272 262 263 + /* Avoid writing EFER if no change was made (for TDX guest) */ 264 + 1: cmpl %edx, %eax 265 + je 1f 266 + xor %edx, %edx 267 + wrmsr /* Make changes effective */ 268 + 1: 273 269 /* Setup cr0 */ 274 270 movl $CR0_STATE, %eax 275 271 /* Make changes effective */
+12 -1
arch/x86/realmode/rm/trampoline_64.S
··· 143 143 movl %eax, %cr3 144 144 145 145 # Set up EFER 146 + movl $MSR_EFER, %ecx 147 + rdmsr 148 + /* 149 + * Skip writing to EFER if the register already has desired 150 + * value (to avoid #VE for the TDX guest). 151 + */ 152 + cmp pa_tr_efer, %eax 153 + jne .Lwrite_efer 154 + cmp pa_tr_efer + 4, %edx 155 + je .Ldone_efer 156 + .Lwrite_efer: 146 157 movl pa_tr_efer, %eax 147 158 movl pa_tr_efer + 4, %edx 148 - movl $MSR_EFER, %ecx 149 159 wrmsr 150 160 161 + .Ldone_efer: 151 162 # Enable paging and in turn activate Long Mode. 152 163 movl $CR0_STATE, %eax 153 164 movl %eax, %cr0