Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'x86_tdx_for_6.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 TDX updates from Dave Hansen:
"Avoid direct HLT instruction execution in TDX guests.

TDX guests aren't expected to use the HLT instruction directly. It
causes a virtualization exception (#VE). While the #VE _can_ be
handled, the current handling is slow and buggy and the easiest thing
is just to avoid HLT in the first place. Plus, the kernel already has
paravirt infrastructure that makes it relatively painless.

Make TDX guests require paravirt and add some TDX-specific paravirt
handlers which avoid HLT in the normal halt routines. Also add a
warning in case another HLT sneaks in.

There was a report that this leads to a "major performance
improvement" on specjbb2015, probably from avoiding the extra #VE
overhead and the missed wakeups caused by the buggy HLT handling"

* tag 'x86_tdx_for_6.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/tdx: Emit warning if IRQs are enabled during HLT #VE handling
x86/tdx: Fix arch_safe_halt() execution for TDX VMs
x86/paravirt: Move halt paravirt calls under CONFIG_PARAVIRT

+78 -40
+1 -0
arch/x86/Kconfig
··· 889 889 depends on X86_64 && CPU_SUP_INTEL 890 890 depends on X86_X2APIC 891 891 depends on EFI_STUB 892 + depends on PARAVIRT 892 893 select ARCH_HAS_CC_PLATFORM 893 894 select X86_MEM_ENCRYPT 894 895 select X86_MCE
+33 -1
arch/x86/coco/tdx/tdx.c
··· 14 14 #include <asm/ia32.h> 15 15 #include <asm/insn.h> 16 16 #include <asm/insn-eval.h> 17 + #include <asm/paravirt_types.h> 17 18 #include <asm/pgtable.h> 18 19 #include <asm/set_memory.h> 19 20 #include <asm/traps.h> ··· 393 392 { 394 393 const bool irq_disabled = irqs_disabled(); 395 394 395 + /* 396 + * HLT with IRQs enabled is unsafe, as an IRQ that is intended to be a 397 + * wake event may be consumed before requesting HLT emulation, leaving 398 + * the vCPU blocking indefinitely. 399 + */ 400 + if (WARN_ONCE(!irq_disabled, "HLT emulation with IRQs enabled")) 401 + return -EIO; 402 + 396 403 if (__halt(irq_disabled)) 397 404 return -EIO; 398 405 399 406 return ve_instr_len(ve); 400 407 } 401 408 402 - void __cpuidle tdx_safe_halt(void) 409 + void __cpuidle tdx_halt(void) 403 410 { 404 411 const bool irq_disabled = false; 405 412 ··· 416 407 */ 417 408 if (__halt(irq_disabled)) 418 409 WARN_ONCE(1, "HLT instruction emulation failed\n"); 410 + } 411 + 412 + static void __cpuidle tdx_safe_halt(void) 413 + { 414 + tdx_halt(); 415 + /* 416 + * "__cpuidle" section doesn't support instrumentation, so stick 417 + * with raw_* variant that avoids tracing hooks. 418 + */ 419 + raw_local_irq_enable(); 419 420 } 420 421 421 422 static int read_msr(struct pt_regs *regs, struct ve_info *ve) ··· 1127 1108 1128 1109 x86_platform.guest.enc_kexec_begin = tdx_kexec_begin; 1129 1110 x86_platform.guest.enc_kexec_finish = tdx_kexec_finish; 1111 + 1112 + /* 1113 + * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that 1114 + * will enable interrupts before HLT TDCALL invocation if executed 1115 + * in STI-shadow, possibly resulting in missed wakeup events. 1116 + * 1117 + * Modify all possible HLT execution paths to use TDX specific routines 1118 + * that directly execute TDCALL and toggle the interrupt state as 1119 + * needed after TDCALL completion. This also reduces HLT related #VEs 1120 + * in addition to having a reliable halt logic execution. 
1121 + */ 1122 + pv_ops.irq.safe_halt = tdx_safe_halt; 1123 + pv_ops.irq.halt = tdx_halt; 1130 1124 1131 1125 /* 1132 1126 * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
+22 -18
arch/x86/include/asm/irqflags.h
··· 76 76 77 77 #endif 78 78 79 + #ifndef CONFIG_PARAVIRT 80 + #ifndef __ASSEMBLY__ 81 + /* 82 + * Used in the idle loop; sti takes one instruction cycle 83 + * to complete: 84 + */ 85 + static __always_inline void arch_safe_halt(void) 86 + { 87 + native_safe_halt(); 88 + } 89 + 90 + /* 91 + * Used when interrupts are already enabled or to 92 + * shutdown the processor: 93 + */ 94 + static __always_inline void halt(void) 95 + { 96 + native_halt(); 97 + } 98 + #endif /* __ASSEMBLY__ */ 99 + #endif /* CONFIG_PARAVIRT */ 100 + 79 101 #ifdef CONFIG_PARAVIRT_XXL 80 102 #include <asm/paravirt.h> 81 103 #else ··· 117 95 static __always_inline void arch_local_irq_enable(void) 118 96 { 119 97 native_irq_enable(); 120 - } 121 - 122 - /* 123 - * Used in the idle loop; sti takes one instruction cycle 124 - * to complete: 125 - */ 126 - static __always_inline void arch_safe_halt(void) 127 - { 128 - native_safe_halt(); 129 - } 130 - 131 - /* 132 - * Used when interrupts are already enabled or to 133 - * shutdown the processor: 134 - */ 135 - static __always_inline void halt(void) 136 - { 137 - native_halt(); 138 98 } 139 99 140 100 /*
+10 -10
arch/x86/include/asm/paravirt.h
··· 102 102 PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); 103 103 } 104 104 105 + static __always_inline void arch_safe_halt(void) 106 + { 107 + PVOP_VCALL0(irq.safe_halt); 108 + } 109 + 110 + static inline void halt(void) 111 + { 112 + PVOP_VCALL0(irq.halt); 113 + } 114 + 105 115 #ifdef CONFIG_PARAVIRT_XXL 106 116 static inline void load_sp0(unsigned long sp0) 107 117 { ··· 173 163 static inline void __write_cr4(unsigned long x) 174 164 { 175 165 PVOP_VCALL1(cpu.write_cr4, x); 176 - } 177 - 178 - static __always_inline void arch_safe_halt(void) 179 - { 180 - PVOP_VCALL0(irq.safe_halt); 181 - } 182 - 183 - static inline void halt(void) 184 - { 185 - PVOP_VCALL0(irq.halt); 186 166 } 187 167 188 168 static inline u64 paravirt_read_msr(unsigned msr)
+1 -2
arch/x86/include/asm/paravirt_types.h
··· 120 120 struct paravirt_callee_save save_fl; 121 121 struct paravirt_callee_save irq_disable; 122 122 struct paravirt_callee_save irq_enable; 123 - 123 + #endif 124 124 void (*safe_halt)(void); 125 125 void (*halt)(void); 126 - #endif 127 126 } __no_randomize_layout; 128 127 129 128 struct pv_mmu_ops {
+2 -2
arch/x86/include/asm/tdx.h
··· 58 58 59 59 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); 60 60 61 - void tdx_safe_halt(void); 61 + void tdx_halt(void); 62 62 63 63 bool tdx_early_handle_ve(struct pt_regs *regs); 64 64 ··· 72 72 #else 73 73 74 74 static inline void tdx_early_init(void) { }; 75 - static inline void tdx_safe_halt(void) { }; 75 + static inline void tdx_halt(void) { }; 76 76 77 77 static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; } 78 78
+8 -6
arch/x86/kernel/paravirt.c
··· 75 75 static_call_update(pv_sched_clock, func); 76 76 } 77 77 78 + static noinstr void pv_native_safe_halt(void) 79 + { 80 + native_safe_halt(); 81 + } 82 + 78 83 #ifdef CONFIG_PARAVIRT_XXL 79 84 static noinstr void pv_native_write_cr2(unsigned long val) 80 85 { ··· 104 99 static noinstr void pv_native_set_debugreg(int regno, unsigned long val) 105 100 { 106 101 native_set_debugreg(regno, val); 107 - } 108 - 109 - static noinstr void pv_native_safe_halt(void) 110 - { 111 - native_safe_halt(); 112 102 } 113 103 #endif 114 104 ··· 161 161 .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl), 162 162 .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), 163 163 .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), 164 + #endif /* CONFIG_PARAVIRT_XXL */ 165 + 166 + /* Irq HLT ops. */ 164 167 .irq.safe_halt = pv_native_safe_halt, 165 168 .irq.halt = native_halt, 166 - #endif /* CONFIG_PARAVIRT_XXL */ 167 169 168 170 /* Mmu ops. */ 169 171 .mmu.flush_tlb_user = native_flush_tlb_local,
+1 -1
arch/x86/kernel/process.c
··· 939 939 static_call_update(x86_idle, mwait_idle); 940 940 } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { 941 941 pr_info("using TDX aware idle routine\n"); 942 - static_call_update(x86_idle, tdx_safe_halt); 942 + static_call_update(x86_idle, tdx_halt); 943 943 } else { 944 944 static_call_update(x86_idle, default_idle); 945 945 }