Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: x86: Unify L1TF flushing under per-CPU variable

Currently the tracking of the need to flush L1D for L1TF is tracked by
two bits: one per-CPU and one per-vCPU.

The per-vCPU bit is always set when the vCPU shows up on a core, so
there is no interesting state that's truly per-vCPU. Indeed, this is a
requirement, since L1D is a part of the physical CPU.

So simplify this by combining the two bits.

The vCPU bit was being written from preemption-enabled regions. To play
nice with those cases, wrap all calls from KVM and use a raw write so that
requesting a flush with preemption enabled doesn't trigger what would
effectively be DEBUG_PREEMPT false positives. Preemption doesn't need to
be disabled, as kvm_arch_vcpu_load() will mark the new CPU as needing a
flush if the vCPU task is migrated, or if userspace runs the vCPU on a
different task.

Signed-off-by: Brendan Jackman <jackmanb@google.com>
[sean: put raw write in KVM instead of in a hardirq.h variant]
Link: https://patch.msgid.link/20251113233746.1703361-10-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>

authored by

Brendan Jackman and committed by
Sean Christopherson
38ee66cb 05bd6395

+24 -23
-3
arch/x86/include/asm/kvm_host.h
··· 1055 1055 /* be preempted when it's in kernel-mode(cpl=0) */ 1056 1056 bool preempted_in_kernel; 1057 1057 1058 - /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ 1059 - bool l1tf_flush_l1d; 1060 - 1061 1058 /* Host CPU on which VM-entry was most recently attempted */ 1062 1059 int last_vmentry_cpu; 1063 1060
+1 -1
arch/x86/kvm/mmu/mmu.c
··· 4859 4859 */ 4860 4860 BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); 4861 4861 4862 - vcpu->arch.l1tf_flush_l1d = true; 4862 + kvm_request_l1tf_flush_l1d(); 4863 4863 if (!flags) { 4864 4864 trace_kvm_page_fault(vcpu, fault_address, error_code); 4865 4865
+1 -1
arch/x86/kvm/vmx/nested.c
··· 3880 3880 goto vmentry_failed; 3881 3881 3882 3882 /* Hide L1D cache contents from the nested guest. */ 3883 - vmx->vcpu.arch.l1tf_flush_l1d = true; 3883 + kvm_request_l1tf_flush_l1d(); 3884 3884 3885 3885 /* 3886 3886 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
+5 -15
arch/x86/kvm/vmx/vmx.c
··· 395 395 * 'always' 396 396 */ 397 397 if (static_branch_likely(&vmx_l1d_flush_cond)) { 398 - bool flush_l1d; 399 - 400 398 /* 401 - * Clear the per-vcpu flush bit, it gets set again if the vCPU 399 + * Clear the per-cpu flush bit, it gets set again if the vCPU 402 400 * is reloaded, i.e. if the vCPU is scheduled out or if KVM 403 401 * exits to userspace, or if KVM reaches one of the unsafe 404 - * VMEXIT handlers, e.g. if KVM calls into the emulator. 402 + * VMEXIT handlers, e.g. if KVM calls into the emulator, 403 + * or from the interrupt handlers. 405 404 */ 406 - flush_l1d = vcpu->arch.l1tf_flush_l1d; 407 - vcpu->arch.l1tf_flush_l1d = false; 408 - 409 - /* 410 - * Clear the per-cpu flush bit, it gets set again from 411 - * the interrupt handlers. 412 - */ 413 - flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 414 - kvm_clear_cpu_l1tf_flush_l1d(); 415 - 416 - if (!flush_l1d) 405 + if (!kvm_get_cpu_l1tf_flush_l1d()) 417 406 return; 407 + kvm_clear_cpu_l1tf_flush_l1d(); 418 408 } 419 409 420 410 vcpu->stat.l1d_flush++;
+3 -3
arch/x86/kvm/x86.c
··· 5156 5156 { 5157 5157 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 5158 5158 5159 - vcpu->arch.l1tf_flush_l1d = true; 5159 + kvm_request_l1tf_flush_l1d(); 5160 5160 5161 5161 if (vcpu->scheduled_out && pmu->version && pmu->event_count) { 5162 5162 pmu->need_cleanup = true; ··· 7966 7966 unsigned int bytes, struct x86_exception *exception) 7967 7967 { 7968 7968 /* kvm_write_guest_virt_system can pull in tons of pages. */ 7969 - vcpu->arch.l1tf_flush_l1d = true; 7969 + kvm_request_l1tf_flush_l1d(); 7970 7970 7971 7971 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 7972 7972 PFERR_WRITE_MASK, exception); ··· 9374 9374 return handle_emulation_failure(vcpu, emulation_type); 9375 9375 } 9376 9376 9377 - vcpu->arch.l1tf_flush_l1d = true; 9377 + kvm_request_l1tf_flush_l1d(); 9378 9378 9379 9379 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 9380 9380 kvm_clear_exception_queue(vcpu);
+14
arch/x86/kvm/x86.h
··· 420 420 return !(kvm->arch.disabled_quirks & quirk); 421 421 } 422 422 423 + static __always_inline void kvm_request_l1tf_flush_l1d(void) 424 + { 425 + #if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) 426 + /* 427 + * Use a raw write to set the per-CPU flag, as KVM will ensure a flush 428 + * even if preemption is currently enabled. If the current vCPU task 429 + * is migrated to a different CPU (or userspace runs the vCPU on a 430 + * different task) before the next VM-Entry, then kvm_arch_vcpu_load() 431 + * will request a flush on the new CPU. 432 + */ 433 + raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); 434 + #endif 435 + } 436 + 423 437 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 424 438 425 439 u64 get_kvmclock_ns(struct kvm *kvm);