Merge tag 'kvm-x86-fixes-6.14-rcN.2' of https://github.com/kvm-x86/linux into HEAD

KVM x86 fixes for 6.14-rcN #2

- Set RFLAGS.IF in C code on SVM to get VMRUN out of the STI shadow.

- Ensure DEBUGCTL is context switched on AMD to avoid running the guest with
the host's value, which can lead to unexpected bus lock #DBs (a distilled
sketch of the save/restore flow follows this list).

- Suppress DEBUGCTL.BTF on AMD (to match Intel), as KVM doesn't properly
emulate BTF. KVM's lack of context switching has meant BTF has always been
broken to some extent.

- Always save DR masks for SNP vCPUs if DebugSwap is *supported*, as the guest
can enable DebugSwap without KVM's knowledge.

- Fix a bug in mmu_stress_test where a vCPU could finish the "writes to RO
memory" phase without actually generating a write-protection fault.

- Fix a printf() goof in the SEV smoke test that causes build failures with
-Werror.

- Explicitly zero EAX and EBX in CPUID.0x8000_0022 output when PERFMON_V2
isn't supported by KVM.
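
As flagged above, here is a distilled userspace model of the DEBUGCTL
save/write/restore flow that the diffs below implement. The fake MSR and the
helper names are purely illustrative, not KVM or kernel APIs:

  /* Build with: gcc -Wall -o debugctl_model debugctl_model.c */
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  static uint64_t fake_debugctl;   /* stands in for MSR_IA32_DEBUGCTLMSR */

  static uint64_t rdmsr_debugctl(void)     { return fake_debugctl; }
  static void wrmsr_debugctl(uint64_t val) { fake_debugctl = val; }

  struct vcpu {
          uint64_t host_debugctl;   /* snapshot taken before entry */
          uint64_t guest_debugctl;  /* value tracked in the VMCB/VMCS */
          bool hw_swaps_debugctl;   /* e.g. SVM LBR virtualization enabled */
  };

  static void enter_guest(struct vcpu *v)
  {
          v->host_debugctl = rdmsr_debugctl();

          /* Only touch the MSR if hardware won't swap it and the values differ. */
          if (!v->hw_swaps_debugctl && v->host_debugctl != v->guest_debugctl)
                  wrmsr_debugctl(v->guest_debugctl);
  }

  static void exit_guest(struct vcpu *v)
  {
          if (!v->hw_swaps_debugctl && v->host_debugctl != v->guest_debugctl)
                  wrmsr_debugctl(v->host_debugctl);
  }

  int main(void)
  {
          struct vcpu v = { .guest_debugctl = 0x1 /* LBR */ };

          fake_debugctl = 0x2;    /* pretend the host has, e.g., BTF set */
          enter_guest(&v);
          printf("in guest:  DEBUGCTL=%#llx\n", (unsigned long long)fake_debugctl);
          exit_guest(&v);
          printf("back host: DEBUGCTL=%#llx\n", (unsigned long long)fake_debugctl);
          return 0;
  }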

+91 -35
+1
arch/x86/include/asm/kvm_host.h
···
         u32 pkru;
         u32 hflags;
         u64 efer;
+        u64 host_debugctl;
         u64 apic_base;
         struct kvm_lapic *apic; /* kernel irqchip context */
         bool load_eoi_exitmap_pending;
+1 -1
arch/x86/kvm/cpuid.c
···
 
         entry->ecx = entry->edx = 0;
         if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
-                entry->eax = entry->ebx;
+                entry->eax = entry->ebx = 0;
                 break;
         }
 
+17 -7
arch/x86/kvm/svm/sev.c
···
 
 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
 {
+        struct kvm *kvm = svm->vcpu.kvm;
+
         /*
          * All host state for SEV-ES guests is categorized into three swap types
          * based on how it is handled by hardware during a world switch:
···
 
         /*
          * If DebugSwap is enabled, debug registers are loaded but NOT saved by
-         * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
-         * saves and loads debug registers (Type-A).
+         * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
+         * not save or load debug registers. Sadly, KVM can't prevent SNP
+         * guests from lying about DebugSwap on secondary vCPUs, i.e. the
+         * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
+         * the guest has actually enabled (or not!) in the VMSA.
+         *
+         * If DebugSwap is *possible*, save the masks so that they're restored
+         * if the guest enables DebugSwap. But for the DRs themselves, do NOT
+         * rely on the CPU to restore the host values; KVM will restore them as
+         * needed in common code, via hw_breakpoint_restore(). Note, KVM does
+         * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
+         * don't need to be restored per se, KVM just needs to ensure they are
+         * loaded with the correct values *if* the CPU writes the MSRs.
          */
-        if (sev_vcpu_has_debug_swap(svm)) {
-                hostsa->dr0 = native_get_debugreg(0);
-                hostsa->dr1 = native_get_debugreg(1);
-                hostsa->dr2 = native_get_debugreg(2);
-                hostsa->dr3 = native_get_debugreg(3);
+        if (sev_vcpu_has_debug_swap(svm) ||
+            (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
                 hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
                 hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
                 hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
+49
arch/x86/kvm/svm/svm.c
···
                         kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
                         break;
                 }
+
+                /*
+                 * AMD changed the architectural behavior of bits 5:2. On CPUs
+                 * without BusLockTrap, bits 5:2 control "external pins", but
+                 * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
+                 * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
+                 * the guest to set bits 5:2 despite not actually virtualizing
+                 * Performance-Monitoring/Breakpoint external pins. Drop bits
+                 * 5:2 for backwards compatibility.
+                 */
+                data &= ~GENMASK(5, 2);
+
+                /*
+                 * Suppress BTF as KVM doesn't virtualize BTF, but there's no
+                 * way to communicate lack of support to the guest.
+                 */
+                if (data & DEBUGCTLMSR_BTF) {
+                        kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+                        data &= ~DEBUGCTLMSR_BTF;
+                }
+
                 if (data & DEBUGCTL_RESERVED_BITS)
                         return 1;
 
···
 
         guest_state_enter_irqoff();
 
+        /*
+         * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
+         * VMRUN controls whether or not physical IRQs are masked (KVM always
+         * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
+         * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
+         * into guest state if delivery of an event during VMRUN triggers a
+         * #VMEXIT, and the guest_state transitions already tell lockdep that
+         * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
+         * this path, so IRQs aren't actually unmasked while running host code.
+         */
+        raw_local_irq_enable();
+
         amd_clear_divider();
 
         if (sev_es_guest(vcpu->kvm))
···
                                       sev_es_host_save_area(sd));
         else
                 __svm_vcpu_run(svm, spec_ctrl_intercepted);
+
+        raw_local_irq_disable();
 
         guest_state_exit_irqoff();
 }
···
         clgi();
         kvm_load_guest_xsave_state(vcpu);
 
+        /*
+         * Hardware only context switches DEBUGCTL if LBR virtualization is
+         * enabled. Manually load DEBUGCTL if necessary (and restore it after
+         * VM-Exit), as running with the host's DEBUGCTL can negatively affect
+         * guest state and can even be fatal, e.g. due to Bus Lock Detect.
+         */
+        if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+            vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+                update_debugctlmsr(svm->vmcb->save.dbgctl);
+
         kvm_wait_lapic_expire(vcpu);
 
         /*
···
 
         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+
+        if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+            vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+                update_debugctlmsr(vcpu->arch.host_debugctl);
 
         kvm_load_host_xsave_state(vcpu);
         stgi();
+1 -1
arch/x86/kvm/svm/svm.h
···
 /* svm.c */
 #define MSR_INVALID                     0xffffffffU
 
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
 
 extern bool dump_invalid_vmcb;
 
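
With the new define, DEBUGCTLMSR_LBR (bit 0) is the only guest-writable bit
left once the svm.c hunk above has dropped bits 5:2 and suppressed BTF. A
quick standalone sanity check of the mask arithmetic (illustrative snippet,
not part of KVM; bit positions per the architectural layout, LBR = bit 0,
BTF = bit 1):

  #include <assert.h>
  #include <stdint.h>

  #define DEBUGCTLMSR_LBR         (1ULL << 0)
  #define DEBUGCTLMSR_BTF         (1ULL << 1)
  #define DEBUGCTL_RESERVED_BITS  (~DEBUGCTLMSR_LBR)

  int main(void)
  {
          /* LBR alone passes the reserved-bits check... */
          assert(!(DEBUGCTLMSR_LBR & DEBUGCTL_RESERVED_BITS));

          /*
           * ...while any other bit trips it. Note, in the new svm_set_msr()
           * flow, BTF and bits 5:2 are cleared before this check runs, so
           * guest writes of those bits are dropped rather than rejected.
           */
          assert(DEBUGCTLMSR_BTF & DEBUGCTL_RESERVED_BITS);
          assert((1ULL << 6) & DEBUGCTL_RESERVED_BITS);
          return 0;
  }
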
+1 -9
arch/x86/kvm/svm/vmenter.S
···
         mov VCPU_RDI(%_ASM_DI), %_ASM_DI
 
         /* Enter guest mode */
-        sti
-
 3:      vmrun %_ASM_AX
 4:
-        cli
-
         /* Pop @svm to RAX while it's the only available register. */
         pop %_ASM_AX
 
···
         mov KVM_VMCB_pa(%rax), %rax
 
         /* Enter guest mode */
-        sti
-
 1:      vmrun %rax
-
-2:      cli
-
+2:
         /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
         FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
 
+2 -6
arch/x86/kvm/vmx/vmx.c
···
  */
 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-        struct vcpu_vmx *vmx = to_vmx(vcpu);
-
         if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
                 shrink_ple_window(vcpu);
 
         vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
 
         vmx_vcpu_pi_load(vcpu, cpu);
-
-        vmx->host_debugctlmsr = get_debugctlmsr();
 }
 
 void vmx_vcpu_put(struct kvm_vcpu *vcpu)
···
         }
 
         /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
-        if (vmx->host_debugctlmsr)
-                update_debugctlmsr(vmx->host_debugctlmsr);
+        if (vcpu->arch.host_debugctl)
+                update_debugctlmsr(vcpu->arch.host_debugctl);
 
 #ifndef CONFIG_X86_64
         /*
-2
arch/x86/kvm/vmx/vmx.h
···
         /* apic deadline value in host tsc */
         u64 hv_deadline_tsc;
 
-        unsigned long host_debugctlmsr;
-
         /*
          * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
          * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
+2
arch/x86/kvm/x86.c
···
                 set_debugreg(0, 7);
         }
 
+        vcpu->arch.host_debugctl = get_debugctlmsr();
+
         guest_timing_enter_irqoff();
 
         for (;;) {
+13 -8
tools/testing/selftests/kvm/mmu_stress_test.c
···
 #include "ucall_common.h"
 
 static bool mprotect_ro_done;
+static bool all_vcpus_hit_ro_fault;
 
 static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
 {
···
 
         /*
          * Write to the region while mprotect(PROT_READ) is underway. Keep
-         * looping until the memory is guaranteed to be read-only, otherwise
-         * vCPUs may complete their writes and advance to the next stage
-         * prematurely.
+         * looping until the memory is guaranteed to be read-only and a fault
+         * has occurred, otherwise vCPUs may complete their writes and advance
+         * to the next stage prematurely.
          *
          * For architectures that support skipping the faulting instruction,
          * generate the store via inline assembly to ensure the exact length
···
 #else
                 vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
 #endif
-        } while (!READ_ONCE(mprotect_ro_done));
+        } while (!READ_ONCE(mprotect_ro_done) || !READ_ONCE(all_vcpus_hit_ro_fault));
 
         /*
          * Only architectures that write the entire range can explicitly sync,
···
 
 static int nr_vcpus;
 static atomic_t rendezvous;
+static atomic_t nr_ro_faults;
 
 static void rendezvous_with_boss(void)
 {
···
          * be stuck on the faulting instruction for other architectures. Go to
          * stage 3 without a rendezvous
          */
-        do {
-                r = _vcpu_run(vcpu);
-        } while (!r);
+        r = _vcpu_run(vcpu);
         TEST_ASSERT(r == -1 && errno == EFAULT,
                     "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno);
+
+        atomic_inc(&nr_ro_faults);
+        if (atomic_read(&nr_ro_faults) == nr_vcpus) {
+                WRITE_ONCE(all_vcpus_hit_ro_fault, true);
+                sync_global_to_guest(vm, all_vcpus_hit_ro_fault);
+        }
 
 #if defined(__x86_64__) || defined(__aarch64__)
         /*
···
         rendezvous_with_vcpus(&time_run2, "run 2");
 
         mprotect(mem, slot_size, PROT_READ);
-        usleep(10);
         mprotect_ro_done = true;
         sync_global_to_guest(vm, mprotect_ro_done);
 
+2
tools/testing/selftests/kvm/x86/nested_exceptions_test.c
···
 
         GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector));
         GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code);
+        GUEST_ASSERT(!ctrl->int_state);
 }
 
 static void l1_svm_code(struct svm_test_data *svm)
···
         GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI);
         GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector);
         GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code);
+        GUEST_ASSERT(!vmreadz(GUEST_INTERRUPTIBILITY_INFO));
 }
 
 static void l1_vmx_code(struct vmx_pages *vmx)
+2 -1
tools/testing/selftests/kvm/x86/sev_smoke_test.c
···
         bool bad = false;
         for (i = 0; i < 4095; i++) {
                 if (from_host[i] != from_guest[i]) {
-                        printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]);
+                        printf("mismatch at %u | %02hhx %02hhx\n",
+                               i, from_host[i], from_guest[i]);
                         bad = true;
                 }
         }
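
The printf() fix above resolves a -Wformat error: the loop index is an int,
while "%02hhx" expects an unsigned char, and -Werror turns the mismatch into a
build failure. A standalone illustration (hypothetical snippet, not part of
the selftest; exact diagnostic wording varies by compiler):

  /* Build with: gcc -Wall -Werror format_demo.c */
  #include <stdio.h>

  int main(void)
  {
          unsigned char byte = 0xab;
          int i = 42;

  #ifdef BROKEN
          /* -Wformat: "%02hhx" expects unsigned char, but 'i' is an int. */
          printf("mismatch at %02hhx | %02hhx\n", i, byte);
  #endif

          /* Matching conversions compile cleanly under -Werror. */
          printf("mismatch at %u | %02hhx\n", (unsigned int)i, byte);
          return 0;
  }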