Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'kvm-x86-misc-6.12' of https://github.com/kvm-x86/linux into HEAD

KVM x86 misc changes for 6.12

- Advertise AVX10.1 to userspace (effectively prep work for the "real" AVX10
functionality that is on the horizon).

- Rework common MSR handling code to suppress errors on userspace accesses to
unsupported-but-advertised MSRs. This will allow removing (almost?) all of
KVM's exemptions for userspace access to MSRs that shouldn't exist based on
the vCPU model (the actual cleanup is non-trivial future work).

- Rework KVM's handling of x2APIC ICR, again, because AMD (x2AVIC) splits the
64-bit value into the legacy ICR and ICR2 storage, whereas Intel (APICv)
stores the entire 64-bit value at the ICR offset.

- Fix a bug where KVM would fail to exit to userspace if an exit to userspace
was triggered by a fastpath exit handler.

- Add fastpath handling of HLT VM-Exit to expedite re-entering the guest when
there's already a pending wake event at the time of the exit.

- Finally fix the RSM vs. nested VM-Enter WARN by forcing the vCPU out of
guest mode prior to signalling SHUTDOWN (architecturally, the SHUTDOWN is
supposed to hit L1, not L2).

+679 -502
+1
arch/x86/include/asm/cpuid.h
··· 179 179 case 0x1d: 180 180 case 0x1e: 181 181 case 0x1f: 182 + case 0x24: 182 183 case 0x8000001d: 183 184 return true; 184 185 }
+1 -1
arch/x86/include/asm/kvm-x86-ops.h
··· 125 125 KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) 126 126 KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) 127 127 KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) 128 - KVM_X86_OP(get_msr_feature) 128 + KVM_X86_OP(get_feature_msr) 129 129 KVM_X86_OP(check_emulate_instruction) 130 130 KVM_X86_OP(apic_init_signal_blocked) 131 131 KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
+4 -1
arch/x86/include/asm/kvm_host.h
··· 212 212 EXIT_FASTPATH_NONE, 213 213 EXIT_FASTPATH_REENTER_GUEST, 214 214 EXIT_FASTPATH_EXIT_HANDLED, 215 + EXIT_FASTPATH_EXIT_USERSPACE, 215 216 }; 216 217 typedef enum exit_fastpath_completion fastpath_t; 217 218 ··· 1731 1730 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 1732 1731 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 1733 1732 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 1733 + 1734 + const bool x2apic_icr_is_split; 1734 1735 const unsigned long required_apicv_inhibits; 1735 1736 bool allow_apicv_in_x2apic_without_x2apic_virtualization; 1736 1737 void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); ··· 1812 1809 int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); 1813 1810 void (*guest_memory_reclaimed)(struct kvm *kvm); 1814 1811 1815 - int (*get_msr_feature)(struct kvm_msr_entry *entry); 1812 + int (*get_feature_msr)(u32 msr, u64 *data); 1816 1813 1817 1814 int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, 1818 1815 void *insn, int insn_len);
+28 -2
arch/x86/kvm/cpuid.c
··· 705 705 706 706 kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, 707 707 F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | 708 - F(AMX_COMPLEX) 708 + F(AMX_COMPLEX) | F(AVX10) 709 709 ); 710 710 711 711 kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, ··· 719 719 720 720 kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, 721 721 SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) 722 + ); 723 + 724 + kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX, 725 + F(AVX10_128) | F(AVX10_256) | F(AVX10_512) 722 726 ); 723 727 724 728 kvm_cpu_cap_mask(CPUID_8000_0001_ECX, ··· 953 949 switch (function) { 954 950 case 0: 955 951 /* Limited to the highest leaf implemented in KVM. */ 956 - entry->eax = min(entry->eax, 0x1fU); 952 + entry->eax = min(entry->eax, 0x24U); 957 953 break; 958 954 case 1: 959 955 cpuid_entry_override(entry, CPUID_1_EDX); ··· 1178 1174 break; 1179 1175 } 1180 1176 break; 1177 + case 0x24: { 1178 + u8 avx10_version; 1179 + 1180 + if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) { 1181 + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; 1182 + break; 1183 + } 1184 + 1185 + /* 1186 + * The AVX10 version is encoded in EBX[7:0]. Note, the version 1187 + * is guaranteed to be >=1 if AVX10 is supported. Note #2, the 1188 + * version needs to be captured before overriding EBX features! 1189 + */ 1190 + avx10_version = min_t(u8, entry->ebx & 0xff, 1); 1191 + cpuid_entry_override(entry, CPUID_24_0_EBX); 1192 + entry->ebx |= avx10_version; 1193 + 1194 + entry->eax = 0; 1195 + entry->ecx = 0; 1196 + entry->edx = 0; 1197 + break; 1198 + } 1181 1199 case KVM_CPUID_SIGNATURE: { 1182 1200 const u32 *sigptr = (const u32 *)KVM_SIGNATURE; 1183 1201 entry->eax = KVM_CPUID_FEATURES;
+53 -22
arch/x86/kvm/lapic.c
··· 1944 1944 u64 ns = 0; 1945 1945 ktime_t expire; 1946 1946 struct kvm_vcpu *vcpu = apic->vcpu; 1947 - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; 1947 + u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz; 1948 1948 unsigned long flags; 1949 1949 ktime_t now; 1950 1950 ··· 2453 2453 } 2454 2454 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); 2455 2455 2456 + #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) 2457 + 2458 + int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) 2459 + { 2460 + if (data & X2APIC_ICR_RESERVED_BITS) 2461 + return 1; 2462 + 2463 + /* 2464 + * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but 2465 + * only AMD requires it to be zero, Intel essentially just ignores the 2466 + * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, 2467 + * the CPU performs the reserved bits checks, i.e. the underlying CPU 2468 + * behavior will "win". Arbitrarily clear the BUSY bit, as there is no 2469 + * sane way to provide consistent behavior with respect to hardware. 2470 + */ 2471 + data &= ~APIC_ICR_BUSY; 2472 + 2473 + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); 2474 + if (kvm_x86_ops.x2apic_icr_is_split) { 2475 + kvm_lapic_set_reg(apic, APIC_ICR, data); 2476 + kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); 2477 + } else { 2478 + kvm_lapic_set_reg64(apic, APIC_ICR, data); 2479 + } 2480 + trace_kvm_apic_write(APIC_ICR, data); 2481 + return 0; 2482 + } 2483 + 2484 + static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) 2485 + { 2486 + if (kvm_x86_ops.x2apic_icr_is_split) 2487 + return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | 2488 + (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; 2489 + 2490 + return kvm_lapic_get_reg64(apic, APIC_ICR); 2491 + } 2492 + 2456 2493 /* emulate APIC access in a trap manner */ 2457 2494 void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) 2458 2495 { ··· 2507 2470 * maybe-unecessary write, and both are in the noise anyways. 
2508 2471 */ 2509 2472 if (apic_x2apic_mode(apic) && offset == APIC_ICR) 2510 - kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)); 2473 + WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); 2511 2474 else 2512 2475 kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); 2513 2476 } ··· 3027 2990 3028 2991 /* 3029 2992 * In x2APIC mode, the LDR is fixed and based on the id. And 3030 - * ICR is internally a single 64-bit register, but needs to be 3031 - * split to ICR+ICR2 in userspace for backwards compatibility. 2993 + * if the ICR is _not_ split, ICR is internally a single 64-bit 2994 + * register, but needs to be split to ICR+ICR2 in userspace for 2995 + * backwards compatibility. 3032 2996 */ 3033 - if (set) { 2997 + if (set) 3034 2998 *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); 3035 2999 3036 - icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | 3037 - (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; 3038 - __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); 3039 - } else { 3040 - icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); 3041 - __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); 3000 + if (!kvm_x86_ops.x2apic_icr_is_split) { 3001 + if (set) { 3002 + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | 3003 + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; 3004 + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); 3005 + } else { 3006 + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); 3007 + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); 3008 + } 3042 3009 } 3043 3010 } 3044 3011 ··· 3235 3194 return 0; 3236 3195 } 3237 3196 3238 - int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) 3239 - { 3240 - data &= ~APIC_ICR_BUSY; 3241 - 3242 - kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); 3243 - kvm_lapic_set_reg64(apic, APIC_ICR, data); 3244 - trace_kvm_apic_write(APIC_ICR, data); 3245 - return 0; 3246 - } 3247 - 3248 3197 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) 3249 3198 { 3250 
3199 u32 low; 3251 3200 3252 3201 if (reg == APIC_ICR) { 3253 - *data = kvm_lapic_get_reg64(apic, APIC_ICR); 3202 + *data = kvm_x2apic_icr_read(apic); 3254 3203 return 0; 3255 3204 } 3256 3205
-1
arch/x86/kvm/lapic.h
··· 96 96 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 97 97 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); 98 98 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 99 - u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 100 99 void kvm_recalculate_apic_map(struct kvm *kvm); 101 100 void kvm_apic_set_version(struct kvm_vcpu *vcpu); 102 101 void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
-2
arch/x86/kvm/mmu.h
··· 223 223 224 224 bool kvm_mmu_may_ignore_guest_pat(void); 225 225 226 - int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); 227 - 228 226 int kvm_mmu_post_init_vm(struct kvm *kvm); 229 227 void kvm_mmu_pre_destroy_vm(struct kvm *kvm); 230 228
-2
arch/x86/kvm/mmu/mmu_internal.h
··· 349 349 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); 350 350 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); 351 351 352 - void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); 353 - 354 352 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); 355 353 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); 356 354
+8
arch/x86/kvm/reverse_cpuid.h
··· 17 17 CPUID_8000_0007_EDX, 18 18 CPUID_8000_0022_EAX, 19 19 CPUID_7_2_EDX, 20 + CPUID_24_0_EBX, 20 21 NR_KVM_CPU_CAPS, 21 22 22 23 NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, ··· 47 46 #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) 48 47 #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) 49 48 #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) 49 + #define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19) 50 50 51 51 /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ 52 52 #define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) ··· 56 54 #define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) 57 55 #define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) 58 56 #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) 57 + 58 + /* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ 59 + #define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) 60 + #define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) 61 + #define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) 59 62 60 63 /* CPUID level 0x80000007 (EDX). */ 61 64 #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) ··· 97 90 [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, 98 91 [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, 99 92 [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, 93 + [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, 100 94 }; 101 95 102 96 /*
+19 -5
arch/x86/kvm/smm.c
··· 624 624 #endif 625 625 626 626 /* 627 - * Give leave_smm() a chance to make ISA-specific changes to the vCPU 628 - * state (e.g. enter guest mode) before loading state from the SMM 629 - * state-save area. 627 + * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest 628 + * mode should happen _after_ loading state from SMRAM. However, KVM 629 + * piggybacks the nested VM-Enter flows (which is wrong for many other 630 + * reasons), and so nSVM/nVMX would clobber state that is loaded from 631 + * SMRAM and from the VMCS/VMCB. 630 632 */ 631 633 if (kvm_x86_call(leave_smm)(vcpu, &smram)) 632 634 return X86EMUL_UNHANDLEABLE; 633 635 634 636 #ifdef CONFIG_X86_64 635 637 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) 636 - return rsm_load_state_64(ctxt, &smram.smram64); 638 + ret = rsm_load_state_64(ctxt, &smram.smram64); 637 639 else 638 640 #endif 639 - return rsm_load_state_32(ctxt, &smram.smram32); 641 + ret = rsm_load_state_32(ctxt, &smram.smram32); 642 + 643 + /* 644 + * If RSM fails and triggers shutdown, architecturally the shutdown 645 + * occurs *before* the transition to guest mode. But due to KVM's 646 + * flawed handling of RSM to L2 (see above), the vCPU may already be 647 + * in_guest_mode(). Force the vCPU out of guest mode before delivering 648 + * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit 649 + * that architecturally shouldn't be possible. 650 + */ 651 + if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu)) 652 + kvm_leave_nested(vcpu); 653 + return ret; 640 654 }
+29 -15
arch/x86/kvm/svm/svm.c
··· 2825 2825 return kvm_complete_insn_gp(vcpu, ret); 2826 2826 } 2827 2827 2828 - static int svm_get_msr_feature(struct kvm_msr_entry *msr) 2828 + static int svm_get_feature_msr(u32 msr, u64 *data) 2829 2829 { 2830 - msr->data = 0; 2830 + *data = 0; 2831 2831 2832 - switch (msr->index) { 2832 + switch (msr) { 2833 2833 case MSR_AMD64_DE_CFG: 2834 2834 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) 2835 - msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; 2835 + *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; 2836 2836 break; 2837 2837 default: 2838 - return KVM_MSR_RET_INVALID; 2838 + return KVM_MSR_RET_UNSUPPORTED; 2839 2839 } 2840 2840 2841 2841 return 0; ··· 3191 3191 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3192 3192 break; 3193 3193 case MSR_AMD64_DE_CFG: { 3194 - struct kvm_msr_entry msr_entry; 3194 + u64 supported_de_cfg; 3195 3195 3196 - msr_entry.index = msr->index; 3197 - if (svm_get_msr_feature(&msr_entry)) 3196 + if (svm_get_feature_msr(ecx, &supported_de_cfg)) 3198 3197 return 1; 3199 3198 3200 - /* Check the supported bits */ 3201 - if (data & ~msr_entry.data) 3199 + if (data & ~supported_de_cfg) 3202 3200 return 1; 3203 3201 3204 - /* Don't allow the guest to change a bit, #GP */ 3205 - if (!msr->host_initiated && (data ^ msr_entry.data)) 3202 + /* 3203 + * Don't let the guest change the host-programmed value. The 3204 + * MSR is very model specific, i.e. contains multiple bits that 3205 + * are completely unknown to KVM, and the one bit known to KVM 3206 + * is simply a reflection of hardware capabilities. 
3207 + */ 3208 + if (!msr->host_initiated && data != svm->msr_decfg) 3206 3209 return 1; 3207 3210 3208 3211 svm->msr_decfg = data; ··· 4159 4156 4160 4157 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 4161 4158 { 4159 + struct vcpu_svm *svm = to_svm(vcpu); 4160 + 4162 4161 if (is_guest_mode(vcpu)) 4163 4162 return EXIT_FASTPATH_NONE; 4164 4163 4165 - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && 4166 - to_svm(vcpu)->vmcb->control.exit_info_1) 4164 + switch (svm->vmcb->control.exit_code) { 4165 + case SVM_EXIT_MSR: 4166 + if (!svm->vmcb->control.exit_info_1) 4167 + break; 4167 4168 return handle_fastpath_set_msr_irqoff(vcpu); 4169 + case SVM_EXIT_HLT: 4170 + return handle_fastpath_hlt(vcpu); 4171 + default: 4172 + break; 4173 + } 4168 4174 4169 4175 return EXIT_FASTPATH_NONE; 4170 4176 } ··· 5024 5012 .vcpu_unblocking = avic_vcpu_unblocking, 5025 5013 5026 5014 .update_exception_bitmap = svm_update_exception_bitmap, 5027 - .get_msr_feature = svm_get_msr_feature, 5015 + .get_feature_msr = svm_get_feature_msr, 5028 5016 .get_msr = svm_get_msr, 5029 5017 .set_msr = svm_set_msr, 5030 5018 .get_segment_base = svm_get_segment_base, ··· 5075 5063 .enable_nmi_window = svm_enable_nmi_window, 5076 5064 .enable_irq_window = svm_enable_irq_window, 5077 5065 .update_cr8_intercept = svm_update_cr8_intercept, 5066 + 5067 + .x2apic_icr_is_split = true, 5078 5068 .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, 5079 5069 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, 5080 5070 .apicv_post_state_restore = avic_apicv_post_state_restore,
+3 -1
arch/x86/kvm/vmx/main.c
··· 43 43 .vcpu_put = vmx_vcpu_put, 44 44 45 45 .update_exception_bitmap = vmx_update_exception_bitmap, 46 - .get_msr_feature = vmx_get_msr_feature, 46 + .get_feature_msr = vmx_get_feature_msr, 47 47 .get_msr = vmx_get_msr, 48 48 .set_msr = vmx_set_msr, 49 49 .get_segment_base = vmx_get_segment_base, ··· 91 91 .enable_nmi_window = vmx_enable_nmi_window, 92 92 .enable_irq_window = vmx_enable_irq_window, 93 93 .update_cr8_intercept = vmx_update_cr8_intercept, 94 + 95 + .x2apic_icr_is_split = false, 94 96 .set_virtual_apic_mode = vmx_set_virtual_apic_mode, 95 97 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 96 98 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+6 -4
arch/x86/kvm/vmx/vmx.c
··· 1998 1998 return !(msr->data & ~valid_bits); 1999 1999 } 2000 2000 2001 - int vmx_get_msr_feature(struct kvm_msr_entry *msr) 2001 + int vmx_get_feature_msr(u32 msr, u64 *data) 2002 2002 { 2003 - switch (msr->index) { 2003 + switch (msr) { 2004 2004 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2005 2005 if (!nested) 2006 2006 return 1; 2007 - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); 2007 + return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 2008 2008 default: 2009 - return KVM_MSR_RET_INVALID; 2009 + return KVM_MSR_RET_UNSUPPORTED; 2010 2010 } 2011 2011 } 2012 2012 ··· 7265 7265 return handle_fastpath_set_msr_irqoff(vcpu); 7266 7266 case EXIT_REASON_PREEMPTION_TIMER: 7267 7267 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7268 + case EXIT_REASON_HLT: 7269 + return handle_fastpath_hlt(vcpu); 7268 7270 default: 7269 7271 return EXIT_FASTPATH_NONE; 7270 7272 }
-4
arch/x86/kvm/vmx/vmx.h
··· 17 17 #include "run_flags.h" 18 18 #include "../mmu.h" 19 19 20 - #define MSR_TYPE_R 1 21 - #define MSR_TYPE_W 2 22 - #define MSR_TYPE_RW 3 23 - 24 20 #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) 25 21 26 22 #ifdef CONFIG_X86_64
+1 -1
arch/x86/kvm/vmx/x86_ops.h
··· 57 57 void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); 58 58 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); 59 59 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); 60 - int vmx_get_msr_feature(struct kvm_msr_entry *msr); 60 + int vmx_get_feature_msr(u32 msr, u64 *data); 61 61 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); 62 62 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); 63 63 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+425 -412
arch/x86/kvm/x86.c
··· 305 305 static struct kmem_cache *x86_emulator_cache; 306 306 307 307 /* 308 - * When called, it means the previous get/set msr reached an invalid msr. 309 - * Return true if we want to ignore/silent this failed msr access. 308 + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track 309 + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, 310 + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that 311 + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds 312 + * MSRs that KVM emulates without strictly requiring host support. 313 + * msr_based_features holds MSRs that enumerate features, i.e. are effectively 314 + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with 315 + * msrs_to_save and emulated_msrs. 310 316 */ 311 - static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) 312 - { 313 - const char *op = write ? "wrmsr" : "rdmsr"; 314 317 315 - if (ignore_msrs) { 316 - if (report_ignored_msrs) 317 - kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", 318 - op, msr, data); 319 - /* Mask the error */ 318 + static const u32 msrs_to_save_base[] = { 319 + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 320 + MSR_STAR, 321 + #ifdef CONFIG_X86_64 322 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 323 + #endif 324 + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 325 + MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, 326 + MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, 327 + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, 328 + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, 329 + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, 330 + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, 331 + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, 332 + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 333 + MSR_IA32_UMWAIT_CONTROL, 334 + 335 + MSR_IA32_XFD, MSR_IA32_XFD_ERR, 336 + }; 337 + 338 + static const u32 msrs_to_save_pmu[] = 
{ 339 + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 340 + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, 341 + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 342 + MSR_CORE_PERF_GLOBAL_CTRL, 343 + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, 344 + 345 + /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ 346 + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, 347 + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, 348 + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, 349 + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, 350 + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 351 + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 352 + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, 353 + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, 354 + 355 + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, 356 + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, 357 + 358 + /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. 
*/ 359 + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, 360 + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, 361 + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, 362 + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, 363 + 364 + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 365 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 366 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 367 + }; 368 + 369 + static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + 370 + ARRAY_SIZE(msrs_to_save_pmu)]; 371 + static unsigned num_msrs_to_save; 372 + 373 + static const u32 emulated_msrs_all[] = { 374 + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 375 + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 376 + 377 + #ifdef CONFIG_KVM_HYPERV 378 + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 379 + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 380 + HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, 381 + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 382 + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 383 + HV_X64_MSR_RESET, 384 + HV_X64_MSR_VP_INDEX, 385 + HV_X64_MSR_VP_RUNTIME, 386 + HV_X64_MSR_SCONTROL, 387 + HV_X64_MSR_STIMER0_CONFIG, 388 + HV_X64_MSR_VP_ASSIST_PAGE, 389 + HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, 390 + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, 391 + HV_X64_MSR_SYNDBG_OPTIONS, 392 + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, 393 + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, 394 + HV_X64_MSR_SYNDBG_PENDING_BUFFER, 395 + #endif 396 + 397 + MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 398 + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, 399 + 400 + MSR_IA32_TSC_ADJUST, 401 + MSR_IA32_TSC_DEADLINE, 402 + MSR_IA32_ARCH_CAPABILITIES, 403 + MSR_IA32_PERF_CAPABILITIES, 404 + MSR_IA32_MISC_ENABLE, 405 + MSR_IA32_MCG_STATUS, 406 + MSR_IA32_MCG_CTL, 407 + MSR_IA32_MCG_EXT_CTL, 408 + MSR_IA32_SMBASE, 409 + MSR_SMI_COUNT, 410 + 
MSR_PLATFORM_INFO, 411 + MSR_MISC_FEATURES_ENABLES, 412 + MSR_AMD64_VIRT_SPEC_CTRL, 413 + MSR_AMD64_TSC_RATIO, 414 + MSR_IA32_POWER_CTL, 415 + MSR_IA32_UCODE_REV, 416 + 417 + /* 418 + * KVM always supports the "true" VMX control MSRs, even if the host 419 + * does not. The VMX MSRs as a whole are considered "emulated" as KVM 420 + * doesn't strictly require them to exist in the host (ignoring that 421 + * KVM would refuse to load in the first place if the core set of MSRs 422 + * aren't supported). 423 + */ 424 + MSR_IA32_VMX_BASIC, 425 + MSR_IA32_VMX_TRUE_PINBASED_CTLS, 426 + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 427 + MSR_IA32_VMX_TRUE_EXIT_CTLS, 428 + MSR_IA32_VMX_TRUE_ENTRY_CTLS, 429 + MSR_IA32_VMX_MISC, 430 + MSR_IA32_VMX_CR0_FIXED0, 431 + MSR_IA32_VMX_CR4_FIXED0, 432 + MSR_IA32_VMX_VMCS_ENUM, 433 + MSR_IA32_VMX_PROCBASED_CTLS2, 434 + MSR_IA32_VMX_EPT_VPID_CAP, 435 + MSR_IA32_VMX_VMFUNC, 436 + 437 + MSR_K7_HWCR, 438 + MSR_KVM_POLL_CONTROL, 439 + }; 440 + 441 + static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; 442 + static unsigned num_emulated_msrs; 443 + 444 + /* 445 + * List of MSRs that control the existence of MSR-based features, i.e. MSRs 446 + * that are effectively CPUID leafs. VMX MSRs are also included in the set of 447 + * feature MSRs, but are handled separately to allow expedited lookups. 448 + */ 449 + static const u32 msr_based_features_all_except_vmx[] = { 450 + MSR_AMD64_DE_CFG, 451 + MSR_IA32_UCODE_REV, 452 + MSR_IA32_ARCH_CAPABILITIES, 453 + MSR_IA32_PERF_CAPABILITIES, 454 + }; 455 + 456 + static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + 457 + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; 458 + static unsigned int num_msr_based_features; 459 + 460 + /* 461 + * All feature MSRs except uCode revID, which tracks the currently loaded uCode 462 + * patch, are immutable once the vCPU model is defined. 
463 + */ 464 + static bool kvm_is_immutable_feature_msr(u32 msr) 465 + { 466 + int i; 467 + 468 + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) 320 469 return true; 321 - } else { 322 - kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", 323 - op, msr, data); 324 - return false; 470 + 471 + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { 472 + if (msr == msr_based_features_all_except_vmx[i]) 473 + return msr != MSR_IA32_UCODE_REV; 325 474 } 475 + 476 + return false; 477 + } 478 + 479 + static bool kvm_is_advertised_msr(u32 msr_index) 480 + { 481 + unsigned int i; 482 + 483 + for (i = 0; i < num_msrs_to_save; i++) { 484 + if (msrs_to_save[i] == msr_index) 485 + return true; 486 + } 487 + 488 + for (i = 0; i < num_emulated_msrs; i++) { 489 + if (emulated_msrs[i] == msr_index) 490 + return true; 491 + } 492 + 493 + return false; 494 + } 495 + 496 + typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, 497 + bool host_initiated); 498 + 499 + static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, 500 + u64 *data, bool host_initiated, 501 + enum kvm_msr_access rw, 502 + msr_access_t msr_access_fn) 503 + { 504 + const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr"; 505 + int ret; 506 + 507 + BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W); 508 + 509 + /* 510 + * Zero the data on read failures to avoid leaking stack data to the 511 + * guest and/or userspace, e.g. if the failure is ignored below. 512 + */ 513 + ret = msr_access_fn(vcpu, msr, data, host_initiated); 514 + if (ret && rw == MSR_TYPE_R) 515 + *data = 0; 516 + 517 + if (ret != KVM_MSR_RET_UNSUPPORTED) 518 + return ret; 519 + 520 + /* 521 + * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM 522 + * advertises to userspace, even if an MSR isn't fully supported. 
523 + * Simply check that @data is '0', which covers both the write '0' case 524 + * and all reads (in which case @data is zeroed on failure; see above). 525 + */ 526 + if (host_initiated && !*data && kvm_is_advertised_msr(msr)) 527 + return 0; 528 + 529 + if (!ignore_msrs) { 530 + kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", 531 + op, msr, *data); 532 + return ret; 533 + } 534 + 535 + if (report_ignored_msrs) 536 + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data); 537 + 538 + return 0; 326 539 } 327 540 328 541 static struct kmem_cache *kvm_alloc_emulator_cache(void) ··· 626 413 627 414 static void kvm_user_return_msr_cpu_online(void) 628 415 { 629 - unsigned int cpu = smp_processor_id(); 630 - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); 416 + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); 631 417 u64 value; 632 418 int i; 633 419 ··· 831 619 ex->error_code = error_code; 832 620 ex->has_payload = has_payload; 833 621 ex->payload = payload; 834 - } 835 - 836 - /* Forcibly leave the nested mode in cases like a vCPU reset */ 837 - static void kvm_leave_nested(struct kvm_vcpu *vcpu) 838 - { 839 - kvm_x86_ops.nested_ops->leave_nested(vcpu); 840 622 } 841 623 842 624 static void kvm_multiple_exception(struct kvm_vcpu *vcpu, ··· 1618 1412 EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); 1619 1413 1620 1414 /* 1621 - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track 1622 - * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, 1623 - * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that 1624 - * require host support, i.e. should be probed via RDMSR. emulated_msrs holds 1625 - * MSRs that KVM emulates without strictly requiring host support. 1626 - * msr_based_features holds MSRs that enumerate features, i.e. are effectively 1627 - * CPUID leafs. Note, msr_based_features isn't mutually exclusive with 1628 - * msrs_to_save and emulated_msrs. 
1629 - */ 1630 - 1631 - static const u32 msrs_to_save_base[] = { 1632 - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 1633 - MSR_STAR, 1634 - #ifdef CONFIG_X86_64 1635 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 1636 - #endif 1637 - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, 1638 - MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, 1639 - MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, 1640 - MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, 1641 - MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, 1642 - MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, 1643 - MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, 1644 - MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, 1645 - MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 1646 - MSR_IA32_UMWAIT_CONTROL, 1647 - 1648 - MSR_IA32_XFD, MSR_IA32_XFD_ERR, 1649 - }; 1650 - 1651 - static const u32 msrs_to_save_pmu[] = { 1652 - MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 1653 - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, 1654 - MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, 1655 - MSR_CORE_PERF_GLOBAL_CTRL, 1656 - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, 1657 - 1658 - /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. 
*/ 1659 - MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, 1660 - MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, 1661 - MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, 1662 - MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, 1663 - MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, 1664 - MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, 1665 - MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, 1666 - MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, 1667 - 1668 - MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, 1669 - MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, 1670 - 1671 - /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ 1672 - MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, 1673 - MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, 1674 - MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, 1675 - MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, 1676 - 1677 - MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 1678 - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 1679 - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 1680 - }; 1681 - 1682 - static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + 1683 - ARRAY_SIZE(msrs_to_save_pmu)]; 1684 - static unsigned num_msrs_to_save; 1685 - 1686 - static const u32 emulated_msrs_all[] = { 1687 - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 1688 - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 1689 - 1690 - #ifdef CONFIG_KVM_HYPERV 1691 - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 1692 - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, 1693 - HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, 1694 - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, 1695 - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, 1696 - HV_X64_MSR_RESET, 1697 - HV_X64_MSR_VP_INDEX, 1698 - HV_X64_MSR_VP_RUNTIME, 1699 - HV_X64_MSR_SCONTROL, 1700 - 
HV_X64_MSR_STIMER0_CONFIG, 1701 - HV_X64_MSR_VP_ASSIST_PAGE, 1702 - HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, 1703 - HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, 1704 - HV_X64_MSR_SYNDBG_OPTIONS, 1705 - HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, 1706 - HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, 1707 - HV_X64_MSR_SYNDBG_PENDING_BUFFER, 1708 - #endif 1709 - 1710 - MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, 1711 - MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, 1712 - 1713 - MSR_IA32_TSC_ADJUST, 1714 - MSR_IA32_TSC_DEADLINE, 1715 - MSR_IA32_ARCH_CAPABILITIES, 1716 - MSR_IA32_PERF_CAPABILITIES, 1717 - MSR_IA32_MISC_ENABLE, 1718 - MSR_IA32_MCG_STATUS, 1719 - MSR_IA32_MCG_CTL, 1720 - MSR_IA32_MCG_EXT_CTL, 1721 - MSR_IA32_SMBASE, 1722 - MSR_SMI_COUNT, 1723 - MSR_PLATFORM_INFO, 1724 - MSR_MISC_FEATURES_ENABLES, 1725 - MSR_AMD64_VIRT_SPEC_CTRL, 1726 - MSR_AMD64_TSC_RATIO, 1727 - MSR_IA32_POWER_CTL, 1728 - MSR_IA32_UCODE_REV, 1729 - 1730 - /* 1731 - * KVM always supports the "true" VMX control MSRs, even if the host 1732 - * does not. The VMX MSRs as a whole are considered "emulated" as KVM 1733 - * doesn't strictly require them to exist in the host (ignoring that 1734 - * KVM would refuse to load in the first place if the core set of MSRs 1735 - * aren't supported). 
1736 - */ 1737 - MSR_IA32_VMX_BASIC, 1738 - MSR_IA32_VMX_TRUE_PINBASED_CTLS, 1739 - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 1740 - MSR_IA32_VMX_TRUE_EXIT_CTLS, 1741 - MSR_IA32_VMX_TRUE_ENTRY_CTLS, 1742 - MSR_IA32_VMX_MISC, 1743 - MSR_IA32_VMX_CR0_FIXED0, 1744 - MSR_IA32_VMX_CR4_FIXED0, 1745 - MSR_IA32_VMX_VMCS_ENUM, 1746 - MSR_IA32_VMX_PROCBASED_CTLS2, 1747 - MSR_IA32_VMX_EPT_VPID_CAP, 1748 - MSR_IA32_VMX_VMFUNC, 1749 - 1750 - MSR_K7_HWCR, 1751 - MSR_KVM_POLL_CONTROL, 1752 - }; 1753 - 1754 - static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; 1755 - static unsigned num_emulated_msrs; 1756 - 1757 - /* 1758 - * List of MSRs that control the existence of MSR-based features, i.e. MSRs 1759 - * that are effectively CPUID leafs. VMX MSRs are also included in the set of 1760 - * feature MSRs, but are handled separately to allow expedited lookups. 1761 - */ 1762 - static const u32 msr_based_features_all_except_vmx[] = { 1763 - MSR_AMD64_DE_CFG, 1764 - MSR_IA32_UCODE_REV, 1765 - MSR_IA32_ARCH_CAPABILITIES, 1766 - MSR_IA32_PERF_CAPABILITIES, 1767 - }; 1768 - 1769 - static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + 1770 - (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; 1771 - static unsigned int num_msr_based_features; 1772 - 1773 - /* 1774 - * All feature MSRs except uCode revID, which tracks the currently loaded uCode 1775 - * patch, are immutable once the vCPU model is defined. 1776 - */ 1777 - static bool kvm_is_immutable_feature_msr(u32 msr) 1778 - { 1779 - int i; 1780 - 1781 - if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) 1782 - return true; 1783 - 1784 - for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { 1785 - if (msr == msr_based_features_all_except_vmx[i]) 1786 - return msr != MSR_IA32_UCODE_REV; 1787 - } 1788 - 1789 - return false; 1790 - } 1791 - 1792 - /* 1793 1415 * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM 1794 1416 * does not yet virtualize. 
These include: 1795 1417 * 10 - MISC_PACKAGE_CTRLS ··· 1694 1660 return data; 1695 1661 } 1696 1662 1697 - static int kvm_get_msr_feature(struct kvm_msr_entry *msr) 1663 + static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1664 + bool host_initiated) 1698 1665 { 1699 - switch (msr->index) { 1666 + WARN_ON_ONCE(!host_initiated); 1667 + 1668 + switch (index) { 1700 1669 case MSR_IA32_ARCH_CAPABILITIES: 1701 - msr->data = kvm_get_arch_capabilities(); 1670 + *data = kvm_get_arch_capabilities(); 1702 1671 break; 1703 1672 case MSR_IA32_PERF_CAPABILITIES: 1704 - msr->data = kvm_caps.supported_perf_cap; 1673 + *data = kvm_caps.supported_perf_cap; 1705 1674 break; 1706 1675 case MSR_IA32_UCODE_REV: 1707 - rdmsrl_safe(msr->index, &msr->data); 1676 + rdmsrl_safe(index, data); 1708 1677 break; 1709 1678 default: 1710 - return kvm_x86_call(get_msr_feature)(msr); 1679 + return kvm_x86_call(get_feature_msr)(index, data); 1711 1680 } 1712 1681 return 0; 1713 1682 } 1714 1683 1715 - static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1684 + static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 1716 1685 { 1717 - struct kvm_msr_entry msr; 1718 - int r; 1719 - 1720 - /* Unconditionally clear the output for simplicity */ 1721 - msr.data = 0; 1722 - msr.index = index; 1723 - r = kvm_get_msr_feature(&msr); 1724 - 1725 - if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) 1726 - r = 0; 1727 - 1728 - *data = msr.data; 1729 - 1730 - return r; 1686 + return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, 1687 + kvm_get_feature_msr); 1731 1688 } 1732 1689 1733 1690 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) ··· 1905 1880 return kvm_x86_call(set_msr)(vcpu, &msr); 1906 1881 } 1907 1882 1883 + static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1884 + bool host_initiated) 1885 + { 1886 + return __kvm_set_msr(vcpu, index, *data, host_initiated); 1887 + } 
1888 + 1908 1889 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, 1909 1890 u32 index, u64 data, bool host_initiated) 1910 1891 { 1911 - int ret = __kvm_set_msr(vcpu, index, data, host_initiated); 1912 - 1913 - if (ret == KVM_MSR_RET_INVALID) 1914 - if (kvm_msr_ignored_check(index, data, true)) 1915 - ret = 0; 1916 - 1917 - return ret; 1892 + return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, 1893 + _kvm_set_msr); 1918 1894 } 1919 1895 1920 1896 /* ··· 1954 1928 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, 1955 1929 u32 index, u64 *data, bool host_initiated) 1956 1930 { 1957 - int ret = __kvm_get_msr(vcpu, index, data, host_initiated); 1958 - 1959 - if (ret == KVM_MSR_RET_INVALID) { 1960 - /* Unconditionally clear *data for simplicity */ 1961 - *data = 0; 1962 - if (kvm_msr_ignored_check(index, 0, false)) 1963 - ret = 0; 1964 - } 1965 - 1966 - return ret; 1931 + return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R, 1932 + __kvm_get_msr); 1967 1933 } 1968 1934 1969 1935 static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) ··· 2017 1999 static u64 kvm_msr_reason(int r) 2018 2000 { 2019 2001 switch (r) { 2020 - case KVM_MSR_RET_INVALID: 2002 + case KVM_MSR_RET_UNSUPPORTED: 2021 2003 return KVM_MSR_EXIT_REASON_UNKNOWN; 2022 2004 case KVM_MSR_RET_FILTERED: 2023 2005 return KVM_MSR_EXIT_REASON_FILTER; ··· 2180 2162 { 2181 2163 u32 msr = kvm_rcx_read(vcpu); 2182 2164 u64 data; 2183 - fastpath_t ret = EXIT_FASTPATH_NONE; 2165 + fastpath_t ret; 2166 + bool handled; 2184 2167 2185 2168 kvm_vcpu_srcu_read_lock(vcpu); 2186 2169 2187 2170 switch (msr) { 2188 2171 case APIC_BASE_MSR + (APIC_ICR >> 4): 2189 2172 data = kvm_read_edx_eax(vcpu); 2190 - if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { 2191 - kvm_skip_emulated_instruction(vcpu); 2192 - ret = EXIT_FASTPATH_EXIT_HANDLED; 2193 - } 2173 + handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); 2194 2174 break; 2195 
2175 case MSR_IA32_TSC_DEADLINE: 2196 2176 data = kvm_read_edx_eax(vcpu); 2197 - if (!handle_fastpath_set_tscdeadline(vcpu, data)) { 2198 - kvm_skip_emulated_instruction(vcpu); 2199 - ret = EXIT_FASTPATH_REENTER_GUEST; 2200 - } 2177 + handled = !handle_fastpath_set_tscdeadline(vcpu, data); 2201 2178 break; 2202 2179 default: 2180 + handled = false; 2203 2181 break; 2204 2182 } 2205 2183 2206 - if (ret != EXIT_FASTPATH_NONE) 2184 + if (handled) { 2185 + if (!kvm_skip_emulated_instruction(vcpu)) 2186 + ret = EXIT_FASTPATH_EXIT_USERSPACE; 2187 + else 2188 + ret = EXIT_FASTPATH_REENTER_GUEST; 2207 2189 trace_kvm_msr_write(msr, data); 2190 + } else { 2191 + ret = EXIT_FASTPATH_NONE; 2192 + } 2208 2193 2209 2194 kvm_vcpu_srcu_read_unlock(vcpu); 2210 2195 ··· 3767 3746 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); 3768 3747 } 3769 3748 3770 - static bool kvm_is_msr_to_save(u32 msr_index) 3771 - { 3772 - unsigned int i; 3773 - 3774 - for (i = 0; i < num_msrs_to_save; i++) { 3775 - if (msrs_to_save[i] == msr_index) 3776 - return true; 3777 - } 3778 - 3779 - return false; 3780 - } 3781 - 3782 3749 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3783 3750 { 3784 3751 u32 msr = msr_info->index; ··· 4148 4139 if (kvm_pmu_is_valid_msr(vcpu, msr)) 4149 4140 return kvm_pmu_set_msr(vcpu, msr_info); 4150 4141 4151 - /* 4152 - * Userspace is allowed to write '0' to MSRs that KVM reports 4153 - * as to-be-saved, even if an MSRs isn't fully supported. 4154 - */ 4155 - if (msr_info->host_initiated && !data && 4156 - kvm_is_msr_to_save(msr)) 4157 - break; 4158 - 4159 - return KVM_MSR_RET_INVALID; 4142 + return KVM_MSR_RET_UNSUPPORTED; 4160 4143 } 4161 4144 return 0; 4162 4145 } ··· 4499 4498 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 4500 4499 return kvm_pmu_get_msr(vcpu, msr_info); 4501 4500 4502 - /* 4503 - * Userspace is allowed to read MSRs that KVM reports as 4504 - * to-be-saved, even if an MSR isn't fully supported. 
4505 - */ 4506 - if (msr_info->host_initiated && 4507 - kvm_is_msr_to_save(msr_info->index)) { 4508 - msr_info->data = 0; 4509 - break; 4510 - } 4511 - 4512 - return KVM_MSR_RET_INVALID; 4501 + return KVM_MSR_RET_UNSUPPORTED; 4513 4502 } 4514 4503 return 0; 4515 4504 } ··· 4937 4946 break; 4938 4947 } 4939 4948 case KVM_GET_MSRS: 4940 - r = msr_io(NULL, argp, do_get_msr_feature, 1); 4949 + r = msr_io(NULL, argp, do_get_feature_msr, 1); 4941 4950 break; 4942 4951 #ifdef CONFIG_KVM_HYPERV 4943 4952 case KVM_GET_SUPPORTED_HV_CPUID: ··· 7374 7383 7375 7384 static void kvm_probe_feature_msr(u32 msr_index) 7376 7385 { 7377 - struct kvm_msr_entry msr = { 7378 - .index = msr_index, 7379 - }; 7386 + u64 data; 7380 7387 7381 - if (kvm_get_msr_feature(&msr)) 7388 + if (kvm_get_feature_msr(NULL, msr_index, &data, true)) 7382 7389 return; 7383 7390 7384 7391 msr_based_features[num_msr_based_features++] = msr_index; ··· 9915 9926 } 9916 9927 EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); 9917 9928 9918 - static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) 9919 - { 9920 - /* 9921 - * The vCPU has halted, e.g. executed HLT. Update the run state if the 9922 - * local APIC is in-kernel, the run loop will detect the non-runnable 9923 - * state and halt the vCPU. Exit to userspace if the local APIC is 9924 - * managed by userspace, in which case userspace is responsible for 9925 - * handling wake events. 
9926 - */ 9927 - ++vcpu->stat.halt_exits; 9928 - if (lapic_in_kernel(vcpu)) { 9929 - vcpu->arch.mp_state = state; 9930 - return 1; 9931 - } else { 9932 - vcpu->run->exit_reason = reason; 9933 - return 0; 9934 - } 9935 - } 9936 - 9937 - int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) 9938 - { 9939 - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); 9940 - } 9941 - EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); 9942 - 9943 - int kvm_emulate_halt(struct kvm_vcpu *vcpu) 9944 - { 9945 - int ret = kvm_skip_emulated_instruction(vcpu); 9946 - /* 9947 - * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered 9948 - * KVM_EXIT_DEBUG here. 9949 - */ 9950 - return kvm_emulate_halt_noskip(vcpu) && ret; 9951 - } 9952 - EXPORT_SYMBOL_GPL(kvm_emulate_halt); 9953 - 9954 - int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) 9955 - { 9956 - int ret = kvm_skip_emulated_instruction(vcpu); 9957 - 9958 - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, 9959 - KVM_EXIT_AP_RESET_HOLD) && ret; 9960 - } 9961 - EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); 9962 - 9963 9929 #ifdef CONFIG_X86_64 9964 9930 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, 9965 9931 unsigned long clock_type) ··· 11151 11207 if (vcpu->arch.apic_attention) 11152 11208 kvm_lapic_sync_from_vapic(vcpu); 11153 11209 11210 + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) 11211 + return 0; 11212 + 11154 11213 r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); 11155 11214 return r; 11156 11215 ··· 11165 11218 kvm_lapic_sync_from_vapic(vcpu); 11166 11219 out: 11167 11220 return r; 11221 + } 11222 + 11223 + static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) 11224 + { 11225 + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 11226 + !vcpu->arch.apf.halted); 11227 + } 11228 + 11229 + static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) 11230 + { 11231 + if (!list_empty_careful(&vcpu->async_pf.done)) 11232 + return true; 11233 + 11234 + if 
(kvm_apic_has_pending_init_or_sipi(vcpu) && 11235 + kvm_apic_init_sipi_allowed(vcpu)) 11236 + return true; 11237 + 11238 + if (vcpu->arch.pv.pv_unhalted) 11239 + return true; 11240 + 11241 + if (kvm_is_exception_pending(vcpu)) 11242 + return true; 11243 + 11244 + if (kvm_test_request(KVM_REQ_NMI, vcpu) || 11245 + (vcpu->arch.nmi_pending && 11246 + kvm_x86_call(nmi_allowed)(vcpu, false))) 11247 + return true; 11248 + 11249 + #ifdef CONFIG_KVM_SMM 11250 + if (kvm_test_request(KVM_REQ_SMI, vcpu) || 11251 + (vcpu->arch.smi_pending && 11252 + kvm_x86_call(smi_allowed)(vcpu, false))) 11253 + return true; 11254 + #endif 11255 + 11256 + if (kvm_test_request(KVM_REQ_PMI, vcpu)) 11257 + return true; 11258 + 11259 + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) 11260 + return true; 11261 + 11262 + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) 11263 + return true; 11264 + 11265 + if (kvm_hv_has_stimer_pending(vcpu)) 11266 + return true; 11267 + 11268 + if (is_guest_mode(vcpu) && 11269 + kvm_x86_ops.nested_ops->has_events && 11270 + kvm_x86_ops.nested_ops->has_events(vcpu, false)) 11271 + return true; 11272 + 11273 + if (kvm_xen_has_pending_events(vcpu)) 11274 + return true; 11275 + 11276 + return false; 11277 + } 11278 + 11279 + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 11280 + { 11281 + return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 11168 11282 } 11169 11283 11170 11284 /* Called within kvm->srcu read side. */ ··· 11299 11291 return 1; 11300 11292 } 11301 11293 11302 - static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) 11303 - { 11304 - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 11305 - !vcpu->arch.apf.halted); 11306 - } 11307 - 11308 11294 /* Called within kvm->srcu read side. 
*/ 11309 11295 static int vcpu_run(struct kvm_vcpu *vcpu) 11310 11296 { ··· 11348 11346 } 11349 11347 11350 11348 return r; 11349 + } 11350 + 11351 + static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) 11352 + { 11353 + /* 11354 + * The vCPU has halted, e.g. executed HLT. Update the run state if the 11355 + * local APIC is in-kernel, the run loop will detect the non-runnable 11356 + * state and halt the vCPU. Exit to userspace if the local APIC is 11357 + * managed by userspace, in which case userspace is responsible for 11358 + * handling wake events. 11359 + */ 11360 + ++vcpu->stat.halt_exits; 11361 + if (lapic_in_kernel(vcpu)) { 11362 + if (kvm_vcpu_has_events(vcpu)) 11363 + vcpu->arch.pv.pv_unhalted = false; 11364 + else 11365 + vcpu->arch.mp_state = state; 11366 + return 1; 11367 + } else { 11368 + vcpu->run->exit_reason = reason; 11369 + return 0; 11370 + } 11371 + } 11372 + 11373 + int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) 11374 + { 11375 + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); 11376 + } 11377 + EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); 11378 + 11379 + int kvm_emulate_halt(struct kvm_vcpu *vcpu) 11380 + { 11381 + int ret = kvm_skip_emulated_instruction(vcpu); 11382 + /* 11383 + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered 11384 + * KVM_EXIT_DEBUG here. 
11385 + */ 11386 + return kvm_emulate_halt_noskip(vcpu) && ret; 11387 + } 11388 + EXPORT_SYMBOL_GPL(kvm_emulate_halt); 11389 + 11390 + fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) 11391 + { 11392 + int ret; 11393 + 11394 + kvm_vcpu_srcu_read_lock(vcpu); 11395 + ret = kvm_emulate_halt(vcpu); 11396 + kvm_vcpu_srcu_read_unlock(vcpu); 11397 + 11398 + if (!ret) 11399 + return EXIT_FASTPATH_EXIT_USERSPACE; 11400 + 11401 + if (kvm_vcpu_running(vcpu)) 11402 + return EXIT_FASTPATH_REENTER_GUEST; 11403 + 11404 + return EXIT_FASTPATH_EXIT_HANDLED; 11405 + } 11406 + EXPORT_SYMBOL_GPL(handle_fastpath_hlt); 11407 + 11408 + int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) 11409 + { 11410 + int ret = kvm_skip_emulated_instruction(vcpu); 11411 + 11412 + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, 11413 + KVM_EXIT_AP_RESET_HOLD) && ret; 11414 + } 11415 + EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); 11416 + 11417 + bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) 11418 + { 11419 + return kvm_vcpu_apicv_active(vcpu) && 11420 + kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); 11421 + } 11422 + 11423 + bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) 11424 + { 11425 + return vcpu->arch.preempted_in_kernel; 11426 + } 11427 + 11428 + bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 11429 + { 11430 + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) 11431 + return true; 11432 + 11433 + if (kvm_test_request(KVM_REQ_NMI, vcpu) || 11434 + #ifdef CONFIG_KVM_SMM 11435 + kvm_test_request(KVM_REQ_SMI, vcpu) || 11436 + #endif 11437 + kvm_test_request(KVM_REQ_EVENT, vcpu)) 11438 + return true; 11439 + 11440 + return kvm_arch_dy_has_pending_interrupt(vcpu); 11351 11441 } 11352 11442 11353 11443 static inline int complete_emulated_io(struct kvm_vcpu *vcpu) ··· 13264 13170 /* Free the arrays associated with the old memslot. 
*/ 13265 13171 if (change == KVM_MR_MOVE) 13266 13172 kvm_arch_free_memslot(kvm, old); 13267 - } 13268 - 13269 - static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) 13270 - { 13271 - if (!list_empty_careful(&vcpu->async_pf.done)) 13272 - return true; 13273 - 13274 - if (kvm_apic_has_pending_init_or_sipi(vcpu) && 13275 - kvm_apic_init_sipi_allowed(vcpu)) 13276 - return true; 13277 - 13278 - if (vcpu->arch.pv.pv_unhalted) 13279 - return true; 13280 - 13281 - if (kvm_is_exception_pending(vcpu)) 13282 - return true; 13283 - 13284 - if (kvm_test_request(KVM_REQ_NMI, vcpu) || 13285 - (vcpu->arch.nmi_pending && 13286 - kvm_x86_call(nmi_allowed)(vcpu, false))) 13287 - return true; 13288 - 13289 - #ifdef CONFIG_KVM_SMM 13290 - if (kvm_test_request(KVM_REQ_SMI, vcpu) || 13291 - (vcpu->arch.smi_pending && 13292 - kvm_x86_call(smi_allowed)(vcpu, false))) 13293 - return true; 13294 - #endif 13295 - 13296 - if (kvm_test_request(KVM_REQ_PMI, vcpu)) 13297 - return true; 13298 - 13299 - if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) 13300 - return true; 13301 - 13302 - if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) 13303 - return true; 13304 - 13305 - if (kvm_hv_has_stimer_pending(vcpu)) 13306 - return true; 13307 - 13308 - if (is_guest_mode(vcpu) && 13309 - kvm_x86_ops.nested_ops->has_events && 13310 - kvm_x86_ops.nested_ops->has_events(vcpu, false)) 13311 - return true; 13312 - 13313 - if (kvm_xen_has_pending_events(vcpu)) 13314 - return true; 13315 - 13316 - return false; 13317 - } 13318 - 13319 - int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 13320 - { 13321 - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 13322 - } 13323 - 13324 - bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) 13325 - { 13326 - return kvm_vcpu_apicv_active(vcpu) && 13327 - kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); 13328 - } 13329 - 13330 - bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) 13331 - { 13332 - 
return vcpu->arch.preempted_in_kernel; 13333 - } 13334 - 13335 - bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 13336 - { 13337 - if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) 13338 - return true; 13339 - 13340 - if (kvm_test_request(KVM_REQ_NMI, vcpu) || 13341 - #ifdef CONFIG_KVM_SMM 13342 - kvm_test_request(KVM_REQ_SMI, vcpu) || 13343 - #endif 13344 - kvm_test_request(KVM_REQ_EVENT, vcpu)) 13345 - return true; 13346 - 13347 - return kvm_arch_dy_has_pending_interrupt(vcpu); 13348 13173 } 13349 13174 13350 13175 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+24 -4
arch/x86/kvm/x86.h
··· 108 108 void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); 109 109 int kvm_check_nested_events(struct kvm_vcpu *vcpu); 110 110 111 + /* Forcibly leave the nested mode in cases like a vCPU reset */ 112 + static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) 113 + { 114 + kvm_x86_ops.nested_ops->leave_nested(vcpu); 115 + } 116 + 111 117 static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) 112 118 { 113 119 return vcpu->arch.last_vmentry_cpu != -1; ··· 340 334 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 341 335 int emulation_type, void *insn, int insn_len); 342 336 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); 337 + fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); 343 338 344 339 extern struct kvm_caps kvm_caps; 345 340 extern struct kvm_host_values kvm_host; ··· 511 504 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); 512 505 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); 513 506 507 + enum kvm_msr_access { 508 + MSR_TYPE_R = BIT(0), 509 + MSR_TYPE_W = BIT(1), 510 + MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W, 511 + }; 512 + 514 513 /* 515 514 * Internal error codes that are used to indicate that MSR emulation encountered 516 - * an error that should result in #GP in the guest, unless userspace 517 - * handles it. 515 + * an error that should result in #GP in the guest, unless userspace handles it. 516 + * Note, '1', '0', and negative numbers are off limits, as they are used by KVM 517 + * as part of KVM's lightly documented internal KVM_RUN return codes. 518 + * 519 + * UNSUPPORTED - The MSR isn't supported, either because it is completely 520 + * unknown to KVM, or because the MSR should not exist according 521 + * to the vCPU model. 522 + * 523 + * FILTERED - Access to the MSR is denied by a userspace MSR filter. 
518 524 */ 519 - #define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */ 520 - #define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */ 525 + #define KVM_MSR_RET_UNSUPPORTED 2 526 + #define KVM_MSR_RET_FILTERED 3 521 527 522 528 #define __cr4_reserved_bits(__cpu_has, __c) \ 523 529 ({ \
+17 -2
tools/testing/selftests/kvm/guest_print_test.c
··· 107 107 expected_assert_msg, &assert_msg[offset]); 108 108 } 109 109 110 + /* 111 + * Open code vcpu_run(), sans the UCALL_ABORT handling, so that intentional 112 + * guest asserts guest can be verified instead of being reported as failures. 113 + */ 114 + static void do_vcpu_run(struct kvm_vcpu *vcpu) 115 + { 116 + int r; 117 + 118 + do { 119 + r = __vcpu_run(vcpu); 120 + } while (r == -1 && errno == EINTR); 121 + 122 + TEST_ASSERT(!r, KVM_IOCTL_ERROR(KVM_RUN, r)); 123 + } 124 + 110 125 static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, 111 126 const char *expected_assert) 112 127 { ··· 129 114 struct ucall uc; 130 115 131 116 while (1) { 132 - vcpu_run(vcpu); 117 + do_vcpu_run(vcpu); 133 118 134 119 TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, 135 120 "Unexpected exit reason: %u (%s),", ··· 174 159 175 160 vm = vm_create_with_one_vcpu(&vcpu, guest_code_limits); 176 161 run = vcpu->run; 177 - vcpu_run(vcpu); 162 + do_vcpu_run(vcpu); 178 163 179 164 TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, 180 165 "Unexpected exit reason: %u (%s),",
+20 -1
tools/testing/selftests/kvm/include/x86_64/apic.h
··· 11 11 #include <stdint.h> 12 12 13 13 #include "processor.h" 14 + #include "ucall_common.h" 14 15 15 16 #define APIC_DEFAULT_GPA 0xfee00000ULL 16 17 ··· 94 93 return rdmsr(APIC_BASE_MSR + (reg >> 4)); 95 94 } 96 95 96 + static inline uint8_t x2apic_write_reg_safe(unsigned int reg, uint64_t value) 97 + { 98 + return wrmsr_safe(APIC_BASE_MSR + (reg >> 4), value); 99 + } 100 + 97 101 static inline void x2apic_write_reg(unsigned int reg, uint64_t value) 98 102 { 99 - wrmsr(APIC_BASE_MSR + (reg >> 4), value); 103 + uint8_t fault = x2apic_write_reg_safe(reg, value); 104 + 105 + __GUEST_ASSERT(!fault, "Unexpected fault 0x%x on WRMSR(%x) = %lx\n", 106 + fault, APIC_BASE_MSR + (reg >> 4), value); 100 107 } 108 + 109 + static inline void x2apic_write_reg_fault(unsigned int reg, uint64_t value) 110 + { 111 + uint8_t fault = x2apic_write_reg_safe(reg, value); 112 + 113 + __GUEST_ASSERT(fault == GP_VECTOR, 114 + "Wanted #GP on WRMSR(%x) = %lx, got 0x%x\n", 115 + APIC_BASE_MSR + (reg >> 4), value, fault); 116 + } 117 + 101 118 102 119 #endif /* SELFTEST_KVM_APIC_H */
+3 -5
tools/testing/selftests/kvm/lib/x86_64/processor.c
··· 566 566 if (kvm_fixup_exception(regs)) 567 567 return; 568 568 569 - ucall_assert(UCALL_UNHANDLED, 570 - "Unhandled exception in guest", __FILE__, __LINE__, 571 - "Unhandled exception '0x%lx' at guest RIP '0x%lx'", 572 - regs->vector, regs->rip); 569 + GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'", 570 + regs->vector, regs->rip); 573 571 } 574 572 575 573 static void vm_init_descriptor_tables(struct kvm_vm *vm) ··· 609 611 { 610 612 struct ucall uc; 611 613 612 - if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) 614 + if (get_ucall(vcpu, &uc) == UCALL_ABORT) 613 615 REPORT_GUEST_ASSERT(uc); 614 616 } 615 617
+37 -17
tools/testing/selftests/kvm/x86_64/xapic_state_test.c
··· 13 13 struct xapic_vcpu { 14 14 struct kvm_vcpu *vcpu; 15 15 bool is_x2apic; 16 + bool has_xavic_errata; 16 17 }; 17 18 18 19 static void xapic_guest_code(void) ··· 32 31 } 33 32 } 34 33 34 + #define X2APIC_RSVD_BITS_MASK (GENMASK_ULL(31, 20) | \ 35 + GENMASK_ULL(17, 16) | \ 36 + GENMASK_ULL(13, 13)) 37 + 35 38 static void x2apic_guest_code(void) 36 39 { 37 40 asm volatile("cli"); ··· 46 41 uint64_t val = x2apic_read_reg(APIC_IRR) | 47 42 x2apic_read_reg(APIC_IRR + 0x10) << 32; 48 43 49 - x2apic_write_reg(APIC_ICR, val); 44 + if (val & X2APIC_RSVD_BITS_MASK) { 45 + x2apic_write_reg_fault(APIC_ICR, val); 46 + } else { 47 + x2apic_write_reg(APIC_ICR, val); 48 + GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ICR), val); 49 + } 50 50 GUEST_SYNC(val); 51 51 } while (1); 52 52 } ··· 81 71 icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | 82 72 (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; 83 73 if (!x->is_x2apic) { 84 - val &= (-1u | (0xffull << (32 + 24))); 85 - TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); 86 - } else { 87 - TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); 74 + if (!x->has_xavic_errata) 75 + val &= (-1u | (0xffull << (32 + 24))); 76 + } else if (val & X2APIC_RSVD_BITS_MASK) { 77 + return; 88 78 } 89 - } 90 79 91 - #define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \ 92 - GENMASK_ULL(17,16) | \ 93 - GENMASK_ULL(13,13)) 80 + if (x->has_xavic_errata) 81 + TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); 82 + else 83 + TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); 84 + } 94 85 95 86 static void __test_icr(struct xapic_vcpu *x, uint64_t val) 96 87 { 97 - if (x->is_x2apic) { 98 - /* Hardware writing vICR register requires reserved bits 31:20, 99 - * 17:16 and 13 kept as zero to avoid #GP exception. Data value 100 - * written to vICR should mask out those bits above. 
101 - */ 102 - val &= ~X2APIC_RSVED_BITS_MASK; 103 - } 104 - ____test_icr(x, val | APIC_ICR_BUSY); 88 + /* 89 + * The BUSY bit is reserved on both AMD and Intel, but only AMD treats 90 + * it is as _must_ be zero. Intel simply ignores the bit. Don't test 91 + * the BUSY bit for x2APIC, as there is no single correct behavior. 92 + */ 93 + if (!x->is_x2apic) 94 + ____test_icr(x, val | APIC_ICR_BUSY); 95 + 105 96 ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); 106 97 } 107 98 ··· 241 230 */ 242 231 vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); 243 232 x.is_x2apic = false; 233 + 234 + /* 235 + * AMD's AVIC implementation is buggy (fails to clear the ICR BUSY bit), 236 + * and also diverges from KVM with respect to ICR2[23:0] (KVM and Intel 237 + * drops writes, AMD does not). Account for the errata when checking 238 + * that KVM reads back what was written. 239 + */ 240 + x.has_xavic_errata = host_cpu_is_amd && 241 + get_kvm_amd_param_bool("avic"); 244 242 245 243 vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); 246 244