Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'kvm-x86-misc-6.18' of https://github.com/kvm-x86/linux into HEAD

KVM x86 changes for 6.18

- Don't (re)check L1 intercepts when completing userspace I/O to fix a flaw
where a misbehaving userspace (a.k.a. syzkaller) could swizzle L1's
intercepts and trigger a variety of WARNs in KVM.

- Emulate PERF_CNTR_GLOBAL_STATUS_SET for PerfMonV2 guests, as the MSR is
supposed to exist for v2 PMUs.

- Allow Centaur CPUID leaves (base 0xC000_0000) for Zhaoxin CPUs.

- Clean up KVM's vector hashing code for delivering lowest priority IRQs.

- Clean up the fastpath handler code to only handle IPIs and WRMSRs that are
actually "fast", as opposed to handling those that KVM _hopes_ are fast, and
in the process of doing so add fastpath support for TSC_DEADLINE writes on
AMD CPUs.

- Clean up a pile of PMU code in anticipation of adding support for mediated
vPMUs.

- Add support for the immediate forms of RDMSR and WRMSRNS, sans full
emulator support (KVM should never need to emulate the MSRs outside of
forced emulation and other contrived testing scenarios).

- Clean up the MSR APIs in preparation for CET and FRED virtualization, as
well as mediated vPMU support.

- Reject a fully in-kernel IRQCHIP if EOIs are protected, i.e. for TDX VMs,
as KVM can't faithfully emulate an I/O APIC for such guests.

- Rework KVM_REQ_MSR_FILTER_CHANGED into a generic RECALC_INTERCEPTS in preparation
for mediated vPMU support, as KVM will need to recalculate MSR intercepts in
response to PMU refreshes for guests with mediated vPMUs.

- Misc cleanups and minor fixes.

+715 -520
+6
Documentation/virt/kvm/api.rst
··· 3075 3075 Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. 3076 3076 See KVM_GET_PIT2 for details on struct kvm_pit_state2. 3077 3077 3078 + .. Tip:: 3079 + ``KVM_SET_PIT2`` strictly adheres to the spec of Intel 8254 PIT. For example, 3080 + a ``count`` value of 0 in ``struct kvm_pit_channel_state`` is interpreted as 3081 + 65536, which is the maximum count value. Refer to `Intel 8254 programmable 3082 + interval timer <https://www.scs.stanford.edu/10wi-cs140/pintos/specs/8254.pdf>`_. 3083 + 3078 3084 This IOCTL replaces the obsolete KVM_SET_PIT. 3079 3085 3080 3086
+3 -3
Documentation/virt/kvm/x86/hypercalls.rst
··· 137 137 Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, 138 138 or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. 139 139 140 - 6. KVM_HC_SEND_IPI 140 + 7. KVM_HC_SEND_IPI 141 141 ------------------ 142 142 143 143 :Architecture: x86 ··· 158 158 159 159 Returns the number of CPUs to which the IPIs were delivered successfully. 160 160 161 - 7. KVM_HC_SCHED_YIELD 161 + 8. KVM_HC_SCHED_YIELD 162 162 --------------------- 163 163 164 164 :Architecture: x86 ··· 170 170 :Usage example: When sending a call-function IPI-many to vCPUs, yield if 171 171 any of the IPI target vCPUs was preempted. 172 172 173 - 8. KVM_HC_MAP_GPA_RANGE 173 + 9. KVM_HC_MAP_GPA_RANGE 174 174 ------------------------- 175 175 :Architecture: x86 176 176 :Status: active
+1
arch/x86/include/asm/cpufeatures.h
··· 497 497 #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ 498 498 #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ 499 499 #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ 500 + #define X86_FEATURE_MSR_IMM (21*32+15) /* MSR immediate form instructions */ 500 501 501 502 /* 502 503 * BUG word(s)
+1 -1
arch/x86/include/asm/kvm-x86-ops.h
··· 138 138 KVM_X86_OP(apic_init_signal_blocked) 139 139 KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) 140 140 KVM_X86_OP_OPTIONAL(migrate_timers) 141 - KVM_X86_OP(recalc_msr_intercepts) 141 + KVM_X86_OP(recalc_intercepts) 142 142 KVM_X86_OP(complete_emulated_msr) 143 143 KVM_X86_OP(vcpu_deliver_sipi_vector) 144 144 KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
+20 -11
arch/x86/include/asm/kvm_host.h
··· 120 120 #define KVM_REQ_TLB_FLUSH_GUEST \ 121 121 KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 122 122 #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) 123 - #define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) 123 + #define KVM_REQ_RECALC_INTERCEPTS KVM_ARCH_REQ(29) 124 124 #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ 125 125 KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) 126 126 #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ ··· 545 545 #define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \ 546 546 KVM_MAX_NR_AMD_GP_COUNTERS) 547 547 548 - #define KVM_MAX_NR_INTEL_FIXED_COUTNERS 3 549 - #define KVM_MAX_NR_AMD_FIXED_COUTNERS 0 550 - #define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUTNERS, \ 551 - KVM_MAX_NR_AMD_FIXED_COUTNERS) 548 + #define KVM_MAX_NR_INTEL_FIXED_COUNTERS 3 549 + #define KVM_MAX_NR_AMD_FIXED_COUNTERS 0 550 + #define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUNTERS, \ 551 + KVM_MAX_NR_AMD_FIXED_COUNTERS) 552 552 553 553 struct kvm_pmu { 554 554 u8 version; ··· 578 578 }; 579 579 DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); 580 580 DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); 581 + 582 + DECLARE_BITMAP(pmc_counting_instructions, X86_PMC_IDX_MAX); 583 + DECLARE_BITMAP(pmc_counting_branches, X86_PMC_IDX_MAX); 581 584 582 585 u64 ds_area; 583 586 u64 pebs_enable; ··· 774 771 CPUID_7_2_EDX, 775 772 CPUID_24_0_EBX, 776 773 CPUID_8000_0021_ECX, 774 + CPUID_7_1_ECX, 777 775 NR_KVM_CPU_CAPS, 778 776 779 777 NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, ··· 930 926 bool emulate_regs_need_sync_from_vcpu; 931 927 int (*complete_userspace_io)(struct kvm_vcpu *vcpu); 932 928 unsigned long cui_linear_rip; 929 + int cui_rdmsr_imm_reg; 933 930 934 931 gpa_t time; 935 932 s8 pvclock_tsc_shift; ··· 1386 1381 u8 vm_type; 1387 1382 bool has_private_mem; 1388 1383 bool has_protected_state; 1384 + bool has_protected_eoi; 1389 1385 bool pre_fault_allowed; 1390 1386 struct hlist_head 
*mmu_page_hash; 1391 1387 struct list_head active_mmu_pages; ··· 1927 1921 int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); 1928 1922 1929 1923 void (*migrate_timers)(struct kvm_vcpu *vcpu); 1930 - void (*recalc_msr_intercepts)(struct kvm_vcpu *vcpu); 1924 + void (*recalc_intercepts)(struct kvm_vcpu *vcpu); 1931 1925 int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); 1932 1926 1933 1927 void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); ··· 2168 2162 2169 2163 void kvm_enable_efer_bits(u64); 2170 2164 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); 2171 - int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data); 2172 - int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data); 2173 - int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); 2174 - int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); 2175 - int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); 2165 + int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); 2166 + int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); 2167 + int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); 2168 + int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); 2169 + int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data); 2170 + int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data); 2176 2171 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); 2172 + int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); 2177 2173 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); 2174 + int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); 2178 2175 int kvm_emulate_as_nop(struct kvm_vcpu *vcpu); 2179 2176 int kvm_emulate_invd(struct kvm_vcpu *vcpu); 2180 2177 int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
+10 -6
arch/x86/include/asm/msr-index.h
··· 315 315 #define PERF_CAP_PT_IDX 16 316 316 317 317 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 318 - #define PERF_CAP_PEBS_TRAP BIT_ULL(6) 319 - #define PERF_CAP_ARCH_REG BIT_ULL(7) 320 - #define PERF_CAP_PEBS_FORMAT 0xf00 321 - #define PERF_CAP_PEBS_BASELINE BIT_ULL(14) 322 - #define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ 323 - PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) 318 + 319 + #define PERF_CAP_LBR_FMT 0x3f 320 + #define PERF_CAP_PEBS_TRAP BIT_ULL(6) 321 + #define PERF_CAP_ARCH_REG BIT_ULL(7) 322 + #define PERF_CAP_PEBS_FORMAT 0xf00 323 + #define PERF_CAP_FW_WRITES BIT_ULL(13) 324 + #define PERF_CAP_PEBS_BASELINE BIT_ULL(14) 325 + #define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ 326 + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) 324 327 325 328 #define MSR_IA32_RTIT_CTL 0x00000570 326 329 #define RTIT_CTL_TRACEEN BIT(0) ··· 736 733 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 737 734 #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 738 735 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 736 + #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET 0xc0000303 739 737 740 738 /* AMD Hardware Feedback Support MSRs */ 741 739 #define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500
+5 -1
arch/x86/include/uapi/asm/vmx.h
··· 94 94 #define EXIT_REASON_BUS_LOCK 74 95 95 #define EXIT_REASON_NOTIFY 75 96 96 #define EXIT_REASON_TDCALL 77 97 + #define EXIT_REASON_MSR_READ_IMM 84 98 + #define EXIT_REASON_MSR_WRITE_IMM 85 97 99 98 100 #define VMX_EXIT_REASONS \ 99 101 { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ ··· 160 158 { EXIT_REASON_TPAUSE, "TPAUSE" }, \ 161 159 { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ 162 160 { EXIT_REASON_NOTIFY, "NOTIFY" }, \ 163 - { EXIT_REASON_TDCALL, "TDCALL" } 161 + { EXIT_REASON_TDCALL, "TDCALL" }, \ 162 + { EXIT_REASON_MSR_READ_IMM, "MSR_READ_IMM" }, \ 163 + { EXIT_REASON_MSR_WRITE_IMM, "MSR_WRITE_IMM" } 164 164 165 165 #define VMX_EXIT_REASON_FLAGS \ 166 166 { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
+1
arch/x86/kernel/cpu/scattered.c
··· 27 27 { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, 28 28 { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, 29 29 { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, 30 + { X86_FEATURE_MSR_IMM, CPUID_ECX, 5, 0x00000007, 1 }, 30 31 { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 }, 31 32 { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, 32 33 { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
+10 -3
arch/x86/kvm/cpuid.c
··· 448 448 * adjustments to the reserved GPA bits. 449 449 */ 450 450 kvm_mmu_after_set_cpuid(vcpu); 451 + 452 + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); 451 453 } 452 454 453 455 int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) ··· 987 985 F(LAM), 988 986 ); 989 987 988 + kvm_cpu_cap_init(CPUID_7_1_ECX, 989 + SCATTERED_F(MSR_IMM), 990 + ); 991 + 990 992 kvm_cpu_cap_init(CPUID_7_1_EDX, 991 993 F(AVX_VNNI_INT8), 992 994 F(AVX_NE_CONVERT), ··· 1417 1411 goto out; 1418 1412 1419 1413 cpuid_entry_override(entry, CPUID_7_1_EAX); 1414 + cpuid_entry_override(entry, CPUID_7_1_ECX); 1420 1415 cpuid_entry_override(entry, CPUID_7_1_EDX); 1421 1416 entry->ebx = 0; 1422 - entry->ecx = 0; 1423 1417 } 1424 1418 if (max_idx >= 2) { 1425 1419 entry = do_host_cpuid(array, function, 2); ··· 1826 1820 int r; 1827 1821 1828 1822 if (func == CENTAUR_CPUID_SIGNATURE && 1829 - boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) 1823 + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR && 1824 + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) 1830 1825 return 0; 1831 1826 1832 1827 r = do_cpuid_func(array, func, type); ··· 2008 2001 if (function == 7 && index == 0) { 2009 2002 u64 data; 2010 2003 if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) && 2011 - !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && 2004 + !kvm_msr_read(vcpu, MSR_IA32_TSX_CTRL, &data) && 2012 2005 (data & TSX_CTRL_CPUID_CLEAR)) 2013 2006 *ebx &= ~(feature_bit(RTM) | feature_bit(HLE)); 2014 2007 } else if (function == 0x80000007) {
+6 -7
arch/x86/kvm/emulate.c
··· 4330 4330 I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), 4331 4331 G(ByteOp, group11), G(0, group11), 4332 4332 /* 0xC8 - 0xCF */ 4333 - I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter), 4334 - I(Stack | IsBranch, em_leave), 4333 + I(Stack | SrcImmU16 | Src2ImmByte, em_enter), 4334 + I(Stack, em_leave), 4335 4335 I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm), 4336 4336 I(ImplicitOps | IsBranch, em_ret_far), 4337 4337 D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn), ··· 5107 5107 ctxt->mem_read.end = 0; 5108 5108 } 5109 5109 5110 - int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) 5110 + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts) 5111 5111 { 5112 5112 const struct x86_emulate_ops *ops = ctxt->ops; 5113 5113 int rc = X86EMUL_CONTINUE; 5114 5114 int saved_dst_type = ctxt->dst.type; 5115 - bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt); 5116 5115 5117 5116 ctxt->mem_read.pos = 0; 5118 5117 ··· 5159 5160 fetch_possible_mmx_operand(&ctxt->dst); 5160 5161 } 5161 5162 5162 - if (unlikely(is_guest_mode) && ctxt->intercept) { 5163 + if (unlikely(check_intercepts) && ctxt->intercept) { 5163 5164 rc = emulator_check_intercept(ctxt, ctxt->intercept, 5164 5165 X86_ICPT_PRE_EXCEPT); 5165 5166 if (rc != X86EMUL_CONTINUE) ··· 5188 5189 goto done; 5189 5190 } 5190 5191 5191 - if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { 5192 + if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { 5192 5193 rc = emulator_check_intercept(ctxt, ctxt->intercept, 5193 5194 X86_ICPT_POST_EXCEPT); 5194 5195 if (rc != X86EMUL_CONTINUE) ··· 5242 5243 5243 5244 special_insn: 5244 5245 5245 - if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) { 5246 + if (unlikely(check_intercepts) && (ctxt->d & Intercept)) { 5246 5247 rc = emulator_check_intercept(ctxt, ctxt->intercept, 5247 5248 X86_ICPT_POST_MEMACCESS); 5248 5249 if (rc != X86EMUL_CONTINUE)
+5 -7
arch/x86/kvm/hyperv.c
··· 1168 1168 BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); 1169 1169 BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); 1170 1170 1171 - mutex_lock(&hv->hv_lock); 1171 + guard(mutex)(&hv->hv_lock); 1172 1172 1173 1173 if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || 1174 1174 hv->hv_tsc_page_status == HV_TSC_PAGE_SET || 1175 1175 hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) 1176 - goto out_unlock; 1176 + return; 1177 1177 1178 1178 if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) 1179 - goto out_unlock; 1179 + return; 1180 1180 1181 1181 gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; 1182 1182 /* ··· 1192 1192 goto out_err; 1193 1193 1194 1194 hv->hv_tsc_page_status = HV_TSC_PAGE_SET; 1195 - goto out_unlock; 1195 + return; 1196 1196 } 1197 1197 1198 1198 /* ··· 1228 1228 goto out_err; 1229 1229 1230 1230 hv->hv_tsc_page_status = HV_TSC_PAGE_SET; 1231 - goto out_unlock; 1231 + return; 1232 1232 1233 1233 out_err: 1234 1234 hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; 1235 - out_unlock: 1236 - mutex_unlock(&hv->hv_lock); 1237 1235 } 1238 1236 1239 1237 void kvm_hv_request_tsc_page_update(struct kvm *kvm)
+1 -14
arch/x86/kvm/ioapic.c
··· 1 + // SPDX-License-Identifier: LGPL-2.1-or-later 1 2 /* 2 3 * Copyright (C) 2001 MandrakeSoft S.A. 3 4 * Copyright 2010 Red Hat, Inc. and/or its affiliates. ··· 8 7 * 75002 Paris - France 9 8 * http://www.linux-mandrake.com/ 10 9 * http://www.mandrakesoft.com/ 11 - * 12 - * This library is free software; you can redistribute it and/or 13 - * modify it under the terms of the GNU Lesser General Public 14 - * License as published by the Free Software Foundation; either 15 - * version 2 of the License, or (at your option) any later version. 16 - * 17 - * This library is distributed in the hope that it will be useful, 18 - * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 - * Lesser General Public License for more details. 21 - * 22 - * You should have received a copy of the GNU Lesser General Public 23 - * License along with this library; if not, write to the Free Software 24 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 25 10 * 26 11 * Yunhong Jiang <yunhong.jiang@intel.com> 27 12 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
-57
arch/x86/kvm/irq.c
··· 195 195 return irqchip_in_kernel(kvm); 196 196 } 197 197 198 - int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 199 - struct kvm_lapic_irq *irq, struct dest_map *dest_map) 200 - { 201 - int r = -1; 202 - struct kvm_vcpu *vcpu, *lowest = NULL; 203 - unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; 204 - unsigned int dest_vcpus = 0; 205 - 206 - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) 207 - return r; 208 - 209 - if (irq->dest_mode == APIC_DEST_PHYSICAL && 210 - irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { 211 - pr_info("apic: phys broadcast and lowest prio\n"); 212 - irq->delivery_mode = APIC_DM_FIXED; 213 - } 214 - 215 - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); 216 - 217 - kvm_for_each_vcpu(i, vcpu, kvm) { 218 - if (!kvm_apic_present(vcpu)) 219 - continue; 220 - 221 - if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, 222 - irq->dest_id, irq->dest_mode)) 223 - continue; 224 - 225 - if (!kvm_lowest_prio_delivery(irq)) { 226 - if (r < 0) 227 - r = 0; 228 - r += kvm_apic_set_irq(vcpu, irq, dest_map); 229 - } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { 230 - if (!kvm_vector_hashing_enabled()) { 231 - if (!lowest) 232 - lowest = vcpu; 233 - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) 234 - lowest = vcpu; 235 - } else { 236 - __set_bit(i, dest_vcpu_bitmap); 237 - dest_vcpus++; 238 - } 239 - } 240 - } 241 - 242 - if (dest_vcpus != 0) { 243 - int idx = kvm_vector_to_index(irq->vector, dest_vcpus, 244 - dest_vcpu_bitmap, KVM_MAX_VCPUS); 245 - 246 - lowest = kvm_get_vcpu(kvm, idx); 247 - } 248 - 249 - if (lowest) 250 - r = kvm_apic_set_irq(lowest, irq, dest_map); 251 - 252 - return r; 253 - } 254 - 255 198 static void kvm_msi_to_lapic_irq(struct kvm *kvm, 256 199 struct kvm_kernel_irq_routing_entry *e, 257 200 struct kvm_lapic_irq *irq)
-4
arch/x86/kvm/irq.h
··· 121 121 122 122 int apic_has_pending_timer(struct kvm_vcpu *vcpu); 123 123 124 - int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 125 - struct kvm_lapic_irq *irq, 126 - struct dest_map *dest_map); 127 - 128 124 #endif
+1 -2
arch/x86/kvm/kvm_emulate.h
··· 235 235 void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); 236 236 237 237 bool (*is_smm)(struct x86_emulate_ctxt *ctxt); 238 - bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt); 239 238 int (*leave_smm)(struct x86_emulate_ctxt *ctxt); 240 239 void (*triple_fault)(struct x86_emulate_ctxt *ctxt); 241 240 int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); ··· 520 521 #define EMULATION_RESTART 1 521 522 #define EMULATION_INTERCEPTED 2 522 523 void init_decode_cache(struct x86_emulate_ctxt *ctxt); 523 - int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); 524 + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts); 524 525 int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 525 526 u16 tss_selector, int idt_index, int reason, 526 527 bool has_error_code, u32 error_code);
+129 -44
arch/x86/kvm/lapic.c
··· 74 74 #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 75 75 /* step-by-step approximation to mitigate fluctuation */ 76 76 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 77 + 78 + static bool __read_mostly vector_hashing_enabled = true; 79 + module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444); 80 + 77 81 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); 78 82 static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); 79 83 ··· 134 130 (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm)); 135 131 } 136 132 137 - bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) 133 + static bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) 138 134 { 139 135 return kvm_x86_ops.set_hv_timer 140 136 && !(kvm_mwait_in_guest(vcpu->kvm) || ··· 1067 1063 } 1068 1064 EXPORT_SYMBOL_GPL(kvm_apic_match_dest); 1069 1065 1070 - int kvm_vector_to_index(u32 vector, u32 dest_vcpus, 1071 - const unsigned long *bitmap, u32 bitmap_size) 1066 + static int kvm_vector_to_index(u32 vector, u32 dest_vcpus, 1067 + const unsigned long *bitmap, u32 bitmap_size) 1072 1068 { 1073 - u32 mod; 1074 - int i, idx = -1; 1069 + int idx = find_nth_bit(bitmap, bitmap_size, vector % dest_vcpus); 1075 1070 1076 - mod = vector % dest_vcpus; 1077 - 1078 - for (i = 0; i <= mod; i++) { 1079 - idx = find_next_bit(bitmap, bitmap_size, idx + 1); 1080 - BUG_ON(idx == bitmap_size); 1081 - } 1082 - 1071 + BUG_ON(idx >= bitmap_size); 1083 1072 return idx; 1084 1073 } 1085 1074 ··· 1101 1104 } 1102 1105 1103 1106 return false; 1107 + } 1108 + 1109 + static bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) 1110 + { 1111 + return (irq->delivery_mode == APIC_DM_LOWEST || irq->msi_redir_hint); 1112 + } 1113 + 1114 + static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) 1115 + { 1116 + return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 1104 1117 } 1105 1118 1106 1119 /* Return true if the interrupt can be handled by using *bitmap as 
index mask ··· 1156 1149 if (!kvm_lowest_prio_delivery(irq)) 1157 1150 return true; 1158 1151 1159 - if (!kvm_vector_hashing_enabled()) { 1152 + if (!vector_hashing_enabled) { 1160 1153 lowest = -1; 1161 1154 for_each_set_bit(i, bitmap, 16) { 1162 1155 if (!(*dst)[i]) ··· 1263 1256 1264 1257 rcu_read_unlock(); 1265 1258 return ret; 1259 + } 1260 + 1261 + int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 1262 + struct kvm_lapic_irq *irq, struct dest_map *dest_map) 1263 + { 1264 + int r = -1; 1265 + struct kvm_vcpu *vcpu, *lowest = NULL; 1266 + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; 1267 + unsigned int dest_vcpus = 0; 1268 + 1269 + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) 1270 + return r; 1271 + 1272 + if (irq->dest_mode == APIC_DEST_PHYSICAL && 1273 + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { 1274 + pr_info("apic: phys broadcast and lowest prio\n"); 1275 + irq->delivery_mode = APIC_DM_FIXED; 1276 + } 1277 + 1278 + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); 1279 + 1280 + kvm_for_each_vcpu(i, vcpu, kvm) { 1281 + if (!kvm_apic_present(vcpu)) 1282 + continue; 1283 + 1284 + if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, 1285 + irq->dest_id, irq->dest_mode)) 1286 + continue; 1287 + 1288 + if (!kvm_lowest_prio_delivery(irq)) { 1289 + if (r < 0) 1290 + r = 0; 1291 + r += kvm_apic_set_irq(vcpu, irq, dest_map); 1292 + } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) { 1293 + if (!vector_hashing_enabled) { 1294 + if (!lowest) 1295 + lowest = vcpu; 1296 + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) 1297 + lowest = vcpu; 1298 + } else { 1299 + __set_bit(i, dest_vcpu_bitmap); 1300 + dest_vcpus++; 1301 + } 1302 + } 1303 + } 1304 + 1305 + if (dest_vcpus != 0) { 1306 + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, 1307 + dest_vcpu_bitmap, KVM_MAX_VCPUS); 1308 + 1309 + lowest = kvm_get_vcpu(kvm, idx); 1310 + } 1311 + 1312 + if (lowest) 1313 + r = kvm_apic_set_irq(lowest, 
irq, dest_map); 1314 + 1315 + return r; 1266 1316 } 1267 1317 1268 1318 /* ··· 1465 1401 rcu_read_unlock(); 1466 1402 } 1467 1403 1468 - int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) 1469 - { 1470 - return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; 1471 - } 1472 - 1473 1404 static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) 1474 1405 { 1475 1406 return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors); ··· 1542 1483 } 1543 1484 EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); 1544 1485 1486 + static void kvm_icr_to_lapic_irq(struct kvm_lapic *apic, u32 icr_low, 1487 + u32 icr_high, struct kvm_lapic_irq *irq) 1488 + { 1489 + /* KVM has no delay and should always clear the BUSY/PENDING flag. */ 1490 + WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); 1491 + 1492 + irq->vector = icr_low & APIC_VECTOR_MASK; 1493 + irq->delivery_mode = icr_low & APIC_MODE_MASK; 1494 + irq->dest_mode = icr_low & APIC_DEST_MASK; 1495 + irq->level = (icr_low & APIC_INT_ASSERT) != 0; 1496 + irq->trig_mode = icr_low & APIC_INT_LEVELTRIG; 1497 + irq->shorthand = icr_low & APIC_SHORT_MASK; 1498 + irq->msi_redir_hint = false; 1499 + if (apic_x2apic_mode(apic)) 1500 + irq->dest_id = icr_high; 1501 + else 1502 + irq->dest_id = GET_XAPIC_DEST_FIELD(icr_high); 1503 + } 1504 + 1545 1505 void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) 1546 1506 { 1547 1507 struct kvm_lapic_irq irq; 1548 1508 1549 - /* KVM has no delay and should always clear the BUSY/PENDING flag. 
*/ 1550 - WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); 1551 - 1552 - irq.vector = icr_low & APIC_VECTOR_MASK; 1553 - irq.delivery_mode = icr_low & APIC_MODE_MASK; 1554 - irq.dest_mode = icr_low & APIC_DEST_MASK; 1555 - irq.level = (icr_low & APIC_INT_ASSERT) != 0; 1556 - irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; 1557 - irq.shorthand = icr_low & APIC_SHORT_MASK; 1558 - irq.msi_redir_hint = false; 1559 - if (apic_x2apic_mode(apic)) 1560 - irq.dest_id = icr_high; 1561 - else 1562 - irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); 1509 + kvm_icr_to_lapic_irq(apic, icr_low, icr_high, &irq); 1563 1510 1564 1511 trace_kvm_apic_ipi(icr_low, irq.dest_id); 1565 1512 ··· 2500 2435 2501 2436 #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) 2502 2437 2503 - int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) 2438 + static int __kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data, bool fast) 2504 2439 { 2505 2440 if (data & X2APIC_ICR_RESERVED_BITS) 2506 2441 return 1; ··· 2515 2450 */ 2516 2451 data &= ~APIC_ICR_BUSY; 2517 2452 2518 - kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); 2453 + if (fast) { 2454 + struct kvm_lapic_irq irq; 2455 + int ignored; 2456 + 2457 + kvm_icr_to_lapic_irq(apic, (u32)data, (u32)(data >> 32), &irq); 2458 + 2459 + if (!kvm_irq_delivery_to_apic_fast(apic->vcpu->kvm, apic, &irq, 2460 + &ignored, NULL)) 2461 + return -EWOULDBLOCK; 2462 + 2463 + trace_kvm_apic_ipi((u32)data, irq.dest_id); 2464 + } else { 2465 + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); 2466 + } 2519 2467 if (kvm_x86_ops.x2apic_icr_is_split) { 2520 2468 kvm_lapic_set_reg(apic, APIC_ICR, data); 2521 2469 kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); ··· 2537 2459 } 2538 2460 trace_kvm_apic_write(APIC_ICR, data); 2539 2461 return 0; 2462 + } 2463 + 2464 + static int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) 2465 + { 2466 + return __kvm_x2apic_icr_write(apic, data, false); 2467 + } 2468 + 2469 + int 
kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data) 2470 + { 2471 + return __kvm_x2apic_icr_write(apic, data, true); 2540 2472 } 2541 2473 2542 2474 static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) ··· 2749 2661 int kvm_alloc_apic_access_page(struct kvm *kvm) 2750 2662 { 2751 2663 void __user *hva; 2752 - int ret = 0; 2753 2664 2754 - mutex_lock(&kvm->slots_lock); 2665 + guard(mutex)(&kvm->slots_lock); 2666 + 2755 2667 if (kvm->arch.apic_access_memslot_enabled || 2756 2668 kvm->arch.apic_access_memslot_inhibited) 2757 - goto out; 2669 + return 0; 2758 2670 2759 2671 hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 2760 2672 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); 2761 - if (IS_ERR(hva)) { 2762 - ret = PTR_ERR(hva); 2763 - goto out; 2764 - } 2673 + if (IS_ERR(hva)) 2674 + return PTR_ERR(hva); 2765 2675 2766 2676 kvm->arch.apic_access_memslot_enabled = true; 2767 - out: 2768 - mutex_unlock(&kvm->slots_lock); 2769 - return ret; 2677 + 2678 + return 0; 2770 2679 } 2771 2680 EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page); 2772 2681
+4 -11
arch/x86/kvm/lapic.h
··· 105 105 void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); 106 106 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 107 107 int shorthand, unsigned int dest, int dest_mode); 108 - int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 109 108 void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); 110 109 bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); 111 110 bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); ··· 118 119 119 120 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, 120 121 struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); 122 + int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, 123 + struct kvm_lapic_irq *irq, 124 + struct dest_map *dest_map); 121 125 void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); 122 126 123 127 int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); ··· 139 137 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); 140 138 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); 141 139 142 - int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data); 140 + int kvm_x2apic_icr_write_fast(struct kvm_lapic *apic, u64 data); 143 141 int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); 144 142 int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); 145 143 ··· 224 222 !kvm_x86_call(apic_init_signal_blocked)(vcpu); 225 223 } 226 224 227 - static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) 228 - { 229 - return (irq->delivery_mode == APIC_DM_LOWEST || 230 - irq->msi_redir_hint); 231 - } 232 - 233 225 static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) 234 226 { 235 227 return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); ··· 238 242 239 243 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, 240 244 struct 
kvm_vcpu **dest_vcpu); 241 - int kvm_vector_to_index(u32 vector, u32 dest_vcpus, 242 - const unsigned long *bitmap, u32 bitmap_size); 243 245 void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); 244 246 void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); 245 247 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); 246 248 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); 247 249 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); 248 - bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu); 249 250 250 251 static inline enum lapic_mode kvm_apic_mode(u64 apic_base) 251 252 {
+137 -32
arch/x86/kvm/pmu.c
··· 26 26 /* This is enough to filter the vast majority of currently defined events. */ 27 27 #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 28 28 29 + /* Unadultered PMU capabilities of the host, i.e. of hardware. */ 30 + static struct x86_pmu_capability __read_mostly kvm_host_pmu; 31 + 32 + /* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support. */ 29 33 struct x86_pmu_capability __read_mostly kvm_pmu_cap; 30 34 EXPORT_SYMBOL_GPL(kvm_pmu_cap); 31 35 32 - struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; 33 - EXPORT_SYMBOL_GPL(kvm_pmu_eventsel); 36 + struct kvm_pmu_emulated_event_selectors { 37 + u64 INSTRUCTIONS_RETIRED; 38 + u64 BRANCH_INSTRUCTIONS_RETIRED; 39 + }; 40 + static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; 34 41 35 42 /* Precise Distribution of Instructions Retired (PDIR) */ 36 43 static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { ··· 101 94 #define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP 102 95 #include <asm/kvm-x86-pmu-ops.h> 103 96 #undef __KVM_X86_PMU_OP 97 + } 98 + 99 + void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) 100 + { 101 + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; 102 + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; 103 + 104 + perf_get_x86_pmu_capability(&kvm_host_pmu); 105 + 106 + /* 107 + * Hybrid PMUs don't play nice with virtualization without careful 108 + * configuration by userspace, and KVM's APIs for reporting supported 109 + * vPMU features do not account for hybrid PMUs. Disable vPMU support 110 + * for hybrid PMUs until KVM gains a way to let userspace opt-in. 111 + */ 112 + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) 113 + enable_pmu = false; 114 + 115 + if (enable_pmu) { 116 + /* 117 + * WARN if perf did NOT disable hardware PMU if the number of 118 + * architecturally required GP counters aren't present, i.e. 
if 119 + * there are a non-zero number of counters, but fewer than what 120 + * is architecturally required. 121 + */ 122 + if (!kvm_host_pmu.num_counters_gp || 123 + WARN_ON_ONCE(kvm_host_pmu.num_counters_gp < min_nr_gp_ctrs)) 124 + enable_pmu = false; 125 + else if (is_intel && !kvm_host_pmu.version) 126 + enable_pmu = false; 127 + } 128 + 129 + if (!enable_pmu) { 130 + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); 131 + return; 132 + } 133 + 134 + memcpy(&kvm_pmu_cap, &kvm_host_pmu, sizeof(kvm_host_pmu)); 135 + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); 136 + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, 137 + pmu_ops->MAX_NR_GP_COUNTERS); 138 + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, 139 + KVM_MAX_NR_FIXED_COUNTERS); 140 + 141 + kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = 142 + perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); 143 + kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = 144 + perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 104 145 } 105 146 106 147 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) ··· 481 426 return true; 482 427 } 483 428 484 - static bool check_pmu_event_filter(struct kvm_pmc *pmc) 429 + static bool pmc_is_event_allowed(struct kvm_pmc *pmc) 485 430 { 486 431 struct kvm_x86_pmu_event_filter *filter; 487 432 struct kvm *kvm = pmc->vcpu->kvm; ··· 496 441 return is_fixed_event_allowed(filter, pmc->idx); 497 442 } 498 443 499 - static bool pmc_event_is_allowed(struct kvm_pmc *pmc) 500 - { 501 - return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && 502 - check_pmu_event_filter(pmc); 503 - } 504 - 505 444 static int reprogram_counter(struct kvm_pmc *pmc) 506 445 { 507 446 struct kvm_pmu *pmu = pmc_to_pmu(pmc); ··· 506 457 507 458 emulate_overflow = pmc_pause_counter(pmc); 508 459 509 - if (!pmc_event_is_allowed(pmc)) 460 + if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || 461 + !pmc_is_event_allowed(pmc)) 510 462 
return 0; 511 463 512 464 if (emulate_overflow) ··· 541 491 !(eventsel & ARCH_PERFMON_EVENTSEL_OS), 542 492 eventsel & ARCH_PERFMON_EVENTSEL_INT); 543 493 } 494 + 495 + static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel) 496 + { 497 + /* 498 + * Ignore checks for edge detect (all events currently emulated by KVM 499 + * are always rising edges), pin control (unsupported by modern CPUs), 500 + * and counter mask and its invert flag (KVM doesn't emulate multiple 501 + * events in a single clock cycle). 502 + * 503 + * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit 504 + * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34). 505 + * Checking the "in HLE/RTM transaction" flags is correct as the vCPU 506 + * can't be in a transaction if KVM is emulating an instruction. 507 + * 508 + * Checking the reserved bits might be wrong if they are defined in the 509 + * future, but so could ignoring them, so do the simple thing for now. 510 + */ 511 + return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB); 512 + } 513 + 514 + void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc) 515 + { 516 + bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1); 517 + bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1); 518 + 519 + /* 520 + * Do NOT consult the PMU event filters, as the filters must be checked 521 + * at the time of emulation to ensure KVM uses fresh information, e.g. 522 + * omitting a PMC from a bitmap could result in a missed event if the 523 + * filter is changed to allow counting the event. 
524 + */ 525 + if (!pmc_is_locally_enabled(pmc)) 526 + return; 527 + 528 + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED)) 529 + bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1); 530 + 531 + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED)) 532 + bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1); 533 + } 534 + EXPORT_SYMBOL_GPL(kvm_pmu_recalc_pmc_emulation); 544 535 545 536 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) 546 537 { ··· 618 527 */ 619 528 if (unlikely(pmu->need_cleanup)) 620 529 kvm_pmu_cleanup(vcpu); 530 + 531 + kvm_for_each_pmc(pmu, pmc, bit, bitmap) 532 + kvm_pmu_recalc_pmc_emulation(pmu, pmc); 621 533 } 622 534 623 535 int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) ··· 744 650 msr_info->data = pmu->global_ctrl; 745 651 break; 746 652 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: 653 + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: 747 654 case MSR_CORE_PERF_GLOBAL_OVF_CTRL: 748 655 msr_info->data = 0; 749 656 break; ··· 805 710 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: 806 711 if (!msr_info->host_initiated) 807 712 pmu->global_status &= ~data; 713 + break; 714 + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: 715 + if (!msr_info->host_initiated) 716 + pmu->global_status |= data & ~pmu->global_status_rsvd; 808 717 break; 809 718 default: 810 719 kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); ··· 888 789 */ 889 790 if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) 890 791 pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); 792 + 793 + bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); 794 + bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, 795 + pmu->nr_arch_fixed_counters); 891 796 } 892 797 893 798 void kvm_pmu_init(struct kvm_vcpu *vcpu) ··· 916 813 pmu->pmc_in_use, X86_PMC_IDX_MAX); 917 814 918 815 kvm_for_each_pmc(pmu, pmc, i, bitmask) { 919 - if (pmc->perf_event && !pmc_speculative_in_use(pmc)) 816 + if 
(pmc->perf_event && !pmc_is_locally_enabled(pmc)) 920 817 pmc_stop_counter(pmc); 921 818 } 922 819 ··· 963 860 select_user; 964 861 } 965 862 966 - void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) 863 + static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, 864 + const unsigned long *event_pmcs) 967 865 { 968 866 DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); 969 867 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 970 868 struct kvm_pmc *pmc; 971 - int i; 869 + int i, idx; 972 870 973 871 BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX); 974 872 873 + if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX)) 874 + return; 875 + 975 876 if (!kvm_pmu_has_perf_global_ctrl(pmu)) 976 - bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); 977 - else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx, 877 + bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX); 878 + else if (!bitmap_and(bitmap, event_pmcs, 978 879 (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX)) 979 880 return; 980 881 882 + idx = srcu_read_lock(&vcpu->kvm->srcu); 981 883 kvm_for_each_pmc(pmu, pmc, i, bitmap) { 982 - /* 983 - * Ignore checks for edge detect (all events currently emulated 984 - * but KVM are always rising edges), pin control (unsupported 985 - * by modern CPUs), and counter mask and its invert flag (KVM 986 - * doesn't emulate multiple events in a single clock cycle). 987 - * 988 - * Note, the uppermost nibble of AMD's mask overlaps Intel's 989 - * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved 990 - * bits (bits 35:34). Checking the "in HLE/RTM transaction" 991 - * flags is correct as the vCPU can't be in a transaction if 992 - * KVM is emulating an instruction. Checking the reserved bits 993 - * might be wrong if they are defined in the future, but so 994 - * could ignoring them, so do the simple thing for now. 
995 - */ 996 - if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) || 997 - !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) 884 + if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc)) 998 885 continue; 999 886 1000 887 kvm_pmu_incr_counter(pmc); 1001 888 } 889 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 1002 890 } 1003 - EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); 891 + 892 + void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) 893 + { 894 + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions); 895 + } 896 + EXPORT_SYMBOL_GPL(kvm_pmu_instruction_retired); 897 + 898 + void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu) 899 + { 900 + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches); 901 + } 902 + EXPORT_SYMBOL_GPL(kvm_pmu_branch_retired); 1004 903 1005 904 static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) 1006 905 {
+7 -53
arch/x86/kvm/pmu.h
··· 23 23 24 24 #define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED 25 25 26 - struct kvm_pmu_emulated_event_selectors { 27 - u64 INSTRUCTIONS_RETIRED; 28 - u64 BRANCH_INSTRUCTIONS_RETIRED; 29 - }; 30 - 31 26 struct kvm_pmu_ops { 32 27 struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, 33 28 unsigned int idx, u64 *mask); ··· 160 165 return NULL; 161 166 } 162 167 163 - static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) 168 + static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) 164 169 { 165 170 struct kvm_pmu *pmu = pmc_to_pmu(pmc); 166 171 ··· 173 178 } 174 179 175 180 extern struct x86_pmu_capability kvm_pmu_cap; 176 - extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel; 177 181 178 - static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) 179 - { 180 - bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; 181 - int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; 182 + void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); 182 183 183 - /* 184 - * Hybrid PMUs don't play nice with virtualization without careful 185 - * configuration by userspace, and KVM's APIs for reporting supported 186 - * vPMU features do not account for hybrid PMUs. Disable vPMU support 187 - * for hybrid PMUs until KVM gains a way to let userspace opt-in. 188 - */ 189 - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) 190 - enable_pmu = false; 191 - 192 - if (enable_pmu) { 193 - perf_get_x86_pmu_capability(&kvm_pmu_cap); 194 - 195 - /* 196 - * WARN if perf did NOT disable hardware PMU if the number of 197 - * architecturally required GP counters aren't present, i.e. if 198 - * there are a non-zero number of counters, but fewer than what 199 - * is architecturally required. 
200 - */ 201 - if (!kvm_pmu_cap.num_counters_gp || 202 - WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) 203 - enable_pmu = false; 204 - else if (is_intel && !kvm_pmu_cap.version) 205 - enable_pmu = false; 206 - } 207 - 208 - if (!enable_pmu) { 209 - memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); 210 - return; 211 - } 212 - 213 - kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); 214 - kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, 215 - pmu_ops->MAX_NR_GP_COUNTERS); 216 - kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, 217 - KVM_MAX_NR_FIXED_COUNTERS); 218 - 219 - kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = 220 - perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); 221 - kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = 222 - perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 223 - } 184 + void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); 224 185 225 186 static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) 226 187 { 188 + kvm_pmu_recalc_pmc_emulation(pmc_to_pmu(pmc), pmc); 189 + 227 190 set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); 228 191 kvm_make_request(KVM_REQ_PMU, pmc->vcpu); 229 192 } ··· 225 272 void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); 226 273 void kvm_pmu_destroy(struct kvm_vcpu *vcpu); 227 274 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); 228 - void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel); 275 + void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); 276 + void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); 229 277 230 278 bool is_vmware_backdoor_pmc(u32 pmc_idx); 231 279
+5
arch/x86/kvm/reverse_cpuid.h
··· 25 25 #define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1) 26 26 #define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11) 27 27 28 + /* Intel-defined sub-features, CPUID level 0x00000007:1 (ECX) */ 29 + #define KVM_X86_FEATURE_MSR_IMM KVM_X86_FEATURE(CPUID_7_1_ECX, 5) 30 + 28 31 /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ 29 32 #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) 30 33 #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) ··· 90 87 [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, 91 88 [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, 92 89 [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX}, 90 + [CPUID_7_1_ECX] = { 7, 1, CPUID_ECX}, 93 91 }; 94 92 95 93 /* ··· 132 128 KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); 133 129 KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO); 134 130 KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO); 131 + KVM_X86_TRANSLATE_FEATURE(MSR_IMM); 135 132 default: 136 133 return x86_feature; 137 134 }
+2 -2
arch/x86/kvm/smm.c
··· 529 529 530 530 vcpu->arch.smbase = smstate->smbase; 531 531 532 - if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) 532 + if (__kvm_emulate_msr_write(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA)) 533 533 return X86EMUL_UNHANDLEABLE; 534 534 535 535 rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR); ··· 620 620 621 621 /* And finally go back to 32-bit mode. */ 622 622 efer = 0; 623 - kvm_set_msr(vcpu, MSR_EFER, efer); 623 + __kvm_emulate_msr_write(vcpu, MSR_EFER, efer); 624 624 } 625 625 #endif 626 626
+4 -4
arch/x86/kvm/svm/pmu.c
··· 41 41 struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); 42 42 unsigned int idx; 43 43 44 - if (!vcpu->kvm->arch.enable_pmu) 44 + if (!pmu->version) 45 45 return NULL; 46 46 47 47 switch (msr) { ··· 113 113 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: 114 114 case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: 115 115 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: 116 + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: 116 117 return pmu->version > 1; 117 118 default: 118 119 if (msr > MSR_F15H_PERF_CTR5 && ··· 200 199 kvm_pmu_cap.num_counters_gp); 201 200 202 201 if (pmu->version > 1) { 203 - pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); 202 + pmu->global_ctrl_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1); 204 203 pmu->global_status_rsvd = pmu->global_ctrl_rsvd; 205 204 } 206 205 207 - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; 206 + pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(48) - 1; 208 207 pmu->reserved_bits = 0xfffffff000280000ull; 209 208 pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; 210 209 /* not applicable to AMD; but clean them to prevent any fall out */ 211 210 pmu->counter_bitmask[KVM_PMC_FIXED] = 0; 212 211 pmu->nr_arch_fixed_counters = 0; 213 - bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); 214 212 } 215 213 216 214 static void amd_pmu_init(struct kvm_vcpu *vcpu)
+21 -9
arch/x86/kvm/svm/svm.c
··· 1008 1008 } 1009 1009 } 1010 1010 1011 - static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) 1011 + static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) 1012 1012 { 1013 1013 svm_recalc_instruction_intercepts(vcpu); 1014 1014 svm_recalc_msr_intercepts(vcpu); ··· 1156 1156 1157 1157 svm_hv_init_vmcb(vmcb); 1158 1158 1159 - svm_recalc_intercepts_after_set_cpuid(vcpu); 1159 + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); 1160 1160 1161 1161 vmcb_mark_all_dirty(vmcb); 1162 1162 ··· 4093 4093 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 4094 4094 { 4095 4095 struct vcpu_svm *svm = to_svm(vcpu); 4096 + struct vmcb_control_area *control = &svm->vmcb->control; 4097 + 4098 + /* 4099 + * Next RIP must be provided as IRQs are disabled, and accessing guest 4100 + * memory to decode the instruction might fault, i.e. might sleep. 4101 + */ 4102 + if (!nrips || !control->next_rip) 4103 + return EXIT_FASTPATH_NONE; 4096 4104 4097 4105 if (is_guest_mode(vcpu)) 4098 4106 return EXIT_FASTPATH_NONE; 4099 4107 4100 - switch (svm->vmcb->control.exit_code) { 4108 + switch (control->exit_code) { 4101 4109 case SVM_EXIT_MSR: 4102 - if (!svm->vmcb->control.exit_info_1) 4110 + if (!control->exit_info_1) 4103 4111 break; 4104 - return handle_fastpath_set_msr_irqoff(vcpu); 4112 + return handle_fastpath_wrmsr(vcpu); 4105 4113 case SVM_EXIT_HLT: 4106 4114 return handle_fastpath_hlt(vcpu); 4115 + case SVM_EXIT_INVD: 4116 + return handle_fastpath_invd(vcpu); 4107 4117 default: 4108 4118 break; 4109 4119 } ··· 4390 4380 4391 4381 if (sev_guest(vcpu->kvm)) 4392 4382 sev_vcpu_after_set_cpuid(svm); 4393 - 4394 - svm_recalc_intercepts_after_set_cpuid(vcpu); 4395 4383 } 4396 4384 4397 4385 static bool svm_has_wbinvd_exit(void) ··· 5091 5083 5092 5084 .apic_init_signal_blocked = svm_apic_init_signal_blocked, 5093 5085 5094 - .recalc_msr_intercepts = svm_recalc_msr_intercepts, 5086 + .recalc_intercepts = svm_recalc_intercepts, 5095 5087 
.complete_emulated_msr = svm_complete_emulated_msr, 5096 5088 5097 5089 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, ··· 5221 5213 /* CPUID 0x8000001F (SME/SEV features) */ 5222 5214 sev_set_cpu_caps(); 5223 5215 5224 - /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ 5216 + /* 5217 + * Clear capabilities that are automatically configured by common code, 5218 + * but that require explicit SVM support (that isn't yet implemented). 5219 + */ 5225 5220 kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); 5221 + kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); 5226 5222 } 5227 5223 5228 5224 static __init int svm_hardware_setup(void)
-3
arch/x86/kvm/vmx/capabilities.h
··· 20 20 #define PT_MODE_SYSTEM 0 21 21 #define PT_MODE_HOST_GUEST 1 22 22 23 - #define PMU_CAP_FW_WRITES (1ULL << 13) 24 - #define PMU_CAP_LBR_FMT 0x3f 25 - 26 23 struct nested_vmx_msrs { 27 24 /* 28 25 * We only store the "true" versions of the VMX capability MSRs. We
+7 -7
arch/x86/kvm/vmx/main.c
··· 188 188 return vmx_get_msr(vcpu, msr_info); 189 189 } 190 190 191 - static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 191 + static void vt_recalc_intercepts(struct kvm_vcpu *vcpu) 192 192 { 193 193 /* 194 - * TDX doesn't allow VMM to configure interception of MSR accesses. 195 - * TDX guest requests MSR accesses by calling TDVMCALL. The MSR 196 - * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR 197 - * if the userspace has set any. 194 + * TDX doesn't allow VMM to configure interception of instructions or 195 + * MSR accesses. TDX guest requests MSR accesses by calling TDVMCALL. 196 + * The MSR filters will be applied when handling the TDVMCALL for 197 + * RDMSR/WRMSR if the userspace has set any. 198 198 */ 199 199 if (is_td_vcpu(vcpu)) 200 200 return; 201 201 202 - vmx_recalc_msr_intercepts(vcpu); 202 + vmx_recalc_intercepts(vcpu); 203 203 } 204 204 205 205 static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) ··· 996 996 .apic_init_signal_blocked = vt_op(apic_init_signal_blocked), 997 997 .migrate_timers = vmx_migrate_timers, 998 998 999 - .recalc_msr_intercepts = vt_op(recalc_msr_intercepts), 999 + .recalc_intercepts = vt_op(recalc_intercepts), 1000 1000 .complete_emulated_msr = vt_op(complete_emulated_msr), 1001 1001 1002 1002 .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+19 -10
arch/x86/kvm/vmx/nested.c
··· 997 997 __func__, i, e.index, e.reserved); 998 998 goto fail; 999 999 } 1000 - if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { 1000 + if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { 1001 1001 pr_debug_ratelimited( 1002 1002 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1003 1003 __func__, i, e.index, e.value); ··· 1033 1033 } 1034 1034 } 1035 1035 1036 - if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { 1036 + if (kvm_emulate_msr_read(vcpu, msr_index, data)) { 1037 1037 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1038 1038 msr_index); 1039 1039 return false; ··· 2770 2770 2771 2771 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2772 2772 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2773 - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2774 - vmcs12->guest_ia32_perf_global_ctrl))) { 2773 + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2774 + vmcs12->guest_ia32_perf_global_ctrl))) { 2775 2775 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2776 2776 return -EINVAL; 2777 2777 } ··· 3690 3690 return 1; 3691 3691 } 3692 3692 3693 - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); 3693 + kvm_pmu_branch_retired(vcpu); 3694 3694 3695 3695 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3696 3696 return nested_vmx_failInvalid(vcpu); ··· 4758 4758 } 4759 4759 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4760 4760 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4761 - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4762 - vmcs12->host_ia32_perf_global_ctrl)); 4761 + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4762 + vmcs12->host_ia32_perf_global_ctrl)); 4763 4763 4764 4764 /* Set L1 segment info according to Intel SDM 4765 4765 27.5.2 Loading Host Segment and Descriptor-Table Registers */ ··· 4937 4937 goto vmabort; 4938 4938 } 4939 4939 4940 - if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) 
{ 4940 + if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { 4941 4941 pr_debug_ratelimited( 4942 4942 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4943 4943 __func__, j, h.index, h.value); ··· 6216 6216 struct vmcs12 *vmcs12, 6217 6217 union vmx_exit_reason exit_reason) 6218 6218 { 6219 - u32 msr_index = kvm_rcx_read(vcpu); 6219 + u32 msr_index; 6220 6220 gpa_t bitmap; 6221 6221 6222 6222 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6223 6223 return true; 6224 + 6225 + if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || 6226 + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6227 + msr_index = vmx_get_exit_qual(vcpu); 6228 + else 6229 + msr_index = kvm_rcx_read(vcpu); 6224 6230 6225 6231 /* 6226 6232 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, ··· 6234 6228 * First we need to figure out which of the four to use: 6235 6229 */ 6236 6230 bitmap = vmcs12->msr_bitmap; 6237 - if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6231 + if (exit_reason.basic == EXIT_REASON_MSR_WRITE || 6232 + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6238 6233 bitmap += 2048; 6239 6234 if (msr_index >= 0xc0000000) { 6240 6235 msr_index -= 0xc0000000; ··· 6534 6527 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6535 6528 case EXIT_REASON_MSR_READ: 6536 6529 case EXIT_REASON_MSR_WRITE: 6530 + case EXIT_REASON_MSR_READ_IMM: 6531 + case EXIT_REASON_MSR_WRITE_IMM: 6537 6532 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6538 6533 case EXIT_REASON_INVALID_STATE: 6539 6534 return true;
+35 -44
arch/x86/kvm/vmx/pmu_intel.c
··· 138 138 139 139 static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) 140 140 { 141 - return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0; 141 + return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; 142 142 } 143 143 144 144 static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) ··· 478 478 }; 479 479 u64 eventsel; 480 480 481 - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS); 482 - BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS); 481 + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUNTERS); 482 + BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUNTERS); 483 483 484 484 /* 485 485 * Yell if perf reports support for a fixed counter but perf doesn't ··· 536 536 kvm_pmu_cap.num_counters_gp); 537 537 eax.split.bit_width = min_t(int, eax.split.bit_width, 538 538 kvm_pmu_cap.bit_width_gp); 539 - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; 539 + pmu->counter_bitmask[KVM_PMC_GP] = BIT_ULL(eax.split.bit_width) - 1; 540 540 eax.split.mask_length = min_t(int, eax.split.mask_length, 541 541 kvm_pmu_cap.events_mask_len); 542 - pmu->available_event_types = ~entry->ebx & 543 - ((1ull << eax.split.mask_length) - 1); 542 + pmu->available_event_types = ~entry->ebx & (BIT_ULL(eax.split.mask_length) - 1); 544 543 545 - if (pmu->version == 1) { 546 - pmu->nr_arch_fixed_counters = 0; 547 - } else { 548 - pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, 549 - kvm_pmu_cap.num_counters_fixed); 550 - edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, 551 - kvm_pmu_cap.bit_width_fixed); 552 - pmu->counter_bitmask[KVM_PMC_FIXED] = 553 - ((u64)1 << edx.split.bit_width_fixed) - 1; 544 + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); 545 + if (entry && 546 + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && 547 + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { 548 + pmu->reserved_bits ^= 
HSW_IN_TX; 549 + pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); 554 550 } 551 + 552 + perf_capabilities = vcpu_get_perf_capabilities(vcpu); 553 + if (intel_pmu_lbr_is_compatible(vcpu) && 554 + (perf_capabilities & PERF_CAP_LBR_FMT)) 555 + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); 556 + else 557 + lbr_desc->records.nr = 0; 558 + 559 + if (lbr_desc->records.nr) 560 + bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); 561 + 562 + if (pmu->version == 1) 563 + return; 564 + 565 + pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, 566 + kvm_pmu_cap.num_counters_fixed); 567 + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, 568 + kvm_pmu_cap.bit_width_fixed); 569 + pmu->counter_bitmask[KVM_PMC_FIXED] = BIT_ULL(edx.split.bit_width_fixed) - 1; 555 570 556 571 intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL | 557 572 INTEL_FIXED_0_USER | 558 573 INTEL_FIXED_0_ENABLE_PMI); 559 574 560 - counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | 561 - (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); 575 + counter_rsvd = ~((BIT_ULL(pmu->nr_arch_gp_counters) - 1) | 576 + ((BIT_ULL(pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); 562 577 pmu->global_ctrl_rsvd = counter_rsvd; 563 578 564 579 /* ··· 588 573 pmu->global_status_rsvd &= 589 574 ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; 590 575 591 - entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); 592 - if (entry && 593 - (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && 594 - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { 595 - pmu->reserved_bits ^= HSW_IN_TX; 596 - pmu->raw_event_mask |= (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); 597 - } 598 - 599 - bitmap_set(pmu->all_valid_pmc_idx, 600 - 0, pmu->nr_arch_gp_counters); 601 - bitmap_set(pmu->all_valid_pmc_idx, 602 - INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); 603 - 604 - perf_capabilities = 
vcpu_get_perf_capabilities(vcpu); 605 - if (intel_pmu_lbr_is_compatible(vcpu) && 606 - (perf_capabilities & PMU_CAP_LBR_FMT)) 607 - memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); 608 - else 609 - lbr_desc->records.nr = 0; 610 - 611 - if (lbr_desc->records.nr) 612 - bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); 613 - 614 576 if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { 615 577 if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { 616 578 pmu->pebs_enable_rsvd = counter_rsvd; ··· 595 603 pmu->pebs_data_cfg_rsvd = ~0xff00000full; 596 604 intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE); 597 605 } else { 598 - pmu->pebs_enable_rsvd = 599 - ~((1ull << pmu->nr_arch_gp_counters) - 1); 606 + pmu->pebs_enable_rsvd = ~(BIT_ULL(pmu->nr_arch_gp_counters) - 1); 600 607 } 601 608 } 602 609 } ··· 616 625 pmu->gp_counters[i].current_config = 0; 617 626 } 618 627 619 - for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) { 628 + for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUNTERS; i++) { 620 629 pmu->fixed_counters[i].type = KVM_PMC_FIXED; 621 630 pmu->fixed_counters[i].vcpu = vcpu; 622 631 pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; ··· 753 762 int bit, hw_idx; 754 763 755 764 kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) { 756 - if (!pmc_speculative_in_use(pmc) || 765 + if (!pmc_is_locally_enabled(pmc) || 757 766 !pmc_is_globally_enabled(pmc) || !pmc->perf_event) 758 767 continue; 759 768
+5
arch/x86/kvm/vmx/tdx.c
··· 629 629 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); 630 630 631 631 kvm->arch.has_protected_state = true; 632 + /* 633 + * TDX Module doesn't allow the hypervisor to modify the EOI-bitmap, 634 + * i.e. all EOIs are accelerated and never trigger exits. 635 + */ 636 + kvm->arch.has_protected_eoi = true; 632 637 kvm->arch.has_private_mem = true; 633 638 kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; 634 639
+59 -32
arch/x86/kvm/vmx/vmx.c
··· 2140 2140 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2141 2141 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2142 2142 2143 - if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && 2143 + if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) && 2144 2144 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2145 2145 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2146 2146 ··· 2425 2425 vmx->pt_desc.guest.addr_a[index / 2] = data; 2426 2426 break; 2427 2427 case MSR_IA32_PERF_CAPABILITIES: 2428 - if (data & PMU_CAP_LBR_FMT) { 2429 - if ((data & PMU_CAP_LBR_FMT) != 2430 - (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) 2428 + if (data & PERF_CAP_LBR_FMT) { 2429 + if ((data & PERF_CAP_LBR_FMT) != 2430 + (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT)) 2431 2431 return 1; 2432 2432 if (!cpuid_model_is_consistent(vcpu)) 2433 2433 return 1; ··· 4081 4081 } 4082 4082 } 4083 4083 4084 - void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4084 + static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4085 4085 { 4086 4086 if (!cpu_has_vmx_msr_bitmap()) 4087 4087 return; ··· 4132 4132 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be 4133 4133 * filtered by userspace. 
4134 4134 */ 4135 + } 4136 + 4137 + void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) 4138 + { 4139 + vmx_recalc_msr_intercepts(vcpu); 4135 4140 } 4136 4141 4137 4142 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, ··· 4322 4317 return pin_based_exec_ctrl; 4323 4318 } 4324 4319 4325 - static u32 vmx_vmentry_ctrl(void) 4320 + static u32 vmx_get_initial_vmentry_ctrl(void) 4326 4321 { 4327 4322 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4328 4323 ··· 4339 4334 return vmentry_ctrl; 4340 4335 } 4341 4336 4342 - static u32 vmx_vmexit_ctrl(void) 4337 + static u32 vmx_get_initial_vmexit_ctrl(void) 4343 4338 { 4344 4339 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4345 4340 ··· 4369 4364 4370 4365 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4371 4366 4372 - if (kvm_vcpu_apicv_active(vcpu)) { 4373 - secondary_exec_controls_setbit(vmx, 4374 - SECONDARY_EXEC_APIC_REGISTER_VIRT | 4375 - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4376 - if (enable_ipiv) 4377 - tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4378 - } else { 4379 - secondary_exec_controls_clearbit(vmx, 4380 - SECONDARY_EXEC_APIC_REGISTER_VIRT | 4381 - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4382 - if (enable_ipiv) 4383 - tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4384 - } 4367 + secondary_exec_controls_changebit(vmx, 4368 + SECONDARY_EXEC_APIC_REGISTER_VIRT | 4369 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY, 4370 + kvm_vcpu_apicv_active(vcpu)); 4371 + if (enable_ipiv) 4372 + tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT, 4373 + kvm_vcpu_apicv_active(vcpu)); 4385 4374 4386 4375 vmx_update_msr_bitmap_x2apic(vcpu); 4387 4376 } ··· 4698 4699 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4699 4700 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4700 4701 4701 - vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4702 + vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl()); 4702 4703 4703 4704 /* 22.2.1, 20.8.1 */ 4704 - 
vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4705 + vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl()); 4705 4706 4706 4707 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4707 4708 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); ··· 6022 6023 return 1; 6023 6024 } 6024 6025 6026 + static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu) 6027 + { 6028 + return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO)); 6029 + } 6030 + 6031 + static int handle_rdmsr_imm(struct kvm_vcpu *vcpu) 6032 + { 6033 + return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu), 6034 + vmx_get_msr_imm_reg(vcpu)); 6035 + } 6036 + 6037 + static int handle_wrmsr_imm(struct kvm_vcpu *vcpu) 6038 + { 6039 + return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), 6040 + vmx_get_msr_imm_reg(vcpu)); 6041 + } 6042 + 6025 6043 /* 6026 6044 * The exit handlers return 1 if the exit was handled fully and guest execution 6027 6045 * may resume. Otherwise they set the kvm_run parameter to indicate what needs ··· 6097 6081 [EXIT_REASON_ENCLS] = handle_encls, 6098 6082 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6099 6083 [EXIT_REASON_NOTIFY] = handle_notify, 6084 + [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, 6085 + [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, 6100 6086 }; 6101 6087 6102 6088 static const int kvm_vmx_max_exit_handlers = ··· 6533 6515 #ifdef CONFIG_MITIGATION_RETPOLINE 6534 6516 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6535 6517 return kvm_emulate_wrmsr(vcpu); 6518 + else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6519 + return handle_wrmsr_imm(vcpu); 6536 6520 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) 6537 6521 return handle_preemption_timer(vcpu); 6538 6522 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) ··· 7210 7190 7211 7191 switch (vmx_get_exit_reason(vcpu).basic) { 7212 7192 case EXIT_REASON_MSR_WRITE: 7213 - return handle_fastpath_set_msr_irqoff(vcpu); 7193 + 
return handle_fastpath_wrmsr(vcpu); 7194 + case EXIT_REASON_MSR_WRITE_IMM: 7195 + return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu), 7196 + vmx_get_msr_imm_reg(vcpu)); 7214 7197 case EXIT_REASON_PREEMPTION_TIMER: 7215 7198 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7216 7199 case EXIT_REASON_HLT: 7217 7200 return handle_fastpath_hlt(vcpu); 7201 + case EXIT_REASON_INVD: 7202 + return handle_fastpath_invd(vcpu); 7218 7203 default: 7219 7204 return EXIT_FASTPATH_NONE; 7220 7205 } ··· 7820 7795 vmx->msr_ia32_feature_control_valid_bits &= 7821 7796 ~FEAT_CTL_SGX_LC_ENABLED; 7822 7797 7823 - /* Recalc MSR interception to account for feature changes. */ 7824 - vmx_recalc_msr_intercepts(vcpu); 7825 - 7826 7798 /* Refresh #PF interception to account for MAXPHYADDR changes. */ 7827 7799 vmx_update_exception_bitmap(vcpu); 7828 7800 } 7829 7801 7830 7802 static __init u64 vmx_get_perf_capabilities(void) 7831 7803 { 7832 - u64 perf_cap = PMU_CAP_FW_WRITES; 7804 + u64 perf_cap = PERF_CAP_FW_WRITES; 7833 7805 u64 host_perf_cap = 0; 7834 7806 7835 7807 if (!enable_pmu) ··· 7846 7824 if (!vmx_lbr_caps.has_callstack) 7847 7825 memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); 7848 7826 else if (vmx_lbr_caps.nr) 7849 - perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; 7827 + perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT; 7850 7828 } 7851 7829 7852 7830 if (vmx_pebs_supported()) { ··· 8375 8353 8376 8354 vmx_setup_user_return_msrs(); 8377 8355 8378 - if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8379 - return -EIO; 8380 8356 8381 8357 if (boot_cpu_has(X86_FEATURE_NX)) 8382 8358 kvm_enable_efer_bits(EFER_NX); ··· 8600 8580 return -EOPNOTSUPP; 8601 8581 8602 8582 /* 8603 - * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing 8604 - * to unwind if a later step fails. 8583 + * Note, VMCS and eVMCS configuration only touch VMX knobs/variables, 8584 + * i.e. there's nothing to unwind if a later step fails. 
8605 8585 */ 8606 8586 hv_init_evmcs(); 8587 + 8588 + /* 8589 + * Parse the VMCS config and VMX capabilities before anything else, so 8590 + * that the information is available to all setup flows. 8591 + */ 8592 + if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8593 + return -EIO; 8607 8594 8608 8595 r = kvm_x86_vendor_init(&vt_init_ops); 8609 8596 if (r)
+13
arch/x86/kvm/vmx/vmx.h
··· 608 608 { \ 609 609 BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ 610 610 lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ 611 + } \ 612 + static __always_inline void lname##_controls_changebit(struct vcpu_vmx *vmx, u##bits val, \ 613 + bool set) \ 614 + { \ 615 + if (set) \ 616 + lname##_controls_setbit(vmx, val); \ 617 + else \ 618 + lname##_controls_clearbit(vmx, val); \ 611 619 } 612 620 BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) 613 621 BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) ··· 713 705 } 714 706 715 707 void dump_vmcs(struct kvm_vcpu *vcpu); 708 + 709 + static inline int vmx_get_instr_info_reg(u32 vmx_instr_info) 710 + { 711 + return (vmx_instr_info >> 3) & 0xf; 712 + } 716 713 717 714 static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) 718 715 {
+1 -1
arch/x86/kvm/vmx/x86_ops.h
··· 52 52 int trig_mode, int vector); 53 53 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); 54 54 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); 55 - void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu); 55 + void vmx_recalc_intercepts(struct kvm_vcpu *vcpu); 56 56 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); 57 57 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); 58 58 int vmx_get_feature_msr(u32 msr, u64 *data);
+189 -147
arch/x86/kvm/x86.c
··· 164 164 static u32 __read_mostly tsc_tolerance_ppm = 250; 165 165 module_param(tsc_tolerance_ppm, uint, 0644); 166 166 167 - static bool __read_mostly vector_hashing = true; 168 - module_param(vector_hashing, bool, 0444); 169 - 170 167 bool __read_mostly enable_vmware_backdoor = false; 171 168 module_param(enable_vmware_backdoor, bool, 0444); 172 169 EXPORT_SYMBOL_GPL(enable_vmware_backdoor); ··· 364 367 MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 365 368 MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 366 369 MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 370 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, 367 371 }; 368 372 369 373 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + ··· 1577 1579 1578 1580 int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) 1579 1581 { 1580 - u32 ecx = kvm_rcx_read(vcpu); 1582 + u32 pmc = kvm_rcx_read(vcpu); 1581 1583 u64 data; 1582 1584 1583 - if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { 1585 + if (kvm_pmu_rdpmc(vcpu, pmc, &data)) { 1584 1586 kvm_inject_gp(vcpu, 0); 1585 1587 return 1; 1586 1588 } ··· 1903 1905 * Returns 0 on success, non-0 otherwise. 1904 1906 * Assumes vcpu_load() was already called. 
1905 1907 */ 1906 - int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1907 - bool host_initiated) 1908 + static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, 1909 + bool host_initiated) 1908 1910 { 1909 1911 struct msr_data msr; 1910 1912 int ret; ··· 1930 1932 return ret; 1931 1933 } 1932 1934 1935 + int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) 1936 + { 1937 + return __kvm_set_msr(vcpu, index, data, true); 1938 + } 1939 + 1940 + int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1941 + { 1942 + return __kvm_get_msr(vcpu, index, data, true); 1943 + } 1944 + 1933 1945 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, 1934 1946 u32 index, u64 *data, bool host_initiated) 1935 1947 { ··· 1947 1939 __kvm_get_msr); 1948 1940 } 1949 1941 1950 - int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1942 + int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1943 + { 1944 + return kvm_get_msr_ignored_check(vcpu, index, data, false); 1945 + } 1946 + EXPORT_SYMBOL_GPL(__kvm_emulate_msr_read); 1947 + 1948 + int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) 1949 + { 1950 + return kvm_set_msr_ignored_check(vcpu, index, data, false); 1951 + } 1952 + EXPORT_SYMBOL_GPL(__kvm_emulate_msr_write); 1953 + 1954 + int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1951 1955 { 1952 1956 if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 1953 1957 return KVM_MSR_RET_FILTERED; 1954 - return kvm_get_msr_ignored_check(vcpu, index, data, false); 1955 - } 1956 - EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); 1957 1958 1958 - int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) 1959 + return __kvm_emulate_msr_read(vcpu, index, data); 1960 + } 1961 + EXPORT_SYMBOL_GPL(kvm_emulate_msr_read); 1962 + 1963 + int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) 1959 1964 { 1960 1965 if (!kvm_msr_allowed(vcpu, index, 
KVM_MSR_FILTER_WRITE)) 1961 1966 return KVM_MSR_RET_FILTERED; 1962 - return kvm_set_msr_ignored_check(vcpu, index, data, false); 1963 - } 1964 - EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); 1965 1967 1966 - int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) 1967 - { 1968 - return kvm_get_msr_ignored_check(vcpu, index, data, false); 1968 + return __kvm_emulate_msr_write(vcpu, index, data); 1969 1969 } 1970 - EXPORT_SYMBOL_GPL(kvm_get_msr); 1970 + EXPORT_SYMBOL_GPL(kvm_emulate_msr_write); 1971 1971 1972 - int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) 1973 - { 1974 - return kvm_set_msr_ignored_check(vcpu, index, data, false); 1975 - } 1976 - EXPORT_SYMBOL_GPL(kvm_set_msr); 1977 1972 1978 1973 static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) 1979 1974 { ··· 2005 1994 static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) 2006 1995 { 2007 1996 complete_userspace_rdmsr(vcpu); 1997 + return complete_fast_msr_access(vcpu); 1998 + } 1999 + 2000 + static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu) 2001 + { 2002 + if (!vcpu->run->msr.error) 2003 + kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, 2004 + vcpu->run->msr.data); 2005 + 2008 2006 return complete_fast_msr_access(vcpu); 2009 2007 } 2010 2008 ··· 2051 2031 return 1; 2052 2032 } 2053 2033 2054 - int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) 2034 + static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg, 2035 + int (*complete_rdmsr)(struct kvm_vcpu *)) 2055 2036 { 2056 - u32 ecx = kvm_rcx_read(vcpu); 2057 2037 u64 data; 2058 2038 int r; 2059 2039 2060 - r = kvm_get_msr_with_filter(vcpu, ecx, &data); 2040 + r = kvm_emulate_msr_read(vcpu, msr, &data); 2061 2041 2062 2042 if (!r) { 2063 - trace_kvm_msr_read(ecx, data); 2043 + trace_kvm_msr_read(msr, data); 2064 2044 2065 - kvm_rax_write(vcpu, data & -1u); 2066 - kvm_rdx_write(vcpu, (data >> 32) & -1u); 2045 + if (reg < 0) { 2046 + kvm_rax_write(vcpu, data & -1u); 2047 + kvm_rdx_write(vcpu, (data >> 32) & -1u); 
2048 + } else { 2049 + kvm_register_write(vcpu, reg, data); 2050 + } 2067 2051 } else { 2068 2052 /* MSR read failed? See if we should ask user space */ 2069 - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0, 2070 - complete_fast_rdmsr, r)) 2053 + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0, 2054 + complete_rdmsr, r)) 2071 2055 return 0; 2072 - trace_kvm_msr_read_ex(ecx); 2056 + trace_kvm_msr_read_ex(msr); 2073 2057 } 2074 2058 2075 2059 return kvm_x86_call(complete_emulated_msr)(vcpu, r); 2076 2060 } 2061 + 2062 + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) 2063 + { 2064 + return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1, 2065 + complete_fast_rdmsr); 2066 + } 2077 2067 EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); 2078 2068 2079 - int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) 2069 + int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) 2080 2070 { 2081 - u32 ecx = kvm_rcx_read(vcpu); 2082 - u64 data = kvm_read_edx_eax(vcpu); 2071 + vcpu->arch.cui_rdmsr_imm_reg = reg; 2072 + 2073 + return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm); 2074 + } 2075 + EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr_imm); 2076 + 2077 + static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 2078 + { 2083 2079 int r; 2084 2080 2085 - r = kvm_set_msr_with_filter(vcpu, ecx, data); 2086 - 2081 + r = kvm_emulate_msr_write(vcpu, msr, data); 2087 2082 if (!r) { 2088 - trace_kvm_msr_write(ecx, data); 2083 + trace_kvm_msr_write(msr, data); 2089 2084 } else { 2090 2085 /* MSR write failed? 
See if we should ask user space */ 2091 - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data, 2086 + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data, 2092 2087 complete_fast_msr_access, r)) 2093 2088 return 0; 2094 2089 /* Signal all other negative errors to userspace */ 2095 2090 if (r < 0) 2096 2091 return r; 2097 - trace_kvm_msr_write_ex(ecx, data); 2092 + trace_kvm_msr_write_ex(msr, data); 2098 2093 } 2099 2094 2100 2095 return kvm_x86_call(complete_emulated_msr)(vcpu, r); 2101 2096 } 2097 + 2098 + int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) 2099 + { 2100 + return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu), 2101 + kvm_read_edx_eax(vcpu)); 2102 + } 2102 2103 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); 2104 + 2105 + int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) 2106 + { 2107 + return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); 2108 + } 2109 + EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr_imm); 2103 2110 2104 2111 int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) 2105 2112 { ··· 2139 2092 return kvm_emulate_as_nop(vcpu); 2140 2093 } 2141 2094 EXPORT_SYMBOL_GPL(kvm_emulate_invd); 2095 + 2096 + fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) 2097 + { 2098 + if (!kvm_emulate_invd(vcpu)) 2099 + return EXIT_FASTPATH_EXIT_USERSPACE; 2100 + 2101 + return EXIT_FASTPATH_REENTER_GUEST; 2102 + } 2103 + EXPORT_SYMBOL_GPL(handle_fastpath_invd); 2142 2104 2143 2105 int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) 2144 2106 { ··· 2196 2140 kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); 2197 2141 } 2198 2142 2199 - /* 2200 - * The fast path for frequent and performance sensitive wrmsr emulation, 2201 - * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces 2202 - * the latency of virtual IPI by avoiding the expensive bits of transitioning 2203 - * from guest to host, e.g. reacquiring KVM's SRCU lock. 
In contrast to the 2204 - * other cases which must be called after interrupts are enabled on the host. 2205 - */ 2206 - static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) 2143 + static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 2207 2144 { 2208 - if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) 2209 - return 1; 2210 - 2211 - if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && 2212 - ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && 2213 - ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && 2214 - ((u32)(data >> 32) != X2APIC_BROADCAST)) 2215 - return kvm_x2apic_icr_write(vcpu->arch.apic, data); 2216 - 2217 - return 1; 2218 - } 2219 - 2220 - static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) 2221 - { 2222 - if (!kvm_can_use_hv_timer(vcpu)) 2223 - return 1; 2224 - 2225 - kvm_set_lapic_tscdeadline_msr(vcpu, data); 2226 - return 0; 2227 - } 2228 - 2229 - fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) 2230 - { 2231 - u32 msr = kvm_rcx_read(vcpu); 2232 - u64 data; 2233 - fastpath_t ret; 2234 - bool handled; 2235 - 2236 - kvm_vcpu_srcu_read_lock(vcpu); 2237 - 2238 2145 switch (msr) { 2239 2146 case APIC_BASE_MSR + (APIC_ICR >> 4): 2240 - data = kvm_read_edx_eax(vcpu); 2241 - handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); 2147 + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || 2148 + kvm_x2apic_icr_write_fast(vcpu->arch.apic, data)) 2149 + return EXIT_FASTPATH_NONE; 2242 2150 break; 2243 2151 case MSR_IA32_TSC_DEADLINE: 2244 - data = kvm_read_edx_eax(vcpu); 2245 - handled = !handle_fastpath_set_tscdeadline(vcpu, data); 2152 + kvm_set_lapic_tscdeadline_msr(vcpu, data); 2246 2153 break; 2247 2154 default: 2248 - handled = false; 2249 - break; 2155 + return EXIT_FASTPATH_NONE; 2250 2156 } 2251 2157 2252 - if (handled) { 2253 - if (!kvm_skip_emulated_instruction(vcpu)) 2254 - ret = EXIT_FASTPATH_EXIT_USERSPACE; 2255 - else 
2256 - ret = EXIT_FASTPATH_REENTER_GUEST; 2257 - trace_kvm_msr_write(msr, data); 2258 - } else { 2259 - ret = EXIT_FASTPATH_NONE; 2260 - } 2158 + trace_kvm_msr_write(msr, data); 2261 2159 2262 - kvm_vcpu_srcu_read_unlock(vcpu); 2160 + if (!kvm_skip_emulated_instruction(vcpu)) 2161 + return EXIT_FASTPATH_EXIT_USERSPACE; 2263 2162 2264 - return ret; 2163 + return EXIT_FASTPATH_REENTER_GUEST; 2265 2164 } 2266 - EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); 2165 + 2166 + fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) 2167 + { 2168 + return __handle_fastpath_wrmsr(vcpu, kvm_rcx_read(vcpu), 2169 + kvm_read_edx_eax(vcpu)); 2170 + } 2171 + EXPORT_SYMBOL_GPL(handle_fastpath_wrmsr); 2172 + 2173 + fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) 2174 + { 2175 + return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); 2176 + } 2177 + EXPORT_SYMBOL_GPL(handle_fastpath_wrmsr_imm); 2267 2178 2268 2179 /* 2269 2180 * Adapt set_msr() to msr_io()'s calling convention ··· 6801 6778 6802 6779 kvm_free_msr_filter(old_filter); 6803 6780 6804 - kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); 6781 + /* 6782 + * Recalc MSR intercepts as userspace may want to intercept accesses to 6783 + * MSRs that KVM would otherwise pass through to the guest. 6784 + */ 6785 + kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS); 6805 6786 6806 6787 return 0; 6807 6788 } ··· 6998 6971 6999 6972 r = -EEXIST; 7000 6973 if (irqchip_in_kernel(kvm)) 6974 + goto create_irqchip_unlock; 6975 + 6976 + /* 6977 + * Disallow an in-kernel I/O APIC if the VM has protected EOIs, 6978 + * i.e. if KVM can't intercept EOIs and thus can't properly 6979 + * emulate level-triggered interrupts. 
6980 + */ 6981 + r = -ENOTTY; 6982 + if (kvm->arch.has_protected_eoi) 7001 6983 goto create_irqchip_unlock; 7002 6984 7003 6985 r = -EINVAL; ··· 7396 7360 case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: 7397 7361 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: 7398 7362 case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: 7363 + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: 7399 7364 if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) 7400 7365 return; 7401 7366 break; ··· 8397 8360 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 8398 8361 int r; 8399 8362 8400 - r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); 8363 + r = kvm_emulate_msr_read(vcpu, msr_index, pdata); 8401 8364 if (r < 0) 8402 8365 return X86EMUL_UNHANDLEABLE; 8403 8366 ··· 8420 8383 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 8421 8384 int r; 8422 8385 8423 - r = kvm_set_msr_with_filter(vcpu, msr_index, data); 8386 + r = kvm_emulate_msr_write(vcpu, msr_index, data); 8424 8387 if (r < 0) 8425 8388 return X86EMUL_UNHANDLEABLE; 8426 8389 ··· 8440 8403 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, 8441 8404 u32 msr_index, u64 *pdata) 8442 8405 { 8443 - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); 8406 + return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); 8444 8407 } 8445 8408 8446 8409 static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc) ··· 8512 8475 static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) 8513 8476 { 8514 8477 return is_smm(emul_to_vcpu(ctxt)); 8515 - } 8516 - 8517 - static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt) 8518 - { 8519 - return is_guest_mode(emul_to_vcpu(ctxt)); 8520 8478 } 8521 8479 8522 8480 #ifndef CONFIG_KVM_SMM ··· 8597 8565 .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible, 8598 8566 .set_nmi_mask = emulator_set_nmi_mask, 8599 8567 .is_smm = emulator_is_smm, 8600 - .is_guest_mode = emulator_is_guest_mode, 8601 8568 .leave_smm = emulator_leave_smm, 8602 8569 .triple_fault = 
emulator_triple_fault, 8603 8570 .set_xcr = emulator_set_xcr, ··· 8902 8871 if (unlikely(!r)) 8903 8872 return 0; 8904 8873 8905 - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); 8874 + kvm_pmu_instruction_retired(vcpu); 8906 8875 8907 8876 /* 8908 8877 * rflags is the old, "raw" value of the flags. The new value has ··· 9181 9150 ctxt->exception.address = 0; 9182 9151 } 9183 9152 9184 - r = x86_emulate_insn(ctxt); 9153 + /* 9154 + * Check L1's instruction intercepts when emulating instructions for 9155 + * L2, unless KVM is re-emulating a previously decoded instruction, 9156 + * e.g. to complete userspace I/O, in which case KVM has already 9157 + * checked the intercepts. 9158 + */ 9159 + r = x86_emulate_insn(ctxt, is_guest_mode(vcpu) && 9160 + !(emulation_type & EMULTYPE_NO_DECODE)); 9185 9161 9186 9162 if (r == EMULATION_INTERCEPTED) 9187 9163 return 1; ··· 9243 9205 */ 9244 9206 if (!ctxt->have_exception || 9245 9207 exception_type(ctxt->exception.vector) == EXCPT_TRAP) { 9246 - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); 9208 + kvm_pmu_instruction_retired(vcpu); 9247 9209 if (ctxt->is_branch) 9248 - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); 9210 + kvm_pmu_branch_retired(vcpu); 9249 9211 kvm_rip_write(vcpu, ctxt->eip); 9250 9212 if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) 9251 9213 r = kvm_vcpu_do_singlestep(vcpu); ··· 10841 10803 if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) 10842 10804 kvm_check_async_pf_completion(vcpu); 10843 10805 10844 - /* 10845 - * Recalc MSR intercepts as userspace may want to intercept 10846 - * accesses to MSRs that KVM would otherwise pass through to 10847 - * the guest. 
10848 - */ 10849 - if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) 10850 - kvm_x86_call(recalc_msr_intercepts)(vcpu); 10806 + if (kvm_check_request(KVM_REQ_RECALC_INTERCEPTS, vcpu)) 10807 + kvm_x86_call(recalc_intercepts)(vcpu); 10851 10808 10852 10809 if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) 10853 10810 kvm_x86_call(update_cpu_dirty_logging)(vcpu); ··· 11343 11310 11344 11311 fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) 11345 11312 { 11346 - int ret; 11347 - 11348 - kvm_vcpu_srcu_read_lock(vcpu); 11349 - ret = kvm_emulate_halt(vcpu); 11350 - kvm_vcpu_srcu_read_unlock(vcpu); 11351 - 11352 - if (!ret) 11313 + if (!kvm_emulate_halt(vcpu)) 11353 11314 return EXIT_FASTPATH_EXIT_USERSPACE; 11354 11315 11355 11316 if (kvm_vcpu_running(vcpu)) ··· 12422 12395 kvfree(vcpu->arch.cpuid_entries); 12423 12396 } 12424 12397 12398 + static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) 12399 + { 12400 + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; 12401 + u64 xfeatures_mask; 12402 + int i; 12403 + 12404 + /* 12405 + * Guest FPU state is zero allocated and so doesn't need to be manually 12406 + * cleared on RESET, i.e. during vCPU creation. 12407 + */ 12408 + if (!init_event || !fpstate) 12409 + return; 12410 + 12411 + /* 12412 + * On INIT, only select XSTATE components are zeroed, most components 12413 + * are unchanged. Currently, the only components that are zeroed and 12414 + * supported by KVM are MPX related. 12415 + */ 12416 + xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) & 12417 + (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); 12418 + if (!xfeatures_mask) 12419 + return; 12420 + 12421 + BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX); 12422 + 12423 + /* 12424 + * All paths that lead to INIT are required to load the guest's FPU 12425 + * state (because most paths are buried in KVM_RUN). 
12426 + */ 12427 + kvm_put_guest_fpu(vcpu); 12428 + for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX) 12429 + fpstate_clear_xstate_component(fpstate, i); 12430 + kvm_load_guest_fpu(vcpu); 12431 + } 12432 + 12425 12433 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 12426 12434 { 12427 12435 struct kvm_cpuid_entry2 *cpuid_0x1; ··· 12514 12452 kvm_async_pf_hash_reset(vcpu); 12515 12453 vcpu->arch.apf.halted = false; 12516 12454 12517 - if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { 12518 - struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; 12519 - 12520 - /* 12521 - * All paths that lead to INIT are required to load the guest's 12522 - * FPU state (because most paths are buried in KVM_RUN). 12523 - */ 12524 - if (init_event) 12525 - kvm_put_guest_fpu(vcpu); 12526 - 12527 - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); 12528 - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); 12529 - 12530 - if (init_event) 12531 - kvm_load_guest_fpu(vcpu); 12532 - } 12455 + kvm_xstate_reset(vcpu, init_event); 12533 12456 12534 12457 if (!init_event) { 12535 12458 vcpu->arch.smbase = 0x30000; ··· 12526 12479 MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; 12527 12480 12528 12481 __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); 12529 - __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); 12482 + kvm_msr_write(vcpu, MSR_IA32_XSS, 0); 12530 12483 } 12531 12484 12532 12485 /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ ··· 13572 13525 return atomic_read(&kvm->arch.noncoherent_dma_count); 13573 13526 } 13574 13527 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); 13575 - 13576 - bool kvm_vector_hashing_enabled(void) 13577 - { 13578 - return vector_hashing; 13579 - } 13580 13528 13581 13529 bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) 13582 13530 {
+3 -2
arch/x86/kvm/x86.h
··· 431 431 432 432 int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); 433 433 int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 434 - bool kvm_vector_hashing_enabled(void); 435 434 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); 436 435 int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, 437 436 void *insn, int insn_len); 438 437 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 439 438 int emulation_type, void *insn, int insn_len); 440 - fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); 439 + fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu); 440 + fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg); 441 441 fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); 442 + fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu); 442 443 443 444 extern struct kvm_caps kvm_caps; 444 445 extern struct kvm_host_values kvm_host;
+5 -3
tools/testing/selftests/kvm/x86/pmu_counters_test.c
··· 14 14 #define NUM_BRANCH_INSNS_RETIRED (NUM_LOOPS) 15 15 16 16 /* 17 - * Number of instructions in each loop. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 18 - * 1 LOOP. 17 + * Number of instructions in each loop. 1 ENTER, 1 CLFLUSH/CLFLUSHOPT/NOP, 18 + * 1 MFENCE, 1 MOV, 1 LEAVE, 1 LOOP. 19 19 */ 20 - #define NUM_INSNS_PER_LOOP 4 20 + #define NUM_INSNS_PER_LOOP 6 21 21 22 22 /* 23 23 * Number of "extra" instructions that will be counted, i.e. the number of ··· 226 226 __asm__ __volatile__("wrmsr\n\t" \ 227 227 " mov $" __stringify(NUM_LOOPS) ", %%ecx\n\t" \ 228 228 "1:\n\t" \ 229 + FEP "enter $0, $0\n\t" \ 229 230 clflush "\n\t" \ 230 231 "mfence\n\t" \ 231 232 "mov %[m], %%eax\n\t" \ 233 + FEP "leave\n\t" \ 232 234 FEP "loop 1b\n\t" \ 233 235 FEP "mov %%edi, %%ecx\n\t" \ 234 236 FEP "xor %%eax, %%eax\n\t" \