Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

+3 -3

Documentation/virt/kvm/api.rst

··· 5545 5545 The Stats Data block contains an array of 64-bit values in the same order 5546 5546 as the descriptors in Descriptors block. 5547 5547 5548 - 4.42 KVM_GET_XSAVE2 5549 - ------------------ 5548 + 4.134 KVM_GET_XSAVE2 5549 + -------------------- 5550 5550 5551 5551 :Capability: KVM_CAP_XSAVE2 5552 5552 :Architectures: x86 ··· 7363 7363 limit the attack surface on KVM's MSR emulation code. 7364 7364 7365 7365 8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID 7366 - ----------------------------- 7366 + ------------------------------------- 7367 7367 7368 7368 Architectures: x86 7369 7369

+1 -2

arch/x86/include/asm/kvm-x86-ops.h

··· 55 55 KVM_X86_OP_NULL(tlb_remote_flush_with_range) 56 56 KVM_X86_OP(tlb_flush_gva) 57 57 KVM_X86_OP(tlb_flush_guest) 58 + KVM_X86_OP(vcpu_pre_run) 58 59 KVM_X86_OP(run) 59 60 KVM_X86_OP_NULL(handle_exit) 60 61 KVM_X86_OP_NULL(skip_emulated_instruction) ··· 99 98 KVM_X86_OP_NULL(request_immediate_exit) 100 99 KVM_X86_OP(sched_in) 101 100 KVM_X86_OP_NULL(update_cpu_dirty_logging) 102 - KVM_X86_OP_NULL(pre_block) 103 - KVM_X86_OP_NULL(post_block) 104 101 KVM_X86_OP_NULL(vcpu_blocking) 105 102 KVM_X86_OP_NULL(vcpu_unblocking) 106 103 KVM_X86_OP_NULL(update_pi_irte)

+1 -12

arch/x86/include/asm/kvm_host.h

··· 1381 1381 */ 1382 1382 void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); 1383 1383 1384 + int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); 1384 1385 enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); 1385 1386 int (*handle_exit)(struct kvm_vcpu *vcpu, 1386 1387 enum exit_fastpath_completion exit_fastpath); ··· 1454 1453 /* pmu operations of sub-arch */ 1455 1454 const struct kvm_pmu_ops *pmu_ops; 1456 1455 const struct kvm_x86_nested_ops *nested_ops; 1457 - 1458 - /* 1459 - * Architecture specific hooks for vCPU blocking due to 1460 - * HLT instruction. 1461 - * Returns for .pre_block(): 1462 - * - 0 means continue to block the vCPU. 1463 - * - 1 means we cannot block the vCPU since some event 1464 - * happens during this period, such as, 'ON' bit in 1465 - * posted-interrupts descriptor is set. 1466 - */ 1467 - int (*pre_block)(struct kvm_vcpu *vcpu); 1468 - void (*post_block)(struct kvm_vcpu *vcpu); 1469 1456 1470 1457 void (*vcpu_blocking)(struct kvm_vcpu *vcpu); 1471 1458 void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);

+66 -13

arch/x86/kvm/cpuid.c

··· 119 119 return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); 120 120 } 121 121 122 + /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ 123 + static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, 124 + int nent) 125 + { 126 + struct kvm_cpuid_entry2 *orig; 127 + int i; 128 + 129 + if (nent != vcpu->arch.cpuid_nent) 130 + return -EINVAL; 131 + 132 + for (i = 0; i < nent; i++) { 133 + orig = &vcpu->arch.cpuid_entries[i]; 134 + if (e2[i].function != orig->function || 135 + e2[i].index != orig->index || 136 + e2[i].eax != orig->eax || e2[i].ebx != orig->ebx || 137 + e2[i].ecx != orig->ecx || e2[i].edx != orig->edx) 138 + return -EINVAL; 139 + } 140 + 141 + return 0; 142 + } 143 + 122 144 static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) 123 145 { 124 146 u32 function; ··· 167 145 } 168 146 } 169 147 170 - static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) 148 + static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, 149 + struct kvm_cpuid_entry2 *entries, int nent) 171 150 { 172 151 u32 base = vcpu->arch.kvm_cpuid_base; 173 152 174 153 if (!base) 175 154 return NULL; 176 155 177 - return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); 156 + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); 157 + } 158 + 159 + static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) 160 + { 161 + return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, 162 + vcpu->arch.cpuid_nent); 178 163 } 179 164 180 165 void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) ··· 196 167 vcpu->arch.pv_cpuid.features = best->eax; 197 168 } 198 169 199 - void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) 170 + static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, 171 + int nent) 200 172 { 201 173 struct kvm_cpuid_entry2 *best; 202 174 
203 - best = kvm_find_cpuid_entry(vcpu, 1, 0); 175 + best = cpuid_entry2_find(entries, nent, 1, 0); 204 176 if (best) { 205 177 /* Update OSXSAVE bit */ 206 178 if (boot_cpu_has(X86_FEATURE_XSAVE)) ··· 212 182 vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); 213 183 } 214 184 215 - best = kvm_find_cpuid_entry(vcpu, 7, 0); 185 + best = cpuid_entry2_find(entries, nent, 7, 0); 216 186 if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) 217 187 cpuid_entry_change(best, X86_FEATURE_OSPKE, 218 188 kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); 219 189 220 - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); 190 + best = cpuid_entry2_find(entries, nent, 0xD, 0); 221 191 if (best) 222 192 best->ebx = xstate_required_size(vcpu->arch.xcr0, false); 223 193 224 - best = kvm_find_cpuid_entry(vcpu, 0xD, 1); 194 + best = cpuid_entry2_find(entries, nent, 0xD, 1); 225 195 if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || 226 196 cpuid_entry_has(best, X86_FEATURE_XSAVEC))) 227 197 best->ebx = xstate_required_size(vcpu->arch.xcr0, true); 228 198 229 - best = kvm_find_kvm_cpuid_features(vcpu); 199 + best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); 230 200 if (kvm_hlt_in_guest(vcpu->kvm) && best && 231 201 (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) 232 202 best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); 233 203 234 204 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { 235 - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); 205 + best = cpuid_entry2_find(entries, nent, 0x1, 0); 236 206 if (best) 237 207 cpuid_entry_change(best, X86_FEATURE_MWAIT, 238 208 vcpu->arch.ia32_misc_enable_msr & 239 209 MSR_IA32_MISC_ENABLE_MWAIT); 240 210 } 211 + } 212 + 213 + void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) 214 + { 215 + __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); 241 216 } 242 217 EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); 243 218 ··· 333 298 { 334 299 int r; 335 300 301 + __kvm_update_cpuid_runtime(vcpu, e2, 
nent); 302 + 303 + /* 304 + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as 305 + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc. aren't 306 + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page 307 + * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with 308 + * the core vCPU model on the fly. It would've been better to forbid any 309 + * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately 310 + * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do 311 + * KVM_SET_CPUID{,2} again. To support this legacy behavior, check 312 + * whether the supplied CPUID data is equal to what's already set. 313 + */ 314 + if (vcpu->arch.last_vmentry_cpu != -1) 315 + return kvm_cpuid_check_equal(vcpu, e2, nent); 316 + 336 317 r = kvm_check_cpuid(vcpu, e2, nent); 337 318 if (r) 338 319 return r; ··· 358 307 vcpu->arch.cpuid_nent = nent; 359 308 360 309 kvm_update_kvm_cpuid_base(vcpu); 361 - kvm_update_cpuid_runtime(vcpu); 362 310 kvm_vcpu_after_set_cpuid(vcpu); 363 311 364 312 return 0; ··· 845 795 perf_get_x86_pmu_capability(&cap); 846 796 847 797 /* 848 - * Only support guest architectural pmu on a host 849 - * with architectural pmu. 798 + * The guest architectural PMU is only supported if the architectural 799 + * PMU exists on the host and the module parameters allow it. 850 800 */ 851 - if (!cap.version) 801 + if (!cap.version || !enable_pmu) 852 802 memset(&cap, 0, sizeof(cap)); 853 803 854 804 eax.split.version_id = min(cap.version, 2); ··· 936 886 --array->nent; 937 887 continue; 938 888 } 889 + 890 + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) 891 + entry->ecx &= ~BIT_ULL(2); 939 892 entry->edx = 0; 940 893 } 941 894 break;

-2

arch/x86/kvm/lapic.c

··· 1950 1950 { 1951 1951 restart_apic_timer(vcpu->arch.apic); 1952 1952 } 1953 - EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); 1954 1953 1955 1954 void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) 1956 1955 { ··· 1961 1962 start_sw_timer(apic); 1962 1963 preempt_enable(); 1963 1964 } 1964 - EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); 1965 1965 1966 1966 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) 1967 1967 {

+22 -9

arch/x86/kvm/mmu/mmu.c

··· 5756 5756 continue; 5757 5757 5758 5758 flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, 5759 + 5759 5760 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 5760 5761 start, end - 1, true, flush); 5761 5762 } ··· 5826 5825 } 5827 5826 5828 5827 /* 5829 - * We can flush all the TLBs out of the mmu lock without TLB 5830 - * corruption since we just change the spte from writable to 5831 - * readonly so that we only need to care the case of changing 5832 - * spte from present to present (changing the spte from present 5833 - * to nonpresent will flush all the TLBs immediately), in other 5834 - * words, the only case we care is mmu_spte_update() where we 5835 - * have checked Host-writable | MMU-writable instead of 5836 - * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK 5837 - * anymore. 5828 + * Flush TLBs if any SPTEs had to be write-protected to ensure that 5829 + * guest writes are reflected in the dirty bitmap before the memslot 5830 + * update completes, i.e. before enabling dirty logging is visible to 5831 + * userspace. 5832 + * 5833 + * Perform the TLB flush outside the mmu_lock to reduce the amount of 5834 + * time the lock is held. However, this does mean that another CPU can 5835 + * now grab mmu_lock and encounter a write-protected SPTE while CPUs 5836 + * still have a writable mapping for the associated GFN in their TLB. 5837 + * 5838 + * This is safe but requires KVM to be careful when making decisions 5839 + * based on the write-protection status of an SPTE. Specifically, KVM 5840 + * also write-protects SPTEs to monitor changes to guest page tables 5841 + * during shadow paging, and must guarantee no CPUs can write to those 5842 + * pages before the lock is dropped. As mentioned in the previous 5843 + * paragraph, a write-protected SPTE is no guarantee that a CPU cannot 5844 + * perform writes.
So to determine if a TLB flush is truly required, KVM 5845 + * will clear a separate software-only bit (MMU-writable) and skip the 5846 + * flush if-and-only-if this bit was already clear. 5847 + * 5848 + * See DEFAULT_SPTE_MMU_WRITEABLE for more details. 5838 5849 */ 5839 5850 if (flush) 5840 5851 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

+1

arch/x86/kvm/mmu/spte.c

··· 216 216 217 217 new_spte &= ~PT_WRITABLE_MASK; 218 218 new_spte &= ~shadow_host_writable_mask; 219 + new_spte &= ~shadow_mmu_writable_mask; 219 220 220 221 new_spte = mark_spte_for_access_track(new_spte); 221 222

+36 -6

arch/x86/kvm/mmu/spte.h

··· 60 60 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 61 61 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 62 62 63 - /* Bits 9 and 10 are ignored by all non-EPT PTEs. */ 64 - #define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) 65 - #define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) 66 - 67 63 /* 68 64 * The mask/shift to use for saving the original R/X bits when marking the PTE 69 65 * as not-present for access tracking purposes. We do not save the W bit as the ··· 73 77 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ 74 78 SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) 75 79 static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); 80 + 81 + /* 82 + * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits 83 + * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs 84 + * that map guest pages in read-only memslots and read-only VMAs. 85 + * 86 + * Invariants: 87 + * - If Host-writable is clear, PT_WRITABLE_MASK must be clear. 88 + * 89 + * 90 + * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU 91 + * allows writes to the guest page mapped by the SPTE. This bit is cleared when 92 + * the guest page mapped by the SPTE contains a page table that is being 93 + * monitored for shadow paging. In this case the SPTE can only be made writable 94 + * by unsyncing the shadow page under the mmu_lock. 95 + * 96 + * Invariants: 97 + * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear. 98 + * - If MMU-writable is set, Host-writable must be set. 99 + * 100 + * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared 101 + * to track writes for dirty logging. For such SPTEs, KVM will locklessly set 102 + * PT_WRITABLE_MASK upon the next write from the guest and record the write in 103 + * the dirty log (see fast_page_fault()). 104 + */ 105 + 106 + /* Bits 9 and 10 are ignored by all non-EPT PTEs. 
*/ 107 + #define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) 108 + #define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) 76 109 77 110 /* 78 111 * Low ignored bits are at a premium for EPT, use high ignored bits, taking care ··· 341 316 342 317 static inline bool spte_can_locklessly_be_made_writable(u64 spte) 343 318 { 344 - return (spte & shadow_host_writable_mask) && 345 - (spte & shadow_mmu_writable_mask); 319 + if (spte & shadow_mmu_writable_mask) { 320 + WARN_ON_ONCE(!(spte & shadow_host_writable_mask)); 321 + return true; 322 + } 323 + 324 + WARN_ON_ONCE(spte & PT_WRITABLE_MASK); 325 + return false; 346 326 } 347 327 348 328 static inline u64 get_mmio_spte_generation(u64 spte)

+3 -3

arch/x86/kvm/mmu/tdp_mmu.c

··· 1442 1442 !is_last_spte(iter.old_spte, iter.level)) 1443 1443 continue; 1444 1444 1445 - if (!is_writable_pte(iter.old_spte)) 1446 - break; 1447 - 1448 1445 new_spte = iter.old_spte & 1449 1446 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 1447 + 1448 + if (new_spte == iter.old_spte) 1449 + break; 1450 1450 1451 1451 tdp_mmu_set_spte(kvm, &iter, new_spte); 1452 1452 spte_set = true;

+22 -11

arch/x86/kvm/pmu.c

··· 13 13 #include <linux/types.h> 14 14 #include <linux/kvm_host.h> 15 15 #include <linux/perf_event.h> 16 + #include <linux/bsearch.h> 17 + #include <linux/sort.h> 16 18 #include <asm/perf_event.h> 17 19 #include "x86.h" 18 20 #include "cpuid.h" ··· 111 109 .config = config, 112 110 }; 113 111 112 + if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) 113 + return; 114 + 114 115 attr.sample_period = get_sample_period(pmc, pmc->counter); 115 116 116 117 if (in_tx) ··· 174 169 return true; 175 170 } 176 171 172 + static int cmp_u64(const void *a, const void *b) 173 + { 174 + return *(__u64 *)a - *(__u64 *)b; 175 + } 176 + 177 177 void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) 178 178 { 179 179 unsigned config, type = PERF_TYPE_RAW; 180 180 struct kvm *kvm = pmc->vcpu->kvm; 181 181 struct kvm_pmu_event_filter *filter; 182 - int i; 183 182 bool allow_event = true; 184 183 185 184 if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) ··· 198 189 199 190 filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); 200 191 if (filter) { 201 - for (i = 0; i < filter->nevents; i++) 202 - if (filter->events[i] == 203 - (eventsel & AMD64_RAW_EVENT_MASK_NB)) 204 - break; 205 - if (filter->action == KVM_PMU_EVENT_ALLOW && 206 - i == filter->nevents) 207 - allow_event = false; 208 - if (filter->action == KVM_PMU_EVENT_DENY && 209 - i < filter->nevents) 210 - allow_event = false; 192 + __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; 193 + 194 + if (bsearch(&key, filter->events, filter->nevents, 195 + sizeof(__u64), cmp_u64)) 196 + allow_event = filter->action == KVM_PMU_EVENT_ALLOW; 197 + else 198 + allow_event = filter->action == KVM_PMU_EVENT_DENY; 211 199 } 212 200 if (!allow_event) 213 201 return; ··· 578 572 579 573 /* Ensure nevents can't be changed between the user copies. */ 580 574 *filter = tmp; 575 + 576 + /* 577 + * Sort the in-kernel list so that we can search it with bsearch. 
578 + */ 579 + sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); 581 580 582 581 mutex_lock(&kvm->lock); 583 582 filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,

+84 -41

arch/x86/kvm/svm/avic.c

··· 295 295 struct kvm_vcpu *vcpu; 296 296 unsigned long i; 297 297 298 + /* 299 + * Wake any target vCPUs that are blocking, i.e. waiting for a wake 300 + * event. There's no need to signal doorbells, as hardware has handled 301 + * vCPUs that were in guest at the time of the IPI, and vCPUs that have 302 + * since entered the guest will have processed pending IRQs at VMRUN. 303 + */ 298 304 kvm_for_each_vcpu(i, vcpu, kvm) { 299 - bool m = kvm_apic_match_dest(vcpu, source, 300 - icrl & APIC_SHORT_MASK, 301 - GET_APIC_DEST_FIELD(icrh), 302 - icrl & APIC_DEST_MASK); 303 - 304 - if (m && !avic_vcpu_is_running(vcpu)) 305 + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, 306 + GET_APIC_DEST_FIELD(icrh), 307 + icrl & APIC_DEST_MASK)) 305 308 kvm_vcpu_wake_up(vcpu); 306 309 } 307 310 } ··· 675 672 return -1; 676 673 677 674 kvm_lapic_set_irr(vec, vcpu->arch.apic); 675 + 676 + /* 677 + * Pairs with the smp_mb_*() after setting vcpu->guest_mode in 678 + * vcpu_enter_guest() to ensure the write to the vIRR is ordered before 679 + * the read of guest_mode, which guarantees that either VMRUN will see 680 + * and process the new vIRR entry, or that the below code will signal 681 + * the doorbell if the vCPU is already running in the guest. 682 + */ 678 683 smp_mb__after_atomic(); 679 684 680 - if (avic_vcpu_is_running(vcpu)) { 685 + /* 686 + * Signal the doorbell to tell hardware to inject the IRQ if the vCPU 687 + * is in the guest. If the vCPU is not in the guest, hardware will 688 + * automatically process AVIC interrupts at VMRUN. 689 + */ 690 + if (vcpu->mode == IN_GUEST_MODE) { 681 691 int cpu = READ_ONCE(vcpu->cpu); 682 692 683 693 /* ··· 704 688 if (cpu != get_cpu()) 705 689 wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); 706 690 put_cpu(); 707 - } else 691 + } else { 692 + /* 693 + * Wake the vCPU if it was blocking. KVM will then detect the 694 + * pending IRQ when checking if the vCPU has a wake event. 
695 + */ 708 696 kvm_vcpu_wake_up(vcpu); 697 + } 709 698 710 699 return 0; 711 700 } ··· 978 957 int h_physical_id = kvm_cpu_get_apicid(cpu); 979 958 struct vcpu_svm *svm = to_svm(vcpu); 980 959 960 + lockdep_assert_preemption_disabled(); 961 + 981 962 /* 982 963 * Since the host physical APIC id is 8 bits, 983 964 * we can support host APIC ID up to 255. ··· 987 964 if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) 988 965 return; 989 966 967 + /* 968 + * No need to update anything if the vCPU is blocking, i.e. if the vCPU 969 + * is being scheduled in after being preempted. The CPU entries in the 970 + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. 971 + * If the vCPU was migrated, its new CPU value will be stuffed when the 972 + * vCPU unblocks. 973 + */ 974 + if (kvm_vcpu_is_blocking(vcpu)) 975 + return; 976 + 990 977 entry = READ_ONCE(*(svm->avic_physical_id_cache)); 991 978 WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); 992 979 993 980 entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; 994 981 entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); 995 - 996 - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 997 - if (svm->avic_is_running) 998 - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 982 + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 999 983 1000 984 WRITE_ONCE(*(svm->avic_physical_id_cache), entry); 1001 - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, 1002 - svm->avic_is_running); 985 + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); 1003 986 } 1004 987 1005 988 void avic_vcpu_put(struct kvm_vcpu *vcpu) ··· 1013 984 u64 entry; 1014 985 struct vcpu_svm *svm = to_svm(vcpu); 1015 986 987 + lockdep_assert_preemption_disabled(); 988 + 1016 989 entry = READ_ONCE(*(svm->avic_physical_id_cache)); 1017 - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) 1018 - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); 990 + 991 + /* Nothing to do if IsRunning == '0'
due to vCPU blocking. */ 992 + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) 993 + return; 994 + 995 + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); 1019 996 1020 997 entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; 1021 998 WRITE_ONCE(*(svm->avic_physical_id_cache), entry); 1022 999 } 1023 1000 1024 - /* 1025 - * This function is called during VCPU halt/unhalt. 1026 - */ 1027 - static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) 1001 + void avic_vcpu_blocking(struct kvm_vcpu *vcpu) 1028 1002 { 1029 - struct vcpu_svm *svm = to_svm(vcpu); 1030 - int cpu = get_cpu(); 1003 + if (!kvm_vcpu_apicv_active(vcpu)) 1004 + return; 1031 1005 1006 + preempt_disable(); 1007 + 1008 + /* 1009 + * Unload the AVIC when the vCPU is about to block, _before_ 1010 + * the vCPU actually blocks. 1011 + * 1012 + * Any IRQs that arrive before IsRunning=0 will not cause an 1013 + * incomplete IPI vmexit on the source, therefore vIRR will also 1014 + * be checked by kvm_vcpu_check_block() before blocking. The 1015 + * memory barrier implicit in set_current_state orders writing 1016 + * IsRunning=0 before reading the vIRR. The processor needs a 1017 + * matching memory barrier on interrupt delivery between writing 1018 + * IRR and reading IsRunning; the lack of this barrier might be 1019 + * the cause of erratum #1235.
1020 + */ 1021 + avic_vcpu_put(vcpu); 1022 + 1023 + preempt_enable(); 1024 + } 1025 + 1026 + void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) 1027 + { 1028 + int cpu; 1029 + 1030 + if (!kvm_vcpu_apicv_active(vcpu)) 1031 + return; 1032 + 1033 + cpu = get_cpu(); 1032 1034 WARN_ON(cpu != vcpu->cpu); 1033 - svm->avic_is_running = is_run; 1034 1035 1035 - if (kvm_vcpu_apicv_active(vcpu)) { 1036 - if (is_run) 1037 - avic_vcpu_load(vcpu, cpu); 1038 - else 1039 - avic_vcpu_put(vcpu); 1040 - } 1036 + avic_vcpu_load(vcpu, cpu); 1037 + 1041 1038 put_cpu(); 1042 - } 1043 - 1044 - void svm_vcpu_blocking(struct kvm_vcpu *vcpu) 1045 - { 1046 - avic_set_running(vcpu, false); 1047 - } 1048 - 1049 - void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) 1050 - { 1051 - if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) 1052 - kvm_vcpu_update_apicv(vcpu); 1053 - avic_set_running(vcpu, true); 1054 1039 }

+1 -1

arch/x86/kvm/svm/pmu.c

··· 101 101 { 102 102 struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); 103 103 104 - if (!pmu) 104 + if (!enable_pmu) 105 105 return NULL; 106 106 107 107 switch (msr) {

+245 -245

arch/x86/kvm/svm/svm.c

··· 192 192 static int lbrv = true; 193 193 module_param(lbrv, int, 0444); 194 194 195 - /* enable/disable PMU virtualization */ 196 - bool pmu = true; 197 - module_param(pmu, bool, 0444); 198 - 199 195 static int tsc_scaling = true; 200 196 module_param(tsc_scaling, int, 0444); 201 197 ··· 869 873 } 870 874 } 871 875 872 - /* 873 - * The default MMIO mask is a single bit (excluding the present bit), 874 - * which could conflict with the memory encryption bit. Check for 875 - * memory encryption support and override the default MMIO mask if 876 - * memory encryption is enabled. 877 - */ 878 - static __init void svm_adjust_mmio_mask(void) 879 - { 880 - unsigned int enc_bit, mask_bit; 881 - u64 msr, mask; 882 - 883 - /* If there is no memory encryption support, use existing mask */ 884 - if (cpuid_eax(0x80000000) < 0x8000001f) 885 - return; 886 - 887 - /* If memory encryption is not enabled, use existing mask */ 888 - rdmsrl(MSR_AMD64_SYSCFG, msr); 889 - if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) 890 - return; 891 - 892 - enc_bit = cpuid_ebx(0x8000001f) & 0x3f; 893 - mask_bit = boot_cpu_data.x86_phys_bits; 894 - 895 - /* Increment the mask bit if it is the same as the encryption bit */ 896 - if (enc_bit == mask_bit) 897 - mask_bit++; 898 - 899 - /* 900 - * If the mask bit location is below 52, then some bits above the 901 - * physical addressing limit will always be reserved, so use the 902 - * rsvd_bits() function to generate the mask. This mask, along with 903 - * the present bit, will be used to generate a page fault with 904 - * PFER.RSV = 1. 905 - * 906 - * If the mask bit location is 52 (or above), then clear the mask. 907 - */ 908 - mask = (mask_bit < 52) ? 
rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; 909 - 910 - kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); 911 - } 912 - 913 876 static void svm_hardware_teardown(void) 914 877 { 915 878 int cpu; ··· 881 926 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), 882 927 get_order(IOPM_SIZE)); 883 928 iopm_base = 0; 884 - } 885 - 886 - static __init void svm_set_cpu_caps(void) 887 - { 888 - kvm_set_cpu_caps(); 889 - 890 - supported_xss = 0; 891 - 892 - /* CPUID 0x80000001 and 0x8000000A (SVM features) */ 893 - if (nested) { 894 - kvm_cpu_cap_set(X86_FEATURE_SVM); 895 - 896 - if (nrips) 897 - kvm_cpu_cap_set(X86_FEATURE_NRIPS); 898 - 899 - if (npt_enabled) 900 - kvm_cpu_cap_set(X86_FEATURE_NPT); 901 - 902 - if (tsc_scaling) 903 - kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); 904 - 905 - /* Nested VM can receive #VMEXIT instead of triggering #GP */ 906 - kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); 907 - } 908 - 909 - /* CPUID 0x80000008 */ 910 - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || 911 - boot_cpu_has(X86_FEATURE_AMD_SSBD)) 912 - kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); 913 - 914 - /* AMD PMU PERFCTR_CORE CPUID */ 915 - if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) 916 - kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); 917 - 918 - /* CPUID 0x8000001F (SME/SEV features) */ 919 - sev_set_cpu_caps(); 920 - } 921 - 922 - static __init int svm_hardware_setup(void) 923 - { 924 - int cpu; 925 - struct page *iopm_pages; 926 - void *iopm_va; 927 - int r; 928 - unsigned int order = get_order(IOPM_SIZE); 929 - 930 - /* 931 - * NX is required for shadow paging and for NPT if the NX huge pages 932 - * mitigation is enabled. 
933 - */ 934 - if (!boot_cpu_has(X86_FEATURE_NX)) { 935 - pr_err_ratelimited("NX (Execute Disable) not supported\n"); 936 - return -EOPNOTSUPP; 937 - } 938 - kvm_enable_efer_bits(EFER_NX); 939 - 940 - iopm_pages = alloc_pages(GFP_KERNEL, order); 941 - 942 - if (!iopm_pages) 943 - return -ENOMEM; 944 - 945 - iopm_va = page_address(iopm_pages); 946 - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); 947 - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 948 - 949 - init_msrpm_offsets(); 950 - 951 - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); 952 - 953 - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 954 - kvm_enable_efer_bits(EFER_FFXSR); 955 - 956 - if (tsc_scaling) { 957 - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { 958 - tsc_scaling = false; 959 - } else { 960 - pr_info("TSC scaling supported\n"); 961 - kvm_has_tsc_control = true; 962 - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; 963 - kvm_tsc_scaling_ratio_frac_bits = 32; 964 - } 965 - } 966 - 967 - tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); 968 - 969 - /* Check for pause filtering support */ 970 - if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { 971 - pause_filter_count = 0; 972 - pause_filter_thresh = 0; 973 - } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { 974 - pause_filter_thresh = 0; 975 - } 976 - 977 - if (nested) { 978 - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 979 - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 980 - } 981 - 982 - /* 983 - * KVM's MMU doesn't support using 2-level paging for itself, and thus 984 - * NPT isn't supported if the host is using 2-level paging since host 985 - * CR4 is unchanged on VMRUN. 
986 - */ 987 - if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) 988 - npt_enabled = false; 989 - 990 - if (!boot_cpu_has(X86_FEATURE_NPT)) 991 - npt_enabled = false; 992 - 993 - /* Force VM NPT level equal to the host's paging level */ 994 - kvm_configure_mmu(npt_enabled, get_npt_level(), 995 - get_npt_level(), PG_LEVEL_1G); 996 - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); 997 - 998 - /* Note, SEV setup consumes npt_enabled. */ 999 - sev_hardware_setup(); 1000 - 1001 - svm_hv_hardware_setup(); 1002 - 1003 - svm_adjust_mmio_mask(); 1004 - 1005 - for_each_possible_cpu(cpu) { 1006 - r = svm_cpu_init(cpu); 1007 - if (r) 1008 - goto err; 1009 - } 1010 - 1011 - if (nrips) { 1012 - if (!boot_cpu_has(X86_FEATURE_NRIPS)) 1013 - nrips = false; 1014 - } 1015 - 1016 - enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); 1017 - 1018 - if (enable_apicv) { 1019 - pr_info("AVIC enabled\n"); 1020 - 1021 - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 1022 - } 1023 - 1024 - if (vls) { 1025 - if (!npt_enabled || 1026 - !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || 1027 - !IS_ENABLED(CONFIG_X86_64)) { 1028 - vls = false; 1029 - } else { 1030 - pr_info("Virtual VMLOAD VMSAVE supported\n"); 1031 - } 1032 - } 1033 - 1034 - if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) 1035 - svm_gp_erratum_intercept = false; 1036 - 1037 - if (vgif) { 1038 - if (!boot_cpu_has(X86_FEATURE_VGIF)) 1039 - vgif = false; 1040 - else 1041 - pr_info("Virtual GIF supported\n"); 1042 - } 1043 - 1044 - if (lbrv) { 1045 - if (!boot_cpu_has(X86_FEATURE_LBRV)) 1046 - lbrv = false; 1047 - else 1048 - pr_info("LBR virtualization supported\n"); 1049 - } 1050 - 1051 - if (!pmu) 1052 - pr_info("PMU virtualization is disabled\n"); 1053 - 1054 - svm_set_cpu_caps(); 1055 - 1056 - /* 1057 - * It seems that on AMD processors PTE's accessed bit is 1058 - * being set by the CPU hardware before the NPF vmexit. 
1059 - * This is not expected behaviour and our tests fail because 1060 - * of it. 1061 - * A workaround here is to disable support for 1062 - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. 1063 - * In this case userspace can know if there is support using 1064 - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle 1065 - * it 1066 - * If future AMD CPU models change the behaviour described above, 1067 - * this variable can be changed accordingly 1068 - */ 1069 - allow_smaller_maxphyaddr = !npt_enabled; 1070 - 1071 - return 0; 1072 - 1073 - err: 1074 - svm_hardware_teardown(); 1075 - return r; 1076 929 } 1077 930 1078 931 static void init_seg(struct vmcb_seg *seg) ··· 1206 1443 err = avic_init_vcpu(svm); 1207 1444 if (err) 1208 1445 goto error_free_vmsa_page; 1209 - 1210 - /* We initialize this flag to true to make sure that the is_running 1211 - * bit would be set the first time the vcpu is loaded. 1212 - */ 1213 - if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) 1214 - svm->avic_is_running = true; 1215 1446 1216 1447 svm->msrpm = svm_vcpu_alloc_msrpm(); 1217 1448 if (!svm->msrpm) { ··· 3590 3833 svm_complete_interrupts(vcpu); 3591 3834 } 3592 3835 3836 + static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) 3837 + { 3838 + return 1; 3839 + } 3840 + 3593 3841 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 3594 3842 { 3595 3843 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && ··· 4391 4629 .prepare_guest_switch = svm_prepare_guest_switch, 4392 4630 .vcpu_load = svm_vcpu_load, 4393 4631 .vcpu_put = svm_vcpu_put, 4394 - .vcpu_blocking = svm_vcpu_blocking, 4395 - .vcpu_unblocking = svm_vcpu_unblocking, 4632 + .vcpu_blocking = avic_vcpu_blocking, 4633 + .vcpu_unblocking = avic_vcpu_unblocking, 4396 4634 4397 4635 .update_exception_bitmap = svm_update_exception_bitmap, 4398 4636 .get_msr_feature = svm_get_msr_feature, ··· 4424 4662 .tlb_flush_gva = svm_flush_tlb_gva, 4425 4663 .tlb_flush_guest = 
svm_flush_tlb, 4426 4664 4665 + .vcpu_pre_run = svm_vcpu_pre_run, 4427 4666 .run = svm_vcpu_run, 4428 4667 .handle_exit = handle_exit, 4429 4668 .skip_emulated_instruction = skip_emulated_instruction, ··· 4504 4741 4505 4742 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, 4506 4743 }; 4744 + 4745 + /* 4746 + * The default MMIO mask is a single bit (excluding the present bit), 4747 + * which could conflict with the memory encryption bit. Check for 4748 + * memory encryption support and override the default MMIO mask if 4749 + * memory encryption is enabled. 4750 + */ 4751 + static __init void svm_adjust_mmio_mask(void) 4752 + { 4753 + unsigned int enc_bit, mask_bit; 4754 + u64 msr, mask; 4755 + 4756 + /* If there is no memory encryption support, use existing mask */ 4757 + if (cpuid_eax(0x80000000) < 0x8000001f) 4758 + return; 4759 + 4760 + /* If memory encryption is not enabled, use existing mask */ 4761 + rdmsrl(MSR_AMD64_SYSCFG, msr); 4762 + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) 4763 + return; 4764 + 4765 + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; 4766 + mask_bit = boot_cpu_data.x86_phys_bits; 4767 + 4768 + /* Increment the mask bit if it is the same as the encryption bit */ 4769 + if (enc_bit == mask_bit) 4770 + mask_bit++; 4771 + 4772 + /* 4773 + * If the mask bit location is below 52, then some bits above the 4774 + * physical addressing limit will always be reserved, so use the 4775 + * rsvd_bits() function to generate the mask. This mask, along with 4776 + * the present bit, will be used to generate a page fault with 4777 + * PFER.RSV = 1. 4778 + * 4779 + * If the mask bit location is 52 (or above), then clear the mask. 4780 + */ 4781 + mask = (mask_bit < 52) ? 
rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; 4782 + 4783 + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); 4784 + } 4785 + 4786 + static __init void svm_set_cpu_caps(void) 4787 + { 4788 + kvm_set_cpu_caps(); 4789 + 4790 + supported_xss = 0; 4791 + 4792 + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ 4793 + if (nested) { 4794 + kvm_cpu_cap_set(X86_FEATURE_SVM); 4795 + 4796 + if (nrips) 4797 + kvm_cpu_cap_set(X86_FEATURE_NRIPS); 4798 + 4799 + if (npt_enabled) 4800 + kvm_cpu_cap_set(X86_FEATURE_NPT); 4801 + 4802 + if (tsc_scaling) 4803 + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); 4804 + 4805 + /* Nested VM can receive #VMEXIT instead of triggering #GP */ 4806 + kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); 4807 + } 4808 + 4809 + /* CPUID 0x80000008 */ 4810 + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || 4811 + boot_cpu_has(X86_FEATURE_AMD_SSBD)) 4812 + kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); 4813 + 4814 + /* AMD PMU PERFCTR_CORE CPUID */ 4815 + if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) 4816 + kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); 4817 + 4818 + /* CPUID 0x8000001F (SME/SEV features) */ 4819 + sev_set_cpu_caps(); 4820 + } 4821 + 4822 + static __init int svm_hardware_setup(void) 4823 + { 4824 + int cpu; 4825 + struct page *iopm_pages; 4826 + void *iopm_va; 4827 + int r; 4828 + unsigned int order = get_order(IOPM_SIZE); 4829 + 4830 + /* 4831 + * NX is required for shadow paging and for NPT if the NX huge pages 4832 + * mitigation is enabled. 
4833 + */ 4834 + if (!boot_cpu_has(X86_FEATURE_NX)) { 4835 + pr_err_ratelimited("NX (Execute Disable) not supported\n"); 4836 + return -EOPNOTSUPP; 4837 + } 4838 + kvm_enable_efer_bits(EFER_NX); 4839 + 4840 + iopm_pages = alloc_pages(GFP_KERNEL, order); 4841 + 4842 + if (!iopm_pages) 4843 + return -ENOMEM; 4844 + 4845 + iopm_va = page_address(iopm_pages); 4846 + memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); 4847 + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 4848 + 4849 + init_msrpm_offsets(); 4850 + 4851 + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); 4852 + 4853 + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 4854 + kvm_enable_efer_bits(EFER_FFXSR); 4855 + 4856 + if (tsc_scaling) { 4857 + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { 4858 + tsc_scaling = false; 4859 + } else { 4860 + pr_info("TSC scaling supported\n"); 4861 + kvm_has_tsc_control = true; 4862 + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; 4863 + kvm_tsc_scaling_ratio_frac_bits = 32; 4864 + } 4865 + } 4866 + 4867 + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); 4868 + 4869 + /* Check for pause filtering support */ 4870 + if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { 4871 + pause_filter_count = 0; 4872 + pause_filter_thresh = 0; 4873 + } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { 4874 + pause_filter_thresh = 0; 4875 + } 4876 + 4877 + if (nested) { 4878 + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 4879 + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 4880 + } 4881 + 4882 + /* 4883 + * KVM's MMU doesn't support using 2-level paging for itself, and thus 4884 + * NPT isn't supported if the host is using 2-level paging since host 4885 + * CR4 is unchanged on VMRUN. 
4886 + */ 4887 + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) 4888 + npt_enabled = false; 4889 + 4890 + if (!boot_cpu_has(X86_FEATURE_NPT)) 4891 + npt_enabled = false; 4892 + 4893 + /* Force VM NPT level equal to the host's paging level */ 4894 + kvm_configure_mmu(npt_enabled, get_npt_level(), 4895 + get_npt_level(), PG_LEVEL_1G); 4896 + pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); 4897 + 4898 + /* Note, SEV setup consumes npt_enabled. */ 4899 + sev_hardware_setup(); 4900 + 4901 + svm_hv_hardware_setup(); 4902 + 4903 + svm_adjust_mmio_mask(); 4904 + 4905 + for_each_possible_cpu(cpu) { 4906 + r = svm_cpu_init(cpu); 4907 + if (r) 4908 + goto err; 4909 + } 4910 + 4911 + if (nrips) { 4912 + if (!boot_cpu_has(X86_FEATURE_NRIPS)) 4913 + nrips = false; 4914 + } 4915 + 4916 + enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); 4917 + 4918 + if (enable_apicv) { 4919 + pr_info("AVIC enabled\n"); 4920 + 4921 + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 4922 + } else { 4923 + svm_x86_ops.vcpu_blocking = NULL; 4924 + svm_x86_ops.vcpu_unblocking = NULL; 4925 + } 4926 + 4927 + if (vls) { 4928 + if (!npt_enabled || 4929 + !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || 4930 + !IS_ENABLED(CONFIG_X86_64)) { 4931 + vls = false; 4932 + } else { 4933 + pr_info("Virtual VMLOAD VMSAVE supported\n"); 4934 + } 4935 + } 4936 + 4937 + if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) 4938 + svm_gp_erratum_intercept = false; 4939 + 4940 + if (vgif) { 4941 + if (!boot_cpu_has(X86_FEATURE_VGIF)) 4942 + vgif = false; 4943 + else 4944 + pr_info("Virtual GIF supported\n"); 4945 + } 4946 + 4947 + if (lbrv) { 4948 + if (!boot_cpu_has(X86_FEATURE_LBRV)) 4949 + lbrv = false; 4950 + else 4951 + pr_info("LBR virtualization supported\n"); 4952 + } 4953 + 4954 + if (!enable_pmu) 4955 + pr_info("PMU virtualization is disabled\n"); 4956 + 4957 + svm_set_cpu_caps(); 4958 + 4959 + /* 4960 + * It seems that on AMD processors PTE's 
accessed bit is 4961 + * being set by the CPU hardware before the NPF vmexit. 4962 + * This is not expected behaviour and our tests fail because 4963 + * of it. 4964 + * A workaround here is to disable support for 4965 + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. 4966 + * In this case userspace can know if there is support using 4967 + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle 4968 + * it 4969 + * If future AMD CPU models change the behaviour described above, 4970 + * this variable can be changed accordingly 4971 + */ 4972 + allow_smaller_maxphyaddr = !npt_enabled; 4973 + 4974 + return 0; 4975 + 4976 + err: 4977 + svm_hardware_teardown(); 4978 + return r; 4979 + } 4980 + 4507 4981 4508 4982 static struct kvm_x86_init_ops svm_init_ops __initdata = { 4509 4983 .cpu_has_kvm_support = has_svm,

+2 -15

arch/x86/kvm/svm/svm.h

··· 32 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; 33 33 extern bool npt_enabled; 34 34 extern bool intercept_smi; 35 - extern bool pmu; 36 35 37 36 /* 38 37 * Clean bits in VMCB. ··· 225 226 u32 dfr_reg; 226 227 struct page *avic_backing_page; 227 228 u64 *avic_physical_id_cache; 228 - bool avic_is_running; 229 229 230 230 /* 231 231 * Per-vcpu list of struct amd_svm_iommu_ir: ··· 572 574 573 575 #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL 574 576 575 - static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) 576 - { 577 - struct vcpu_svm *svm = to_svm(vcpu); 578 - u64 *entry = svm->avic_physical_id_cache; 579 - 580 - if (!entry) 581 - return false; 582 - 583 - return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); 584 - } 585 - 586 577 int avic_ga_log_notifier(u32 ga_tag); 587 578 void avic_vm_destroy(struct kvm *kvm); 588 579 int avic_vm_init(struct kvm *kvm); ··· 592 605 bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); 593 606 int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, 594 607 uint32_t guest_irq, bool set); 595 - void svm_vcpu_blocking(struct kvm_vcpu *vcpu); 596 - void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); 608 + void avic_vcpu_blocking(struct kvm_vcpu *vcpu); 609 + void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); 597 610 598 611 /* sev.c */ 599 612

+4

arch/x86/kvm/vmx/capabilities.h

··· 5 5 #include <asm/vmx.h> 6 6 7 7 #include "lapic.h" 8 + #include "x86.h" 8 9 9 10 extern bool __read_mostly enable_vpid; 10 11 extern bool __read_mostly flexpriority_enabled; ··· 389 388 static inline u64 vmx_get_perf_capabilities(void) 390 389 { 391 390 u64 perf_cap = 0; 391 + 392 + if (!enable_pmu) 393 + return perf_cap; 392 394 393 395 if (boot_cpu_has(X86_FEATURE_PDCM)) 394 396 rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);

+13 -7

arch/x86/kvm/vmx/pmu_intel.c

··· 21 21 #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) 22 22 23 23 static struct kvm_event_hw_type_mapping intel_arch_events[] = { 24 - /* Index must match CPUID 0x0A.EBX bit vector */ 25 24 [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, 26 25 [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, 27 26 [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, ··· 28 29 [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, 29 30 [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 30 31 [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, 32 + /* The above index must match CPUID 0x0A.EBX bit vector */ 31 33 [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, 32 34 }; 33 35 ··· 75 75 u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; 76 76 int i; 77 77 78 - for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) 79 - if (intel_arch_events[i].eventsel == event_select && 80 - intel_arch_events[i].unit_mask == unit_mask && 81 - (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) 82 - break; 78 + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) { 79 + if (intel_arch_events[i].eventsel != event_select || 80 + intel_arch_events[i].unit_mask != unit_mask) 81 + continue; 82 + 83 + /* disable event that reported as not present by cpuid */ 84 + if ((i < 7) && !(pmu->available_event_types & (1 << i))) 85 + return PERF_COUNT_HW_MAX + 1; 86 + 87 + break; 88 + } 83 89 84 90 if (i == ARRAY_SIZE(intel_arch_events)) 85 91 return PERF_COUNT_HW_MAX; ··· 487 481 pmu->reserved_bits = 0xffffffff00200000ull; 488 482 489 483 entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); 490 - if (!entry) 484 + if (!entry || !enable_pmu) 491 485 return; 492 486 eax.full = entry->eax; 493 487 edx.full = entry->edx;

+83 -104

arch/x86/kvm/vmx/posted_intr.c

··· 19 19 * wake the target vCPUs. vCPUs are removed from the list and the notification 20 20 * vector is reset when the vCPU is scheduled in. 21 21 */ 22 - static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); 22 + static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); 23 23 /* 24 24 * Protect the per-CPU list with a per-CPU spinlock to handle task migration. 25 25 * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the ··· 27 27 * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will 28 28 * occur if a wakeup IRQ arrives and attempts to acquire the lock. 29 29 */ 30 - static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); 30 + static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); 31 31 32 32 static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) 33 33 { ··· 51 51 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) 52 52 { 53 53 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 54 + struct vcpu_vmx *vmx = to_vmx(vcpu); 54 55 struct pi_desc old, new; 56 + unsigned long flags; 55 57 unsigned int dest; 56 58 57 59 /* ··· 64 62 if (!enable_apicv || !lapic_in_kernel(vcpu)) 65 63 return; 66 64 67 - /* Nothing to do if PI.SN and PI.NDST both have the desired value. */ 68 - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) 69 - return; 70 - 71 65 /* 72 - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change 73 - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the 74 - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that 75 - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up 76 - * correctly. 66 + * If the vCPU wasn't on the wakeup list and wasn't migrated, then the 67 + * full update can be skipped as neither the vector nor the destination 68 + * needs to be changed. 
77 69 */ 78 - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { 79 - pi_clear_sn(pi_desc); 80 - goto after_clear_sn; 70 + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { 71 + /* 72 + * Clear SN if it was set due to being preempted. Again, do 73 + * this even if there is no assigned device for simplicity. 74 + */ 75 + if (pi_test_and_clear_sn(pi_desc)) 76 + goto after_clear_sn; 77 + return; 81 78 } 82 79 83 - /* The full case. Set the new destination and clear SN. */ 80 + local_irq_save(flags); 81 + 82 + /* 83 + * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup 84 + * list of the _previous_ pCPU, which will not be the same as the 85 + * current pCPU if the task was migrated. 86 + */ 87 + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { 88 + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); 89 + list_del(&vmx->pi_wakeup_list); 90 + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); 91 + } 92 + 84 93 dest = cpu_physical_id(cpu); 85 94 if (!x2apic_mode) 86 95 dest = (dest << 8) & 0xFF00; ··· 99 86 do { 100 87 old.control = new.control = READ_ONCE(pi_desc->control); 101 88 89 + /* 90 + * Clear SN (as above) and refresh the destination APIC ID to 91 + * handle task migration (@cpu != vcpu->cpu). 92 + */ 102 93 new.ndst = dest; 103 94 new.sn = 0; 95 + 96 + /* 97 + * Restore the notification vector; in the blocking case, the 98 + * descriptor was modified on "put" to use the wakeup vector. 
99 + */ 100 + new.nv = POSTED_INTR_VECTOR; 104 101 } while (pi_try_set_control(pi_desc, old.control, new.control)); 102 + 103 + local_irq_restore(flags); 105 104 106 105 after_clear_sn: 107 106 ··· 136 111 irq_remapping_cap(IRQ_POSTING_CAP); 137 112 } 138 113 139 - void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) 140 - { 141 - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 142 - 143 - if (!vmx_can_use_vtd_pi(vcpu->kvm)) 144 - return; 145 - 146 - /* Set SN when the vCPU is preempted */ 147 - if (vcpu->preempted) 148 - pi_set_sn(pi_desc); 149 - } 150 - 151 - static void __pi_post_block(struct kvm_vcpu *vcpu) 152 - { 153 - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 154 - struct pi_desc old, new; 155 - unsigned int dest; 156 - 157 - /* 158 - * Remove the vCPU from the wakeup list of the _previous_ pCPU, which 159 - * will not be the same as the current pCPU if the task was migrated. 160 - */ 161 - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 162 - list_del(&vcpu->blocked_vcpu_list); 163 - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 164 - 165 - dest = cpu_physical_id(vcpu->cpu); 166 - if (!x2apic_mode) 167 - dest = (dest << 8) & 0xFF00; 168 - 169 - WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR, 170 - "Wakeup handler not enabled while the vCPU was blocking"); 171 - 172 - do { 173 - old.control = new.control = READ_ONCE(pi_desc->control); 174 - 175 - new.ndst = dest; 176 - 177 - /* set 'NV' to 'notification vector' */ 178 - new.nv = POSTED_INTR_VECTOR; 179 - } while (pi_try_set_control(pi_desc, old.control, new.control)); 180 - 181 - vcpu->pre_pcpu = -1; 182 - } 183 - 184 114 /* 185 - * This routine does the following things for vCPU which is going 186 - * to be blocked if VT-d PI is enabled. 187 - * - Store the vCPU to the wakeup list, so when interrupts happen 188 - * we can find the right vCPU to wake up. 
189 - * - Change the Posted-interrupt descriptor as below: 190 - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR 191 - * - If 'ON' is set during this process, which means at least one 192 - * interrupt is posted for this vCPU, we cannot block it, in 193 - * this case, return 1, otherwise, return 0. 194 - * 115 + * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set 116 + * WAKEUP as the notification vector in the PI descriptor. 195 117 */ 196 - int pi_pre_block(struct kvm_vcpu *vcpu) 118 + static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) 197 119 { 198 - struct pi_desc old, new; 199 120 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 121 + struct vcpu_vmx *vmx = to_vmx(vcpu); 122 + struct pi_desc old, new; 200 123 unsigned long flags; 201 - 202 - if (!vmx_can_use_vtd_pi(vcpu->kvm) || 203 - vmx_interrupt_blocked(vcpu)) 204 - return 0; 205 124 206 125 local_irq_save(flags); 207 126 208 - vcpu->pre_pcpu = vcpu->cpu; 209 - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); 210 - list_add_tail(&vcpu->blocked_vcpu_list, 211 - &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); 212 - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); 127 + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); 128 + list_add_tail(&vmx->pi_wakeup_list, 129 + &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); 130 + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); 213 131 214 - WARN(pi_desc->sn == 1, 215 - "Posted Interrupt Suppress Notification set before blocking"); 132 + WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); 216 133 217 134 do { 218 135 old.control = new.control = READ_ONCE(pi_desc->control); ··· 163 196 new.nv = POSTED_INTR_WAKEUP_VECTOR; 164 197 } while (pi_try_set_control(pi_desc, old.control, new.control)); 165 198 166 - /* We should not block the vCPU if an interrupt is posted for it. 
*/ 167 - if (pi_test_on(pi_desc)) 168 - __pi_post_block(vcpu); 199 + /* 200 + * Send a wakeup IPI to this CPU if an interrupt may have been posted 201 + * before the notification vector was updated, in which case the IRQ 202 + * will arrive on the non-wakeup vector. An IPI is needed as calling 203 + * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not 204 + * enabled until it is safe to call try_to_wake_up() on the task being 205 + * scheduled out). 206 + */ 207 + if (pi_test_on(&new)) 208 + apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); 169 209 170 210 local_irq_restore(flags); 171 - return (vcpu->pre_pcpu == -1); 172 211 } 173 212 174 - void pi_post_block(struct kvm_vcpu *vcpu) 213 + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) 175 214 { 176 - unsigned long flags; 215 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 177 216 178 - if (vcpu->pre_pcpu == -1) 217 + if (!vmx_can_use_vtd_pi(vcpu->kvm)) 179 218 return; 180 219 181 - local_irq_save(flags); 182 - __pi_post_block(vcpu); 183 - local_irq_restore(flags); 220 + if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) 221 + pi_enable_wakeup_handler(vcpu); 222 + 223 + /* 224 + * Set SN when the vCPU is preempted. Note, the vCPU can both be seen 225 + * as blocking and preempted, e.g. if it's preempted between setting 226 + * its wait state and manually scheduling out. 
227 + */ 228 + if (vcpu->preempted) 229 + pi_set_sn(pi_desc); 184 230 } 185 231 186 232 /* ··· 201 221 */ 202 222 void pi_wakeup_handler(void) 203 223 { 204 - struct kvm_vcpu *vcpu; 205 224 int cpu = smp_processor_id(); 225 + struct vcpu_vmx *vmx; 206 226 207 - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 208 - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), 209 - blocked_vcpu_list) { 210 - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 227 + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); 228 + list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu), 229 + pi_wakeup_list) { 211 230 212 - if (pi_test_on(pi_desc)) 213 - kvm_vcpu_kick(vcpu); 231 + if (pi_test_on(&vmx->pi_desc)) 232 + kvm_vcpu_wake_up(&vmx->vcpu); 214 233 } 215 - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 234 + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); 216 235 } 217 236 218 237 void __init pi_init_cpu(int cpu) 219 238 { 220 - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); 221 - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 239 + INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); 240 + raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); 222 241 } 223 242 224 243 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) ··· 233 254 * Bail out of the block loop if the VM has an assigned 234 255 * device, but the blocking vCPU didn't reconfigure the 235 256 * PI.NV to the wakeup vector, i.e. the assigned device 236 - * came along after the initial check in pi_pre_block(). 257 + * came along after the initial check in vmx_vcpu_pi_put(). 237 258 */ 238 259 void vmx_pi_start_assignment(struct kvm *kvm) 239 260 {

+6 -2

arch/x86/kvm/vmx/posted_intr.h

··· 40 40 (unsigned long *)&pi_desc->control); 41 41 } 42 42 43 + static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) 44 + { 45 + return test_and_clear_bit(POSTED_INTR_SN, 46 + (unsigned long *)&pi_desc->control); 47 + } 48 + 43 49 static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) 44 50 { 45 51 return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); ··· 94 88 95 89 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); 96 90 void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); 97 - int pi_pre_block(struct kvm_vcpu *vcpu); 98 - void pi_post_block(struct kvm_vcpu *vcpu); 99 91 void pi_wakeup_handler(void); 100 92 void __init pi_init_cpu(int cpu); 101 93 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);

+33 -35

arch/x86/kvm/vmx/vmx.c

··· 3931 3931 pt_update_intercept_for_msr(vcpu); 3932 3932 } 3933 3933 3934 - static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, 3935 - bool nested) 3934 + static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, 3935 + int pi_vec) 3936 3936 { 3937 3937 #ifdef CONFIG_SMP 3938 - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; 3939 - 3940 3938 if (vcpu->mode == IN_GUEST_MODE) { 3941 3939 /* 3942 3940 * The vector of interrupt to be delivered to vcpu had ··· 3962 3964 */ 3963 3965 3964 3966 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); 3965 - return true; 3967 + return; 3966 3968 } 3967 3969 #endif 3968 - return false; 3970 + /* 3971 + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, 3972 + * otherwise do nothing as KVM will grab the highest priority pending 3973 + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). 3974 + */ 3975 + kvm_vcpu_wake_up(vcpu); 3969 3976 } 3970 3977 3971 3978 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, ··· 4000 3997 smp_mb__after_atomic(); 4001 3998 4002 3999 /* the PIR and ON have been set by L1. */ 4003 - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) 4004 - kvm_vcpu_kick(vcpu); 4000 + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4005 4001 return 0; 4006 4002 } 4007 4003 return -1; ··· 4037 4035 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a 4038 4036 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. 
4039 4037 */ 4040 - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) 4041 - kvm_vcpu_kick(vcpu); 4042 - 4038 + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); 4043 4039 return 0; 4044 4040 } 4045 4041 ··· 5426 5426 return 1; 5427 5427 } 5428 5428 5429 + static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) 5430 + { 5431 + struct vcpu_vmx *vmx = to_vmx(vcpu); 5432 + 5433 + return vmx->emulation_required && !vmx->rmode.vm86_active && 5434 + vcpu->arch.exception.pending; 5435 + } 5436 + 5429 5437 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5430 5438 { 5431 5439 struct vcpu_vmx *vmx = to_vmx(vcpu); ··· 5453 5445 if (!kvm_emulate_instruction(vcpu, 0)) 5454 5446 return 0; 5455 5447 5456 - if (vmx->emulation_required && !vmx->rmode.vm86_active && 5457 - vcpu->arch.exception.pending) { 5448 + if (vmx_emulation_required_with_pending_exception(vcpu)) { 5458 5449 kvm_prepare_emulation_failure_exit(vcpu); 5459 5450 return 0; 5460 5451 } ··· 5470 5463 */ 5471 5464 if (__xfer_to_guest_mode_work_pending()) 5472 5465 return 1; 5466 + } 5467 + 5468 + return 1; 5469 + } 5470 + 5471 + static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5472 + { 5473 + if (vmx_emulation_required_with_pending_exception(vcpu)) { 5474 + kvm_prepare_emulation_failure_exit(vcpu); 5475 + return 0; 5473 5476 } 5474 5477 5475 5478 return 1; ··· 6945 6928 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); 6946 6929 vmx = to_vmx(vcpu); 6947 6930 6931 + INIT_LIST_HEAD(&vmx->pi_wakeup_list); 6932 + 6948 6933 err = -ENOMEM; 6949 6934 6950 6935 vmx->vpid = allocate_vpid(); ··· 7568 7549 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 7569 7550 } 7570 7551 7571 - static int vmx_pre_block(struct kvm_vcpu *vcpu) 7572 - { 7573 - if (pi_pre_block(vcpu)) 7574 - return 1; 7575 - 7576 - if (kvm_lapic_hv_timer_in_use(vcpu)) 7577 - kvm_lapic_switch_to_sw_timer(vcpu); 7578 - 7579 - return 0; 7580 - } 7581 - 7582 - static void vmx_post_block(struct 
kvm_vcpu *vcpu) 7583 - { 7584 - if (kvm_x86_ops.set_hv_timer) 7585 - kvm_lapic_switch_to_hv_timer(vcpu); 7586 - 7587 - pi_post_block(vcpu); 7588 - } 7589 - 7590 7552 static void vmx_setup_mce(struct kvm_vcpu *vcpu) 7591 7553 { 7592 7554 if (vcpu->arch.mcg_cap & MCG_LMCE_P) ··· 7710 7710 .tlb_flush_gva = vmx_flush_tlb_gva, 7711 7711 .tlb_flush_guest = vmx_flush_tlb_guest, 7712 7712 7713 + .vcpu_pre_run = vmx_vcpu_pre_run, 7713 7714 .run = vmx_vcpu_run, 7714 7715 .handle_exit = vmx_handle_exit, 7715 7716 .skip_emulated_instruction = vmx_skip_emulated_instruction, ··· 7768 7767 7769 7768 .cpu_dirty_log_size = PML_ENTITY_NUM, 7770 7769 .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, 7771 - 7772 - .pre_block = vmx_pre_block, 7773 - .post_block = vmx_post_block, 7774 7770 7775 7771 .pmu_ops = &intel_pmu_ops, 7776 7772 .nested_ops = &vmx_nested_ops,

+3

arch/x86/kvm/vmx/vmx.h

··· 317 317 /* Posted interrupt descriptor */ 318 318 struct pi_desc pi_desc; 319 319 320 + /* Used if this vCPU is waiting for PI notification wakeup. */ 321 + struct list_head pi_wakeup_list; 322 + 320 323 /* Support for a guest hypervisor (nested VMX) */ 321 324 struct nested_vmx nested; 322 325

+40 -30

arch/x86/kvm/x86.c

··· 187 187 int __read_mostly pi_inject_timer = -1; 188 188 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); 189 189 190 + /* Enable/disable PMU virtualization */ 191 + bool __read_mostly enable_pmu = true; 192 + EXPORT_SYMBOL_GPL(enable_pmu); 193 + module_param(enable_pmu, bool, 0444); 194 + 190 195 /* 191 196 * Restoring the host value for MSRs that are only consumed when running in 192 197 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU ··· 5235 5230 struct kvm_cpuid __user *cpuid_arg = argp; 5236 5231 struct kvm_cpuid cpuid; 5237 5232 5238 - /* 5239 - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as 5240 - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't 5241 - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page 5242 - * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with 5243 - * the core vCPU model on the fly, so fail. 5244 - */ 5245 - r = -EINVAL; 5246 - if (vcpu->arch.last_vmentry_cpu != -1) 5247 - goto out; 5248 - 5249 5233 r = -EFAULT; 5250 5234 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) 5251 5235 goto out; ··· 5244 5250 case KVM_SET_CPUID2: { 5245 5251 struct kvm_cpuid2 __user *cpuid_arg = argp; 5246 5252 struct kvm_cpuid2 cpuid; 5247 - 5248 - /* 5249 - * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in 5250 - * KVM_SET_CPUID case above. 5251 - */ 5252 - r = -EINVAL; 5253 - if (vcpu->arch.last_vmentry_cpu != -1) 5254 - goto out; 5255 5253 5256 5254 r = -EFAULT; 5257 5255 if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) ··· 9931 9945 smp_mb__after_srcu_read_unlock(); 9932 9946 9933 9947 /* 9934 - * This handles the case where a posted interrupt was 9935 - * notified with kvm_vcpu_kick. Assigned devices can 9936 - * use the POSTED_INTR_VECTOR even if APICv is disabled, 9937 - * so do it even if APICv is disabled on this vCPU. 
9948 + * Process pending posted interrupts to handle the case where the 9949 + * notification IRQ arrived in the host, or was never sent (because the 9950 + * target vCPU wasn't running). Do this regardless of the vCPU's APICv 9951 + * status, KVM doesn't update assigned devices when APICv is inhibited, 9952 + * i.e. they can post interrupts even if APICv is temporarily disabled. 9938 9953 */ 9939 9954 if (kvm_lapic_enabled(vcpu)) 9940 9955 static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); ··· 10100 10113 10101 10114 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) 10102 10115 { 10103 - if (!kvm_arch_vcpu_runnable(vcpu) && 10104 - (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { 10116 + bool hv_timer; 10117 + 10118 + if (!kvm_arch_vcpu_runnable(vcpu)) { 10119 + /* 10120 + * Switch to the software timer before halt-polling/blocking as 10121 + * the guest's timer may be a break event for the vCPU, and the 10122 + * hypervisor timer runs only when the CPU is in guest mode. 10123 + * Switch before halt-polling so that KVM recognizes an expired 10124 + * timer before blocking. 10125 + */ 10126 + hv_timer = kvm_lapic_hv_timer_in_use(vcpu); 10127 + if (hv_timer) 10128 + kvm_lapic_switch_to_sw_timer(vcpu); 10129 + 10105 10130 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 10106 10131 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 10107 10132 kvm_vcpu_halt(vcpu); ··· 10121 10122 kvm_vcpu_block(vcpu); 10122 10123 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 10123 10124 10124 - if (kvm_x86_ops.post_block) 10125 - static_call(kvm_x86_post_block)(vcpu); 10125 + if (hv_timer) 10126 + kvm_lapic_switch_to_hv_timer(vcpu); 10126 10127 10127 10128 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) 10128 10129 return 1; ··· 10315 10316 r = -EINTR; 10316 10317 goto out; 10317 10318 } 10319 + /* 10320 + * It should be impossible for the hypervisor timer to be in 10321 + * use before KVM has ever run the vCPU. 
10322 + */ 10323 + WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); 10318 10324 kvm_vcpu_block(vcpu); 10319 10325 if (kvm_apic_accept_events(vcpu) < 0) { 10320 10326 r = 0; ··· 10364 10360 } else 10365 10361 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); 10366 10362 10367 - if (kvm_run->immediate_exit) 10363 + if (kvm_run->immediate_exit) { 10368 10364 r = -EINTR; 10369 - else 10370 - r = vcpu_run(vcpu); 10365 + goto out; 10366 + } 10367 + 10368 + r = static_call(kvm_x86_vcpu_pre_run)(vcpu); 10369 + if (r <= 0) 10370 + goto out; 10371 + 10372 + r = vcpu_run(vcpu); 10371 10373 10372 10374 out: 10373 10375 kvm_put_guest_fpu(vcpu);

+1

arch/x86/kvm/x86.h

··· 336 336 extern u64 supported_xcr0; 337 337 extern u64 host_xss; 338 338 extern u64 supported_xss; 339 + extern bool enable_pmu; 339 340 340 341 static inline bool kvm_mpx_supported(void) 341 342 {

-3

include/linux/kvm_host.h

··· 309 309 u64 requests; 310 310 unsigned long guest_debug; 311 311 312 - int pre_pcpu; 313 - struct list_head blocked_vcpu_list; 314 - 315 312 struct mutex mutex; 316 313 struct kvm_run *run; 317 314

+16 -3

tools/include/uapi/linux/kvm.h

··· 1131 1131 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 1132 1132 #define KVM_CAP_ARM_MTE 205 1133 1133 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 1134 - #define KVM_CAP_XSAVE2 207 1134 + #define KVM_CAP_VM_GPA_BITS 207 1135 + #define KVM_CAP_XSAVE2 208 1135 1136 1136 1137 #ifdef KVM_CAP_IRQ_ROUTING 1137 1138 ··· 1164 1163 __u32 sint; 1165 1164 }; 1166 1165 1166 + struct kvm_irq_routing_xen_evtchn { 1167 + __u32 port; 1168 + __u32 vcpu; 1169 + __u32 priority; 1170 + }; 1171 + 1172 + #define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1)) 1173 + 1167 1174 /* gsi routing entry types */ 1168 1175 #define KVM_IRQ_ROUTING_IRQCHIP 1 1169 1176 #define KVM_IRQ_ROUTING_MSI 2 1170 1177 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 1171 1178 #define KVM_IRQ_ROUTING_HV_SINT 4 1179 + #define KVM_IRQ_ROUTING_XEN_EVTCHN 5 1172 1180 1173 1181 struct kvm_irq_routing_entry { 1174 1182 __u32 gsi; ··· 1189 1179 struct kvm_irq_routing_msi msi; 1190 1180 struct kvm_irq_routing_s390_adapter adapter; 1191 1181 struct kvm_irq_routing_hv_sint hv_sint; 1182 + struct kvm_irq_routing_xen_evtchn xen_evtchn; 1192 1183 __u32 pad[8]; 1193 1184 } u; 1194 1185 }; ··· 1220 1209 #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) 1221 1210 #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) 1222 1211 #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) 1212 + #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) 1223 1213 1224 1214 struct kvm_xen_hvm_config { 1225 1215 __u32 flags; ··· 1564 1552 /* Available with KVM_CAP_XSAVE */ 1565 1553 #define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) 1566 1554 #define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) 1567 - /* Available with KVM_CAP_XSAVE2 */ 1568 - #define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) 1569 1555 /* Available with KVM_CAP_XCRS */ 1570 1556 #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) 1571 1557 #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) ··· 1622 1612 /* Available with KVM_CAP_S390_VCPU_RESETS */ 1623 1613 
#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) 1624 1614 #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) 1615 + 1616 + /* Available with KVM_CAP_XSAVE2 */ 1617 + #define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) 1625 1618 1626 1619 struct kvm_s390_pv_sec_parm { 1627 1620 __u64 origin;

+4 -1

tools/testing/selftests/kvm/.gitignore

··· 8 8 /s390x/memop 9 9 /s390x/resets 10 10 /s390x/sync_regs_test 11 + /x86_64/amx_test 12 + /x86_64/cpuid_test 11 13 /x86_64/cr4_cpuid_sync_test 12 14 /x86_64/debug_regs 13 15 /x86_64/evmcs_test 14 16 /x86_64/emulator_error_test 15 - /x86_64/get_cpuid_test 16 17 /x86_64/get_msr_index_features 17 18 /x86_64/kvm_clock_test 18 19 /x86_64/kvm_pv_test ··· 23 22 /x86_64/mmio_warning_test 24 23 /x86_64/mmu_role_test 25 24 /x86_64/platform_info_test 25 + /x86_64/pmu_event_filter_test 26 26 /x86_64/set_boot_cpu_id 27 27 /x86_64/set_sregs_test 28 28 /x86_64/sev_migrate_tests ··· 38 36 /x86_64/vmx_apic_access_test 39 37 /x86_64/vmx_close_while_nested_test 40 38 /x86_64/vmx_dirty_log_test 39 + /x86_64/vmx_exception_with_invalid_guest_state 41 40 /x86_64/vmx_invalid_nested_guest_state 42 41 /x86_64/vmx_preemption_timer_test 43 42 /x86_64/vmx_set_nested_state_test

+4 -2

tools/testing/selftests/kvm/Makefile

··· 43 43 LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c 44 44 LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c 45 45 46 - TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test 46 + TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test 47 + TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test 47 48 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features 48 49 TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test 49 50 TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test 50 - TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test 51 51 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock 52 52 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid 53 53 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features ··· 56 56 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test 57 57 TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test 58 58 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test 59 + TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test 59 60 TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id 60 61 TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test 61 62 TEST_GEN_PROGS_x86_64 += x86_64/smm_test ··· 70 69 TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test 71 70 TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test 72 71 TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test 72 + TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state 73 73 TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state 74 74 TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test 75 75 TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test

+25

tools/testing/selftests/kvm/include/x86_64/processor.h

··· 364 364 } 365 365 366 366 bool is_intel_cpu(void); 367 + bool is_amd_cpu(void); 368 + 369 + static inline unsigned int x86_family(unsigned int eax) 370 + { 371 + unsigned int x86; 372 + 373 + x86 = (eax >> 8) & 0xf; 374 + 375 + if (x86 == 0xf) 376 + x86 += (eax >> 20) & 0xff; 377 + 378 + return x86; 379 + } 380 + 381 + static inline unsigned int x86_model(unsigned int eax) 382 + { 383 + return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); 384 + } 367 385 368 386 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); 369 387 void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, ··· 393 375 struct kvm_cpuid2 *kvm_get_supported_cpuid(void); 394 376 395 377 struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); 378 + int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, 379 + struct kvm_cpuid2 *cpuid); 396 380 void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, 397 381 struct kvm_cpuid2 *cpuid); 398 382 ··· 438 418 void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, 439 419 uint64_t pte); 440 420 421 + /* 422 + * get_cpuid() - find matching CPUID entry and return pointer to it. 423 + */ 424 + struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function, 425 + uint32_t index); 441 426 /* 442 427 * set_cpuid() - overwrites a matching cpuid entry with the provided value. 443 428 * matches based on ent->function && ent->index. returns true

+7 -3

tools/testing/selftests/kvm/lib/kvm_util.c

··· 393 393 struct kvm_vm *vm; 394 394 int i; 395 395 396 + #ifdef __x86_64__ 396 397 /* 397 398 * Permission needs to be requested before KVM_SET_CPUID2. 398 399 */ 399 400 vm_xsave_req_perm(); 401 + #endif 400 402 401 403 /* Force slot0 memory size not smaller than DEFAULT_GUEST_PHY_PAGES */ 402 404 if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) ··· 499 497 void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, 500 498 uint64_t first_page, uint32_t num_pages) 501 499 { 502 - struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, 503 - .first_page = first_page, 504 - .num_pages = num_pages }; 500 + struct kvm_clear_dirty_log args = { 501 + .dirty_bitmap = log, .slot = slot, 502 + .first_page = first_page, 503 + .num_pages = num_pages 504 + }; 505 505 int ret; 506 506 507 507 ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);

+76 -58

tools/testing/selftests/kvm/lib/x86_64/processor.c

··· 886 886 return entry; 887 887 } 888 888 889 + 890 + int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, 891 + struct kvm_cpuid2 *cpuid) 892 + { 893 + struct vcpu *vcpu = vcpu_find(vm, vcpuid); 894 + 895 + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); 896 + 897 + return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); 898 + } 899 + 889 900 /* 890 901 * VM VCPU CPUID Set 891 902 * ··· 914 903 void vcpu_set_cpuid(struct kvm_vm *vm, 915 904 uint32_t vcpuid, struct kvm_cpuid2 *cpuid) 916 905 { 917 - struct vcpu *vcpu = vcpu_find(vm, vcpuid); 918 906 int rc; 919 907 920 - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); 921 - 922 - rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); 908 + rc = __vcpu_set_cpuid(vm, vcpuid, cpuid); 923 909 TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", 924 910 rc, errno); 925 911 ··· 1144 1136 list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); 1145 1137 list->nmsrs = nmsrs; 1146 1138 r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); 1147 - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", 1148 - r); 1139 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", 1140 + r); 1149 1141 1150 1142 state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); 1151 1143 r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); 1152 - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", 1153 - r); 1144 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", 1145 + r); 1154 1146 1155 1147 r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); 1156 - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", 1157 - r); 1148 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", 1149 + r); 1158 1150 1159 1151 r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); 1160 - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", 1161 - r); 1152 + TEST_ASSERT(r == 0, 
"Unexpected result from KVM_GET_REGS, r: %i", 1153 + r); 1162 1154 1163 1155 r = vcpu_save_xsave_state(vm, vcpu, state); 1164 - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", 1165 - r); 1156 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", 1157 + r); 1166 1158 1167 1159 if (kvm_check_cap(KVM_CAP_XCRS)) { 1168 1160 r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); ··· 1171 1163 } 1172 1164 1173 1165 r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); 1174 - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", 1175 - r); 1166 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", 1167 + r); 1176 1168 1177 1169 if (nested_size) { 1178 1170 state->nested.size = sizeof(state->nested_); 1179 1171 r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); 1180 1172 TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", 1181 - r); 1173 + r); 1182 1174 TEST_ASSERT(state->nested.size <= nested_size, 1183 - "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", 1184 - state->nested.size, nested_size); 1175 + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", 1176 + state->nested.size, nested_size); 1185 1177 } else 1186 1178 state->nested.size = 0; 1187 1179 ··· 1189 1181 for (i = 0; i < nmsrs; i++) 1190 1182 state->msrs.entries[i].index = list->indices[i]; 1191 1183 r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); 1192 - TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", 1193 - r, r == nmsrs ? -1 : list->indices[r]); 1184 + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", 1185 + r, r == nmsrs ? 
-1 : list->indices[r]); 1194 1186 1195 1187 r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); 1196 - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", 1197 - r); 1188 + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", 1189 + r); 1198 1190 1199 1191 free(list); 1200 1192 return state; ··· 1207 1199 1208 1200 r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); 1209 1201 TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", 1210 - r); 1202 + r); 1211 1203 1212 1204 r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); 1213 1205 TEST_ASSERT(r == state->msrs.nmsrs, ··· 1222 1214 1223 1215 r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); 1224 1216 TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", 1225 - r); 1217 + r); 1226 1218 1227 1219 r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); 1228 - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", 1229 - r); 1220 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", 1221 + r); 1230 1222 1231 1223 r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); 1232 - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", 1233 - r); 1224 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", 1225 + r); 1234 1226 1235 1227 r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); 1236 - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", 1237 - r); 1228 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", 1229 + r); 1238 1230 1239 1231 r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); 1240 - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", 1241 - r); 1232 + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", 1233 + r); 1242 1234 1243 1235 if (state->nested.size) { 1244 1236 r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); 1245 1237 TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: 
%i", 1246 - r); 1238 + r); 1247 1239 } 1248 1240 } 1249 1241 ··· 1253 1245 free(state); 1254 1246 } 1255 1247 1256 - bool is_intel_cpu(void) 1248 + static bool cpu_vendor_string_is(const char *vendor) 1257 1249 { 1250 + const uint32_t *chunk = (const uint32_t *)vendor; 1258 1251 int eax, ebx, ecx, edx; 1259 - const uint32_t *chunk; 1260 1252 const int leaf = 0; 1261 1253 1262 1254 __asm__ __volatile__( ··· 1265 1257 "=c"(ecx), "=d"(edx) 1266 1258 : /* input */ "0"(leaf), "2"(0)); 1267 1259 1268 - chunk = (const uint32_t *)("GenuineIntel"); 1269 1260 return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); 1261 + } 1262 + 1263 + bool is_intel_cpu(void) 1264 + { 1265 + return cpu_vendor_string_is("GenuineIntel"); 1266 + } 1267 + 1268 + /* 1269 + * Exclude early K5 samples with a vendor string of "AMDisbetter!" 1270 + */ 1271 + bool is_amd_cpu(void) 1272 + { 1273 + return cpu_vendor_string_is("AuthenticAMD"); 1270 1274 } 1271 1275 1272 1276 uint32_t kvm_get_cpuid_max_basic(void) ··· 1404 1384 } 1405 1385 } 1406 1386 1387 + struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function, 1388 + uint32_t index) 1389 + { 1390 + int i; 1391 + 1392 + for (i = 0; i < cpuid->nent; i++) { 1393 + struct kvm_cpuid_entry2 *cur = &cpuid->entries[i]; 1394 + 1395 + if (cur->function == function && cur->index == index) 1396 + return cur; 1397 + } 1398 + 1399 + TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index); 1400 + 1401 + return NULL; 1402 + } 1403 + 1407 1404 bool set_cpuid(struct kvm_cpuid2 *cpuid, 1408 1405 struct kvm_cpuid_entry2 *ent) 1409 1406 { ··· 1516 1479 return cpuid; 1517 1480 } 1518 1481 1519 - #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 1520 - #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 1521 - #define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 1522 - 1523 - static inline unsigned x86_family(unsigned int eax) 1524 - { 1525 - unsigned int x86; 1526 - 1527 - x86 = (eax >> 8) & 0xf; 1528 - 1529 - 
if (x86 == 0xf) 1530 - x86 += (eax >> 20) & 0xff; 1531 - 1532 - return x86; 1533 - } 1534 - 1535 1482 unsigned long vm_compute_max_gfn(struct kvm_vm *vm) 1536 1483 { 1537 1484 const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ ··· 1525 1504 max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; 1526 1505 1527 1506 /* Avoid reserved HyperTransport region on AMD processors. */ 1528 - eax = ecx = 0; 1529 - cpuid(&eax, &ebx, &ecx, &edx); 1530 - if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx || 1531 - ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx || 1532 - edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) 1507 + if (!is_amd_cpu()) 1533 1508 return max_gfn; 1534 1509 1535 1510 /* On parts with <40 physical address bits, the area is fully hidden */ ··· 1535 1518 /* Before family 17h, the HyperTransport area is just below 1T. */ 1536 1519 ht_gfn = (1 << 28) - num_ht_pages; 1537 1520 eax = 1; 1521 + ecx = 0; 1538 1522 cpuid(&eax, &ebx, &ecx, &edx); 1539 1523 if (x86_family(eax) < 0x17) 1540 1524 goto done;

+30

tools/testing/selftests/kvm/x86_64/get_cpuid_test.c tools/testing/selftests/kvm/x86_64/cpuid_test.c

··· 154 154 return guest_cpuids; 155 155 } 156 156 157 + static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid) 158 + { 159 + struct kvm_cpuid_entry2 *ent; 160 + int rc; 161 + u32 eax, ebx, x; 162 + 163 + /* Setting unmodified CPUID is allowed */ 164 + rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); 165 + TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); 166 + 167 + /* Changing CPU features is forbidden */ 168 + ent = get_cpuid(cpuid, 0x7, 0); 169 + ebx = ent->ebx; 170 + ent->ebx--; 171 + rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); 172 + TEST_ASSERT(rc, "Changing CPU features should fail"); 173 + ent->ebx = ebx; 174 + 175 + /* Changing MAXPHYADDR is forbidden */ 176 + ent = get_cpuid(cpuid, 0x80000008, 0); 177 + eax = ent->eax; 178 + x = eax & 0xff; 179 + ent->eax = (eax & ~0xffu) | (x - 1); 180 + rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); 181 + TEST_ASSERT(rc, "Changing MAXPHYADDR should fail"); 182 + ent->eax = eax; 183 + } 184 + 157 185 int main(void) 158 186 { 159 187 struct kvm_cpuid2 *supp_cpuid, *cpuid2; ··· 202 174 203 175 for (stage = 0; stage < 3; stage++) 204 176 run_vcpu(vm, VCPU_ID, stage); 177 + 178 + set_cpuid_after_run(vm, cpuid2); 205 179 206 180 kvm_vm_free(vm); 207 181 }

+434

tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Test for x86 KVM_SET_PMU_EVENT_FILTER. 4 + * 5 + * Copyright (C) 2022, Google LLC. 6 + * 7 + * This work is licensed under the terms of the GNU GPL, version 2. 8 + * 9 + * Verifies the expected behavior of allow lists and deny lists for 10 + * virtual PMU events. 11 + */ 12 + 13 + #define _GNU_SOURCE /* for program_invocation_short_name */ 14 + #include "test_util.h" 15 + #include "kvm_util.h" 16 + #include "processor.h" 17 + 18 + /* 19 + * In lieu of copying perf_event.h into tools... 20 + */ 21 + #define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) 22 + #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) 23 + 24 + union cpuid10_eax { 25 + struct { 26 + unsigned int version_id:8; 27 + unsigned int num_counters:8; 28 + unsigned int bit_width:8; 29 + unsigned int mask_length:8; 30 + } split; 31 + unsigned int full; 32 + }; 33 + 34 + union cpuid10_ebx { 35 + struct { 36 + unsigned int no_unhalted_core_cycles:1; 37 + unsigned int no_instructions_retired:1; 38 + unsigned int no_unhalted_reference_cycles:1; 39 + unsigned int no_llc_reference:1; 40 + unsigned int no_llc_misses:1; 41 + unsigned int no_branch_instruction_retired:1; 42 + unsigned int no_branch_misses_retired:1; 43 + } split; 44 + unsigned int full; 45 + }; 46 + 47 + /* End of stuff taken from perf_event.h. */ 48 + 49 + /* Oddly, this isn't in perf_event.h. */ 50 + #define ARCH_PERFMON_BRANCHES_RETIRED 5 51 + 52 + #define VCPU_ID 0 53 + #define NUM_BRANCHES 42 54 + 55 + /* 56 + * This is how the event selector and unit mask are stored in an AMD 57 + * core performance event-select register. Intel's format is similar, 58 + * but the event selector is only 8 bits. 59 + */ 60 + #define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \ 61 + (umask & 0xff) << 8) 62 + 63 + /* 64 + * "Branch instructions retired", from the Intel SDM, volume 3, 65 + * "Pre-defined Architectural Performance Events." 
66 + */ 67 + 68 + #define INTEL_BR_RETIRED EVENT(0xc4, 0) 69 + 70 + /* 71 + * "Retired branch instructions", from Processor Programming Reference 72 + * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors, 73 + * Preliminary Processor Programming Reference (PPR) for AMD Family 74 + * 17h Model 31h, Revision B0 Processors, and Preliminary Processor 75 + * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision 76 + * B1 Processors Volume 1 of 2. 77 + */ 78 + 79 + #define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0) 80 + 81 + /* 82 + * This event list comprises Intel's eight architectural events plus 83 + * AMD's "retired branch instructions" for Zen[123] (and possibly 84 + * other AMD CPUs). 85 + */ 86 + static const uint64_t event_list[] = { 87 + EVENT(0x3c, 0), 88 + EVENT(0xc0, 0), 89 + EVENT(0x3c, 1), 90 + EVENT(0x2e, 0x4f), 91 + EVENT(0x2e, 0x41), 92 + EVENT(0xc4, 0), 93 + EVENT(0xc5, 0), 94 + EVENT(0xa4, 1), 95 + AMD_ZEN_BR_RETIRED, 96 + }; 97 + 98 + /* 99 + * If we encounter a #GP during the guest PMU sanity check, then the guest 100 + * PMU is not functional. Inform the hypervisor via GUEST_SYNC(0). 101 + */ 102 + static void guest_gp_handler(struct ex_regs *regs) 103 + { 104 + GUEST_SYNC(0); 105 + } 106 + 107 + /* 108 + * Check that we can write a new value to the given MSR and read it back. 109 + * The caller should provide a non-empty set of bits that are safe to flip. 110 + * 111 + * Return on success. GUEST_SYNC(0) on error. 
112 + */ 113 + static void check_msr(uint32_t msr, uint64_t bits_to_flip) 114 + { 115 + uint64_t v = rdmsr(msr) ^ bits_to_flip; 116 + 117 + wrmsr(msr, v); 118 + if (rdmsr(msr) != v) 119 + GUEST_SYNC(0); 120 + 121 + v ^= bits_to_flip; 122 + wrmsr(msr, v); 123 + if (rdmsr(msr) != v) 124 + GUEST_SYNC(0); 125 + } 126 + 127 + static void intel_guest_code(void) 128 + { 129 + check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1); 130 + check_msr(MSR_P6_EVNTSEL0, 0xffff); 131 + check_msr(MSR_IA32_PMC0, 0xffff); 132 + GUEST_SYNC(1); 133 + 134 + for (;;) { 135 + uint64_t br0, br1; 136 + 137 + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); 138 + wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | 139 + ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED); 140 + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1); 141 + br0 = rdmsr(MSR_IA32_PMC0); 142 + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); 143 + br1 = rdmsr(MSR_IA32_PMC0); 144 + GUEST_SYNC(br1 - br0); 145 + } 146 + } 147 + 148 + /* 149 + * To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23], 150 + * this code uses the always-available, legacy K7 PMU MSRs, which alias to 151 + * the first four of the six extended core PMU MSRs. 152 + */ 153 + static void amd_guest_code(void) 154 + { 155 + check_msr(MSR_K7_EVNTSEL0, 0xffff); 156 + check_msr(MSR_K7_PERFCTR0, 0xffff); 157 + GUEST_SYNC(1); 158 + 159 + for (;;) { 160 + uint64_t br0, br1; 161 + 162 + wrmsr(MSR_K7_EVNTSEL0, 0); 163 + wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | 164 + ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED); 165 + br0 = rdmsr(MSR_K7_PERFCTR0); 166 + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); 167 + br1 = rdmsr(MSR_K7_PERFCTR0); 168 + GUEST_SYNC(br1 - br0); 169 + } 170 + } 171 + 172 + /* 173 + * Run the VM to the next GUEST_SYNC(value), and return the value passed 174 + * to the sync. Any other exit from the guest is fatal. 
175 + */ 176 + static uint64_t run_vm_to_sync(struct kvm_vm *vm) 177 + { 178 + struct kvm_run *run = vcpu_state(vm, VCPU_ID); 179 + struct ucall uc; 180 + 181 + vcpu_run(vm, VCPU_ID); 182 + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 183 + "Exit_reason other than KVM_EXIT_IO: %u (%s)\n", 184 + run->exit_reason, 185 + exit_reason_str(run->exit_reason)); 186 + get_ucall(vm, VCPU_ID, &uc); 187 + TEST_ASSERT(uc.cmd == UCALL_SYNC, 188 + "Received ucall other than UCALL_SYNC: %lu", uc.cmd); 189 + return uc.args[1]; 190 + } 191 + 192 + /* 193 + * In a nested environment or if the vPMU is disabled, the guest PMU 194 + * might not work as architected (accessing the PMU MSRs may raise 195 + * #GP, or writes could simply be discarded). In those situations, 196 + * there is no point in running these tests. The guest code will perform 197 + * a sanity check and then GUEST_SYNC(success). In the case of failure, 198 + * the behavior of the guest on resumption is undefined. 199 + */ 200 + static bool sanity_check_pmu(struct kvm_vm *vm) 201 + { 202 + bool success; 203 + 204 + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); 205 + success = run_vm_to_sync(vm); 206 + vm_install_exception_handler(vm, GP_VECTOR, NULL); 207 + 208 + return success; 209 + } 210 + 211 + static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents) 212 + { 213 + struct kvm_pmu_event_filter *f; 214 + int size = sizeof(*f) + nevents * sizeof(f->events[0]); 215 + 216 + f = malloc(size); 217 + TEST_ASSERT(f, "Out of memory"); 218 + memset(f, 0, size); 219 + f->nevents = nevents; 220 + return f; 221 + } 222 + 223 + static struct kvm_pmu_event_filter *event_filter(uint32_t action) 224 + { 225 + struct kvm_pmu_event_filter *f; 226 + int i; 227 + 228 + f = make_pmu_event_filter(ARRAY_SIZE(event_list)); 229 + f->action = action; 230 + for (i = 0; i < ARRAY_SIZE(event_list); i++) 231 + f->events[i] = event_list[i]; 232 + 233 + return f; 234 + } 235 + 236 + /* 237 + * Remove the first 
occurrence of 'event' (if any) from the filter's 238 + * event list. 239 + */ 240 + static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f, 241 + uint64_t event) 242 + { 243 + bool found = false; 244 + int i; 245 + 246 + for (i = 0; i < f->nevents; i++) { 247 + if (found) 248 + f->events[i - 1] = f->events[i]; 249 + else 250 + found = f->events[i] == event; 251 + } 252 + if (found) 253 + f->nevents--; 254 + return f; 255 + } 256 + 257 + static void test_without_filter(struct kvm_vm *vm) 258 + { 259 + uint64_t count = run_vm_to_sync(vm); 260 + 261 + if (count != NUM_BRANCHES) 262 + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", 263 + __func__, count, NUM_BRANCHES); 264 + TEST_ASSERT(count, "Allowed PMU event is not counting"); 265 + } 266 + 267 + static uint64_t test_with_filter(struct kvm_vm *vm, 268 + struct kvm_pmu_event_filter *f) 269 + { 270 + vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f); 271 + return run_vm_to_sync(vm); 272 + } 273 + 274 + static void test_member_deny_list(struct kvm_vm *vm) 275 + { 276 + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY); 277 + uint64_t count = test_with_filter(vm, f); 278 + 279 + free(f); 280 + if (count) 281 + pr_info("%s: Branch instructions retired = %lu (expected 0)\n", 282 + __func__, count); 283 + TEST_ASSERT(!count, "Disallowed PMU Event is counting"); 284 + } 285 + 286 + static void test_member_allow_list(struct kvm_vm *vm) 287 + { 288 + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW); 289 + uint64_t count = test_with_filter(vm, f); 290 + 291 + free(f); 292 + if (count != NUM_BRANCHES) 293 + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", 294 + __func__, count, NUM_BRANCHES); 295 + TEST_ASSERT(count, "Allowed PMU event is not counting"); 296 + } 297 + 298 + static void test_not_member_deny_list(struct kvm_vm *vm) 299 + { 300 + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY); 301 + uint64_t count; 302 + 
303 + remove_event(f, INTEL_BR_RETIRED); 304 + remove_event(f, AMD_ZEN_BR_RETIRED); 305 + count = test_with_filter(vm, f); 306 + free(f); 307 + if (count != NUM_BRANCHES) 308 + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", 309 + __func__, count, NUM_BRANCHES); 310 + TEST_ASSERT(count, "Allowed PMU event is not counting"); 311 + } 312 + 313 + static void test_not_member_allow_list(struct kvm_vm *vm) 314 + { 315 + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW); 316 + uint64_t count; 317 + 318 + remove_event(f, INTEL_BR_RETIRED); 319 + remove_event(f, AMD_ZEN_BR_RETIRED); 320 + count = test_with_filter(vm, f); 321 + free(f); 322 + if (count) 323 + pr_info("%s: Branch instructions retired = %lu (expected 0)\n", 324 + __func__, count); 325 + TEST_ASSERT(!count, "Disallowed PMU Event is counting"); 326 + } 327 + 328 + /* 329 + * Check for a non-zero PMU version, at least one general-purpose 330 + * counter per logical processor, an EBX bit vector of length greater 331 + * than 5, and EBX[5] clear. 332 + */ 333 + static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry) 334 + { 335 + union cpuid10_eax eax = { .full = entry->eax }; 336 + union cpuid10_ebx ebx = { .full = entry->ebx }; 337 + 338 + return eax.split.version_id && eax.split.num_counters > 0 && 339 + eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED && 340 + !ebx.split.no_branch_instruction_retired; 341 + } 342 + 343 + /* 344 + * Note that CPUID leaf 0xa is Intel-specific. This leaf should be 345 + * clear on AMD hardware. 
346 + */ 347 + static bool use_intel_pmu(void) 348 + { 349 + struct kvm_cpuid_entry2 *entry; 350 + 351 + entry = kvm_get_supported_cpuid_index(0xa, 0); 352 + return is_intel_cpu() && entry && check_intel_pmu_leaf(entry); 353 + } 354 + 355 + static bool is_zen1(uint32_t eax) 356 + { 357 + return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f; 358 + } 359 + 360 + static bool is_zen2(uint32_t eax) 361 + { 362 + return x86_family(eax) == 0x17 && 363 + x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f; 364 + } 365 + 366 + static bool is_zen3(uint32_t eax) 367 + { 368 + return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f; 369 + } 370 + 371 + /* 372 + * Determining AMD support for a PMU event requires consulting the AMD 373 + * PPR for the CPU or reference material derived therefrom. The AMD 374 + * test code herein has been verified to work on Zen1, Zen2, and Zen3. 375 + * 376 + * Feel free to add more AMD CPUs that are documented to support event 377 + * select 0xc2 umask 0 as "retired branch instructions." 
378 + */ 379 + static bool use_amd_pmu(void) 380 + { 381 + struct kvm_cpuid_entry2 *entry; 382 + 383 + entry = kvm_get_supported_cpuid_index(1, 0); 384 + return is_amd_cpu() && entry && 385 + (is_zen1(entry->eax) || 386 + is_zen2(entry->eax) || 387 + is_zen3(entry->eax)); 388 + } 389 + 390 + int main(int argc, char *argv[]) 391 + { 392 + void (*guest_code)(void) = NULL; 393 + struct kvm_vm *vm; 394 + int r; 395 + 396 + /* Tell stdout not to buffer its content */ 397 + setbuf(stdout, NULL); 398 + 399 + r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER); 400 + if (!r) { 401 + print_skip("KVM_CAP_PMU_EVENT_FILTER not supported"); 402 + exit(KSFT_SKIP); 403 + } 404 + 405 + if (use_intel_pmu()) 406 + guest_code = intel_guest_code; 407 + else if (use_amd_pmu()) 408 + guest_code = amd_guest_code; 409 + 410 + if (!guest_code) { 411 + print_skip("Don't know how to test this guest PMU"); 412 + exit(KSFT_SKIP); 413 + } 414 + 415 + vm = vm_create_default(VCPU_ID, 0, guest_code); 416 + 417 + vm_init_descriptor_tables(vm); 418 + vcpu_init_descriptor_tables(vm, VCPU_ID); 419 + 420 + if (!sanity_check_pmu(vm)) { 421 + print_skip("Guest PMU is not functional"); 422 + exit(KSFT_SKIP); 423 + } 424 + 425 + test_without_filter(vm); 426 + test_member_deny_list(vm); 427 + test_member_allow_list(vm); 428 + test_not_member_deny_list(vm); 429 + test_not_member_allow_list(vm); 430 + 431 + kvm_vm_free(vm); 432 + 433 + return 0; 434 + }

+2 -2

tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c

··· 77 77 switch (get_ucall(vm, vcpuid, &uc)) { 78 78 case UCALL_SYNC: 79 79 TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && 80 - uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", 81 - stage + 1, (ulong)uc.args[1]); 80 + uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", 81 + stage + 1, (ulong)uc.args[1]); 82 82 return; 83 83 case UCALL_DONE: 84 84 return;

+2 -2

tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c

··· 30 30 static void l2_guest_code(void) 31 31 { 32 32 /* Exit to L0 */ 33 - asm volatile("inb %%dx, %%al" 34 - : : [port] "d" (PORT_L0_EXIT) : "rax"); 33 + asm volatile("inb %%dx, %%al" 34 + : : [port] "d" (PORT_L0_EXIT) : "rax"); 35 35 } 36 36 37 37 static void l1_guest_code(struct vmx_pages *vmx_pages)

+139

tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include "test_util.h" 3 + #include "kvm_util.h" 4 + #include "processor.h" 5 + 6 + #include <signal.h> 7 + #include <string.h> 8 + #include <sys/ioctl.h> 9 + #include <sys/time.h> 10 + 11 + #include "kselftest.h" 12 + 13 + #define VCPU_ID 0 14 + 15 + static struct kvm_vm *vm; 16 + 17 + static void guest_ud_handler(struct ex_regs *regs) 18 + { 19 + /* Loop on the ud2 until guest state is made invalid. */ 20 + } 21 + 22 + static void guest_code(void) 23 + { 24 + asm volatile("ud2"); 25 + } 26 + 27 + static void __run_vcpu_with_invalid_state(void) 28 + { 29 + struct kvm_run *run = vcpu_state(vm, VCPU_ID); 30 + 31 + vcpu_run(vm, VCPU_ID); 32 + 33 + TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, 34 + "Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n", 35 + run->exit_reason, exit_reason_str(run->exit_reason)); 36 + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, 37 + "Expected emulation failure, got %d\n", 38 + run->emulation_failure.suberror); 39 + } 40 + 41 + static void run_vcpu_with_invalid_state(void) 42 + { 43 + /* 44 + * Always run twice to verify KVM handles the case where _KVM_ queues 45 + * an exception with invalid state and then exits to userspace, i.e. 46 + * that KVM doesn't explode if userspace ignores the initial error. 
47 + */ 48 + __run_vcpu_with_invalid_state(); 49 + __run_vcpu_with_invalid_state(); 50 + } 51 + 52 + static void set_timer(void) 53 + { 54 + struct itimerval timer; 55 + 56 + timer.it_value.tv_sec = 0; 57 + timer.it_value.tv_usec = 200; 58 + timer.it_interval = timer.it_value; 59 + ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0); 60 + } 61 + 62 + static void set_or_clear_invalid_guest_state(bool set) 63 + { 64 + static struct kvm_sregs sregs; 65 + 66 + if (!sregs.cr0) 67 + vcpu_sregs_get(vm, VCPU_ID, &sregs); 68 + sregs.tr.unusable = !!set; 69 + vcpu_sregs_set(vm, VCPU_ID, &sregs); 70 + } 71 + 72 + static void set_invalid_guest_state(void) 73 + { 74 + set_or_clear_invalid_guest_state(true); 75 + } 76 + 77 + static void clear_invalid_guest_state(void) 78 + { 79 + set_or_clear_invalid_guest_state(false); 80 + } 81 + 82 + static void sigalrm_handler(int sig) 83 + { 84 + struct kvm_vcpu_events events; 85 + 86 + TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig); 87 + 88 + vcpu_events_get(vm, VCPU_ID, &events); 89 + 90 + /* 91 + * If an exception is pending, attempt KVM_RUN with invalid guest, 92 + * otherwise rearm the timer and keep doing so until the timer fires 93 + * between KVM queueing an exception and re-entering the guest. 94 + */ 95 + if (events.exception.pending) { 96 + set_invalid_guest_state(); 97 + run_vcpu_with_invalid_state(); 98 + } else { 99 + set_timer(); 100 + } 101 + } 102 + 103 + int main(int argc, char *argv[]) 104 + { 105 + if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) { 106 + print_skip("Must be run with kvm_intel.unrestricted_guest=0"); 107 + exit(KSFT_SKIP); 108 + } 109 + 110 + vm = vm_create_default(VCPU_ID, 0, (void *)guest_code); 111 + 112 + vm_init_descriptor_tables(vm); 113 + vcpu_init_descriptor_tables(vm, VCPU_ID); 114 + 115 + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); 116 + 117 + /* 118 + * Stuff invalid guest state for L2 by making TR unusable. 
The next 119 + * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support 120 + * emulating invalid guest state for L2. 121 + */ 122 + set_invalid_guest_state(); 123 + run_vcpu_with_invalid_state(); 124 + 125 + /* 126 + * Verify KVM also handles the case where userspace gains control while 127 + * an exception is pending and stuffs invalid state. Run with valid 128 + * guest state and a timer firing every 200us, and attempt to enter the 129 + * guest with invalid state when the handler interrupts KVM with an 130 + * exception pending. 131 + */ 132 + clear_invalid_guest_state(); 133 + TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR, 134 + "Failed to register SIGALRM handler, errno = %d (%s)", 135 + errno, strerror(errno)); 136 + 137 + set_timer(); 138 + run_vcpu_with_invalid_state(); 139 + }

+17 -17

tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c

··· 46 46 #define MIN_STEAL_TIME 50000 47 47 48 48 struct pvclock_vcpu_time_info { 49 - u32 version; 50 - u32 pad0; 51 - u64 tsc_timestamp; 52 - u64 system_time; 53 - u32 tsc_to_system_mul; 54 - s8 tsc_shift; 55 - u8 flags; 56 - u8 pad[2]; 49 + u32 version; 50 + u32 pad0; 51 + u64 tsc_timestamp; 52 + u64 system_time; 53 + u32 tsc_to_system_mul; 54 + s8 tsc_shift; 55 + u8 flags; 56 + u8 pad[2]; 57 57 } __attribute__((__packed__)); /* 32 bytes */ 58 58 59 59 struct pvclock_wall_clock { 60 - u32 version; 61 - u32 sec; 62 - u32 nsec; 60 + u32 version; 61 + u32 sec; 62 + u32 nsec; 63 63 } __attribute__((__packed__)); 64 64 65 65 struct vcpu_runstate_info { ··· 74 74 }; 75 75 76 76 struct vcpu_info { 77 - uint8_t evtchn_upcall_pending; 78 - uint8_t evtchn_upcall_mask; 79 - unsigned long evtchn_pending_sel; 80 - struct arch_vcpu_info arch; 81 - struct pvclock_vcpu_time_info time; 77 + uint8_t evtchn_upcall_pending; 78 + uint8_t evtchn_upcall_mask; 79 + unsigned long evtchn_pending_sel; 80 + struct arch_vcpu_info arch; 81 + struct pvclock_vcpu_time_info time; 82 82 }; /* 64 bytes (x86) */ 83 83 84 84 struct shared_info { ··· 493 493 494 494 vm_ts.tv_sec = wc->sec; 495 495 vm_ts.tv_nsec = wc->nsec; 496 - TEST_ASSERT(wc->version && !(wc->version & 1), 496 + TEST_ASSERT(wc->version && !(wc->version & 1), 497 497 "Bad wallclock version %x", wc->version); 498 498 TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); 499 499 TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");

+2 -3

virt/kvm/kvm_main.c

··· 427 427 #endif 428 428 kvm_async_pf_vcpu_init(vcpu); 429 429 430 - vcpu->pre_pcpu = -1; 431 - INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 432 - 433 430 kvm_vcpu_set_in_spin_loop(vcpu, false); 434 431 kvm_vcpu_set_dy_eligible(vcpu, false); 435 432 vcpu->preempted = false; ··· 3160 3163 { 3161 3164 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 3162 3165 3166 + #ifdef CONFIG_HAVE_KVM_DIRTY_RING 3163 3167 if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm)) 3164 3168 return; 3169 + #endif 3165 3170 3166 3171 if (memslot && kvm_slot_dirty_track_enabled(memslot)) { 3167 3172 unsigned long rel_gfn = gfn - memslot->base_gfn;