Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'kvm-x86-pmu-6.8' of https://github.com/kvm-x86/linux into HEAD

KVM x86 PMU changes for 6.8:

- Fix a variety of bugs where KVM fails to stop/reset counters and other state
prior to refreshing the vPMU model.

- Fix a double-overflow PMU bug by tracking emulated counter events using a
dedicated field instead of snapshotting the "previous" counter. If the
hardware PMC count triggers overflow that is recognized in the same VM-Exit
that KVM manually bumps an event count, KVM would pend PMIs for both the
hardware-triggered overflow and for KVM-triggered overflow.

+137 -109
+1 -1
arch/x86/include/asm/kvm-x86-pmu-ops.h
··· 22 22 KVM_X86_PMU_OP(set_msr) 23 23 KVM_X86_PMU_OP(refresh) 24 24 KVM_X86_PMU_OP(init) 25 - KVM_X86_PMU_OP(reset) 25 + KVM_X86_PMU_OP_OPTIONAL(reset) 26 26 KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) 27 27 KVM_X86_PMU_OP_OPTIONAL(cleanup) 28 28
+16 -1
arch/x86/include/asm/kvm_host.h
··· 500 500 u8 idx; 501 501 bool is_paused; 502 502 bool intr; 503 + /* 504 + * Base value of the PMC counter, relative to the *consumed* count in 505 + * the associated perf_event. This value includes counter updates from 506 + * the perf_event and emulated_count since the last time the counter 507 + * was reprogrammed, but it is *not* the current value as seen by the 508 + * guest or userspace. 509 + * 510 + * The count is relative to the associated perf_event so that KVM 511 + * doesn't need to reprogram the perf_event every time the guest writes 512 + * to the counter. 513 + */ 503 514 u64 counter; 504 - u64 prev_counter; 515 + /* 516 + * PMC events triggered by KVM emulation that haven't been fully 517 + * processed, i.e. haven't undergone overflow detection. 518 + */ 519 + u64 emulated_counter; 505 520 u64 eventsel; 506 521 struct perf_event *perf_event; 507 522 struct kvm_vcpu *vcpu;
+117 -23
arch/x86/kvm/pmu.c
··· 127 127 struct kvm_pmc *pmc = perf_event->overflow_handler_context; 128 128 129 129 /* 130 - * Ignore overflow events for counters that are scheduled to be 131 - * reprogrammed, e.g. if a PMI for the previous event races with KVM's 132 - * handling of a related guest WRMSR. 130 + * Ignore asynchronous overflow events for counters that are scheduled 131 + * to be reprogrammed, e.g. if a PMI for the previous event races with 132 + * KVM's handling of a related guest WRMSR. 133 133 */ 134 134 if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi)) 135 135 return; ··· 159 159 * comes from the host counters or the guest. 160 160 */ 161 161 return 1; 162 + } 163 + 164 + static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) 165 + { 166 + u64 sample_period = (-counter_value) & pmc_bitmask(pmc); 167 + 168 + if (!sample_period) 169 + sample_period = pmc_bitmask(pmc) + 1; 170 + return sample_period; 162 171 } 163 172 164 173 static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, ··· 224 215 return 0; 225 216 } 226 217 227 - static void pmc_pause_counter(struct kvm_pmc *pmc) 218 + static bool pmc_pause_counter(struct kvm_pmc *pmc) 228 219 { 229 220 u64 counter = pmc->counter; 230 - 231 - if (!pmc->perf_event || pmc->is_paused) 232 - return; 221 + u64 prev_counter; 233 222 234 223 /* update counter, reset event value to avoid redundant accumulation */ 235 - counter += perf_event_pause(pmc->perf_event, true); 224 + if (pmc->perf_event && !pmc->is_paused) 225 + counter += perf_event_pause(pmc->perf_event, true); 226 + 227 + /* 228 + * Snapshot the previous counter *after* accumulating state from perf. 229 + * If overflow already happened, hardware (via perf) is responsible for 230 + * generating a PMI. KVM just needs to detect overflow on emulated 231 + * counter events that haven't yet been processed. 
232 + */ 233 + prev_counter = counter & pmc_bitmask(pmc); 234 + 235 + counter += pmc->emulated_counter; 236 236 pmc->counter = counter & pmc_bitmask(pmc); 237 + 238 + pmc->emulated_counter = 0; 237 239 pmc->is_paused = true; 240 + 241 + return pmc->counter < prev_counter; 238 242 } 239 243 240 244 static bool pmc_resume_counter(struct kvm_pmc *pmc) ··· 271 249 272 250 return true; 273 251 } 252 + 253 + static void pmc_release_perf_event(struct kvm_pmc *pmc) 254 + { 255 + if (pmc->perf_event) { 256 + perf_event_release_kernel(pmc->perf_event); 257 + pmc->perf_event = NULL; 258 + pmc->current_config = 0; 259 + pmc_to_pmu(pmc)->event_count--; 260 + } 261 + } 262 + 263 + static void pmc_stop_counter(struct kvm_pmc *pmc) 264 + { 265 + if (pmc->perf_event) { 266 + pmc->counter = pmc_read_counter(pmc); 267 + pmc_release_perf_event(pmc); 268 + } 269 + } 270 + 271 + static void pmc_update_sample_period(struct kvm_pmc *pmc) 272 + { 273 + if (!pmc->perf_event || pmc->is_paused || 274 + !is_sampling_event(pmc->perf_event)) 275 + return; 276 + 277 + perf_event_period(pmc->perf_event, 278 + get_sample_period(pmc, pmc->counter)); 279 + } 280 + 281 + void pmc_write_counter(struct kvm_pmc *pmc, u64 val) 282 + { 283 + /* 284 + * Drop any unconsumed accumulated counts, the WRMSR is a write, not a 285 + * read-modify-write. Adjust the counter value so that its value is 286 + * relative to the current count, as reading the current count from 287 + * perf is faster than pausing and repgrogramming the event in order to 288 + * reset it to '0'. Note, this very sneakily offsets the accumulated 289 + * emulated count too, by using pmc_read_counter()! 
290 + */ 291 + pmc->emulated_counter = 0; 292 + pmc->counter += val - pmc_read_counter(pmc); 293 + pmc->counter &= pmc_bitmask(pmc); 294 + pmc_update_sample_period(pmc); 295 + } 296 + EXPORT_SYMBOL_GPL(pmc_write_counter); 274 297 275 298 static int filter_cmp(const void *pa, const void *pb, u64 mask) 276 299 { ··· 450 383 struct kvm_pmu *pmu = pmc_to_pmu(pmc); 451 384 u64 eventsel = pmc->eventsel; 452 385 u64 new_config = eventsel; 386 + bool emulate_overflow; 453 387 u8 fixed_ctr_ctrl; 454 388 455 - pmc_pause_counter(pmc); 389 + emulate_overflow = pmc_pause_counter(pmc); 456 390 457 391 if (!pmc_event_is_allowed(pmc)) 458 392 goto reprogram_complete; 459 393 460 - if (pmc->counter < pmc->prev_counter) 394 + if (emulate_overflow) 461 395 __kvm_perf_overflow(pmc, false); 462 396 463 397 if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) ··· 498 430 499 431 reprogram_complete: 500 432 clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); 501 - pmc->prev_counter = 0; 502 433 } 503 434 504 435 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) ··· 706 639 return 0; 707 640 } 708 641 709 - /* refresh PMU settings. This function generally is called when underlying 710 - * settings are changed (such as changes of PMU CPUID by guest VMs), which 711 - * should rarely happen. 
642 + static void kvm_pmu_reset(struct kvm_vcpu *vcpu) 643 + { 644 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 645 + struct kvm_pmc *pmc; 646 + int i; 647 + 648 + pmu->need_cleanup = false; 649 + 650 + bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX); 651 + 652 + for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { 653 + pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); 654 + if (!pmc) 655 + continue; 656 + 657 + pmc_stop_counter(pmc); 658 + pmc->counter = 0; 659 + pmc->emulated_counter = 0; 660 + 661 + if (pmc_is_gp(pmc)) 662 + pmc->eventsel = 0; 663 + } 664 + 665 + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; 666 + 667 + static_call_cond(kvm_x86_pmu_reset)(vcpu); 668 + } 669 + 670 + 671 + /* 672 + * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID 673 + * and/or PERF_CAPABILITIES. 712 674 */ 713 675 void kvm_pmu_refresh(struct kvm_vcpu *vcpu) 714 676 { 715 677 if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) 716 678 return; 717 679 680 + /* 681 + * Stop/release all existing counters/events before realizing the new 682 + * vPMU model. 683 + */ 684 + kvm_pmu_reset(vcpu); 685 + 718 686 bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); 719 687 static_call(kvm_x86_pmu_refresh)(vcpu); 720 - } 721 - 722 - void kvm_pmu_reset(struct kvm_vcpu *vcpu) 723 - { 724 - static_call(kvm_x86_pmu_reset)(vcpu); 725 688 } 726 689 727 690 void kvm_pmu_init(struct kvm_vcpu *vcpu) ··· 760 663 761 664 memset(pmu, 0, sizeof(*pmu)); 762 665 static_call(kvm_x86_pmu_init)(vcpu); 763 - pmu->event_count = 0; 764 - pmu->need_cleanup = false; 765 666 kvm_pmu_refresh(vcpu); 766 667 } 767 668 ··· 795 700 796 701 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) 797 702 { 798 - pmc->prev_counter = pmc->counter; 799 - pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); 703 + pmc->emulated_counter++; 800 704 kvm_pmu_request_counter_reprogram(pmc); 801 705 } 802 706
+3 -44
arch/x86/kvm/pmu.h
··· 66 66 { 67 67 u64 counter, enabled, running; 68 68 69 - counter = pmc->counter; 69 + counter = pmc->counter + pmc->emulated_counter; 70 + 70 71 if (pmc->perf_event && !pmc->is_paused) 71 72 counter += perf_event_read_value(pmc->perf_event, 72 73 &enabled, &running); ··· 75 74 return counter & pmc_bitmask(pmc); 76 75 } 77 76 78 - static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) 79 - { 80 - pmc->counter += val - pmc_read_counter(pmc); 81 - pmc->counter &= pmc_bitmask(pmc); 82 - } 83 - 84 - static inline void pmc_release_perf_event(struct kvm_pmc *pmc) 85 - { 86 - if (pmc->perf_event) { 87 - perf_event_release_kernel(pmc->perf_event); 88 - pmc->perf_event = NULL; 89 - pmc->current_config = 0; 90 - pmc_to_pmu(pmc)->event_count--; 91 - } 92 - } 93 - 94 - static inline void pmc_stop_counter(struct kvm_pmc *pmc) 95 - { 96 - if (pmc->perf_event) { 97 - pmc->counter = pmc_read_counter(pmc); 98 - pmc_release_perf_event(pmc); 99 - } 100 - } 77 + void pmc_write_counter(struct kvm_pmc *pmc, u64 val); 101 78 102 79 static inline bool pmc_is_gp(struct kvm_pmc *pmc) 103 80 { ··· 123 144 } 124 145 125 146 return NULL; 126 - } 127 - 128 - static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) 129 - { 130 - u64 sample_period = (-counter_value) & pmc_bitmask(pmc); 131 - 132 - if (!sample_period) 133 - sample_period = pmc_bitmask(pmc) + 1; 134 - return sample_period; 135 - } 136 - 137 - static inline void pmc_update_sample_period(struct kvm_pmc *pmc) 138 - { 139 - if (!pmc->perf_event || pmc->is_paused || 140 - !is_sampling_event(pmc->perf_event)) 141 - return; 142 - 143 - perf_event_period(pmc->perf_event, 144 - get_sample_period(pmc, pmc->counter)); 145 147 } 146 148 147 149 static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) ··· 221 261 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); 222 262 int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); 223 263 void kvm_pmu_refresh(struct kvm_vcpu 
*vcpu); 224 - void kvm_pmu_reset(struct kvm_vcpu *vcpu); 225 264 void kvm_pmu_init(struct kvm_vcpu *vcpu); 226 265 void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); 227 266 void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
-17
arch/x86/kvm/svm/pmu.c
··· 161 161 pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); 162 162 if (pmc) { 163 163 pmc_write_counter(pmc, data); 164 - pmc_update_sample_period(pmc); 165 164 return 0; 166 165 } 167 166 /* MSR_EVNTSELn */ ··· 232 233 } 233 234 } 234 235 235 - static void amd_pmu_reset(struct kvm_vcpu *vcpu) 236 - { 237 - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 238 - int i; 239 - 240 - for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) { 241 - struct kvm_pmc *pmc = &pmu->gp_counters[i]; 242 - 243 - pmc_stop_counter(pmc); 244 - pmc->counter = pmc->prev_counter = pmc->eventsel = 0; 245 - } 246 - 247 - pmu->global_ctrl = pmu->global_status = 0; 248 - } 249 - 250 236 struct kvm_pmu_ops amd_pmu_ops __initdata = { 251 237 .hw_event_available = amd_hw_event_available, 252 238 .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, ··· 243 259 .set_msr = amd_pmu_set_msr, 244 260 .refresh = amd_pmu_refresh, 245 261 .init = amd_pmu_init, 246 - .reset = amd_pmu_reset, 247 262 .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, 248 263 .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, 249 264 .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
-22
arch/x86/kvm/vmx/pmu_intel.c
··· 437 437 !(msr & MSR_PMC_FULL_WIDTH_BIT)) 438 438 data = (s64)(s32)data; 439 439 pmc_write_counter(pmc, data); 440 - pmc_update_sample_period(pmc); 441 440 break; 442 441 } else if ((pmc = get_fixed_pmc(pmu, msr))) { 443 442 pmc_write_counter(pmc, data); 444 - pmc_update_sample_period(pmc); 445 443 break; 446 444 } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { 447 445 reserved_bits = pmu->reserved_bits; ··· 630 632 631 633 static void intel_pmu_reset(struct kvm_vcpu *vcpu) 632 634 { 633 - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 634 - struct kvm_pmc *pmc = NULL; 635 - int i; 636 - 637 - for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { 638 - pmc = &pmu->gp_counters[i]; 639 - 640 - pmc_stop_counter(pmc); 641 - pmc->counter = pmc->prev_counter = pmc->eventsel = 0; 642 - } 643 - 644 - for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { 645 - pmc = &pmu->fixed_counters[i]; 646 - 647 - pmc_stop_counter(pmc); 648 - pmc->counter = pmc->prev_counter = 0; 649 - } 650 - 651 - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; 652 - 653 635 intel_pmu_release_guest_lbr_event(vcpu); 654 636 } 655 637
-1
arch/x86/kvm/x86.c
··· 12252 12252 } 12253 12253 12254 12254 if (!init_event) { 12255 - kvm_pmu_reset(vcpu); 12256 12255 vcpu->arch.smbase = 0x30000; 12257 12256 12258 12257 vcpu->arch.msr_misc_features_enables = 0;