Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-x86-pmu-6.3' of https://github.com/kvm-x86/linux into HEAD

KVM x86 PMU changes for 6.3:

- Add support for creating masked events for the PMU filter, allowing
userspace to heavily restrict what events the guest can use without
needing to create an absurd number of events

- Clean up KVM's handling of "PMU MSRs to save", especially when vPMU
support is disabled

- Add PEBS support for Intel SPR

+896 -198
+71 -7
Documentation/virt/kvm/api.rst
··· 5005 5005 :Parameters: struct kvm_pmu_event_filter (in) 5006 5006 :Returns: 0 on success, -1 on error 5007 5007 5008 + Errors: 5009 + 5010 + ====== ============================================================ 5011 + EFAULT args[0] cannot be accessed 5012 + EINVAL args[0] contains invalid data in the filter or filter events 5013 + E2BIG nevents is too large 5014 + EBUSY not enough memory to allocate the filter 5015 + ====== ============================================================ 5016 + 5008 5017 :: 5009 5018 5010 5019 struct kvm_pmu_event_filter { ··· 5025 5016 __u64 events[0]; 5026 5017 }; 5027 5018 5028 - This ioctl restricts the set of PMU events that the guest can program. 5029 - The argument holds a list of events which will be allowed or denied. 5030 - The eventsel+umask of each event the guest attempts to program is compared 5031 - against the events field to determine whether the guest should have access. 5032 - The events field only controls general purpose counters; fixed purpose 5033 - counters are controlled by the fixed_counter_bitmap. 5019 + This ioctl restricts the set of PMU events the guest can program by limiting 5020 + which event select and unit mask combinations are permitted. 5034 5021 5035 - No flags are defined yet, the field must be zero. 5022 + The argument holds a list of filter events which will be allowed or denied. 5023 + 5024 + Filter events only control general purpose counters; fixed purpose counters 5025 + are controlled by the fixed_counter_bitmap. 5026 + 5027 + Valid values for 'flags':: 5028 + 5029 + ``0`` 5030 + 5031 + To use this mode, clear the 'flags' field. 5032 + 5033 + In this mode each event will contain an event select + unit mask. 5034 + 5035 + When the guest attempts to program the PMU the guest's event select + 5036 + unit mask is compared against the filter events to determine whether the 5037 + guest should have access. 5038 + 5039 + ``KVM_PMU_EVENT_FLAG_MASKED_EVENTS`` 5040 + :Capability: KVM_CAP_PMU_EVENT_MASKED_EVENTS 5041 + 5042 + In this mode each filter event will contain an event select, mask, match, and 5043 + exclude value. To encode a masked event use:: 5044 + 5045 + KVM_PMU_ENCODE_MASKED_ENTRY() 5046 + 5047 + An encoded event will follow this layout:: 5048 + 5049 + Bits Description 5050 + ---- ----------- 5051 + 7:0 event select (low bits) 5052 + 15:8 umask match 5053 + 31:16 unused 5054 + 35:32 event select (high bits) 5055 + 36:54 unused 5056 + 55 exclude bit 5057 + 63:56 umask mask 5058 + 5059 + When the guest attempts to program the PMU, these steps are followed in 5060 + determining if the guest should have access: 5061 + 5062 + 1. Match the event select from the guest against the filter events. 5063 + 2. If a match is found, match the guest's unit mask to the mask and match 5064 + values of the included filter events. 5065 + I.e. (unit mask & mask) == match && !exclude. 5066 + 3. If a match is found, match the guest's unit mask to the mask and match 5067 + values of the excluded filter events. 5068 + I.e. (unit mask & mask) == match && exclude. 5069 + 4. 5070 + a. If an included match is found and an excluded match is not found, filter 5071 + the event. 5072 + b. For everything else, do not filter the event. 5073 + 5. 5074 + a. If the event is filtered and it's an allow list, allow the guest to 5075 + program the event. 5076 + b. If the event is filtered and it's a deny list, do not allow the guest to 5077 + program the event. 
5078 + 5079 + When setting a new pmu event filter, -EINVAL will be returned if any of the 5080 + unused fields are set or if any of the high bits (35:32) in the event 5081 + select are set when called on Intel. 5036 5082 5037 5083 Valid values for 'action':: 5038 5084
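To make the documented API concrete, here is a minimal userspace sketch (an illustration, not part of the patch) that builds a masked-events filter with a single include entry and installs it with the KVM_SET_PMU_EVENT_FILTER vm ioctl. It assumes <linux/kvm.h> and <asm/kvm.h> export KVM_PMU_EVENT_FLAG_MASKED_EVENTS and KVM_PMU_ENCODE_MASKED_ENTRY as added by this series, and that vm_fd is an already-open KVM VM file descriptor; the helper name set_masked_filter() is illustrative.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <asm/kvm.h>

/* Allow only event select 0xD0 with unit mask 0x81 on the GP counters. */
static int set_masked_filter(int vm_fd)
{
	struct kvm_pmu_event_filter *f;
	int r;

	f = calloc(1, sizeof(*f) + sizeof(__u64));
	if (!f)
		return -1;

	f->action = KVM_PMU_EVENT_ALLOW;
	f->flags = KVM_PMU_EVENT_FLAG_MASKED_EVENTS;
	f->nevents = 1;
	/* event select 0xD0, umask mask 0xFF, umask match 0x81, include */
	f->events[0] = KVM_PMU_ENCODE_MASKED_ENTRY(0xD0, 0xFF, 0x81, 0);

	r = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
	free(f);
	return r;
}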
+1
arch/x86/events/intel/core.c
··· 6348 6348 x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; 6349 6349 x86_pmu.extra_regs = intel_spr_extra_regs; 6350 6350 x86_pmu.limit_period = spr_limit_period; 6351 + x86_pmu.pebs_ept = 1; 6351 6352 x86_pmu.pebs_aliases = NULL; 6352 6353 x86_pmu.pebs_prec_dist = true; 6353 6354 x86_pmu.pebs_block = true;
+3 -1
arch/x86/events/intel/ds.c
··· 2303 2303 x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; 2304 2304 break; 2305 2305 2306 - case 4: 2307 2306 case 5: 2307 + x86_pmu.pebs_ept = 1; 2308 + fallthrough; 2309 + case 4: 2308 2310 x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; 2309 2311 x86_pmu.pebs_record_size = sizeof(struct pebs_basic); 2310 2312 if (x86_pmu.intel_cap.pebs_baseline) {
+14 -1
arch/x86/include/asm/kvm_host.h
··· 514 514 #define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) 515 515 #define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) 516 516 #define KVM_PMC_MAX_FIXED 3 517 + #define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) 517 518 #define KVM_AMD_PMC_MAX_GENERIC 6 518 519 struct kvm_pmu { 519 520 unsigned nr_arch_gp_counters; ··· 1152 1151 struct msr_bitmap_range ranges[16]; 1153 1152 }; 1154 1153 1154 + struct kvm_x86_pmu_event_filter { 1155 + __u32 action; 1156 + __u32 nevents; 1157 + __u32 fixed_counter_bitmap; 1158 + __u32 flags; 1159 + __u32 nr_includes; 1160 + __u32 nr_excludes; 1161 + __u64 *includes; 1162 + __u64 *excludes; 1163 + __u64 events[]; 1164 + }; 1165 + 1155 1166 enum kvm_apicv_inhibit { 1156 1167 1157 1168 /********************************************************************/ ··· 1381 1368 /* Guest can access the SGX PROVISIONKEY. */ 1382 1369 bool sgx_provisioning_allowed; 1383 1370 1384 - struct kvm_pmu_event_filter __rcu *pmu_event_filter; 1371 + struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; 1385 1372 struct task_struct *nx_huge_page_recovery_thread; 1386 1373 1387 1374 #ifdef CONFIG_X86_64
+29
arch/x86/include/uapi/asm/kvm.h
··· 526 526 #define KVM_PMU_EVENT_ALLOW 0 527 527 #define KVM_PMU_EVENT_DENY 1 528 528 529 + #define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) 530 + #define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) 531 + 532 + /* 533 + * Masked event layout. 534 + * Bits Description 535 + * ---- ----------- 536 + * 7:0 event select (low bits) 537 + * 15:8 umask match 538 + * 31:16 unused 539 + * 35:32 event select (high bits) 540 + * 36:54 unused 541 + * 55 exclude bit 542 + * 63:56 umask mask 543 + */ 544 + 545 + #define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \ 546 + (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \ 547 + (((mask) & 0xFFULL) << 56) | \ 548 + (((match) & 0xFFULL) << 8) | \ 549 + ((__u64)(!!(exclude)) << 55)) 550 + 551 + #define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ 552 + (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) 553 + #define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) 554 + #define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) 555 + #define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) 556 + #define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) 557 + 529 558 /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ 530 559 #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ 531 560 #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
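As a quick sanity check on the layout in the comment above (a worked illustration, not from the patch), encoding event select 0x1C0 with umask mask 0xFF, umask match 0x01 and the exclude bit set yields 0xFF800001000001C0. The snippet below, which assumes the uapi header above is available to userspace, decodes each field back out:

#include <stdio.h>
#include <asm/kvm.h>

int main(void)
{
	unsigned long long e = KVM_PMU_ENCODE_MASKED_ENTRY(0x1C0, 0xFF, 0x01, 1);

	printf("raw               = 0x%llx\n", e);               /* 0xff800001000001c0 */
	printf("event select low  = 0x%llx\n", e & 0xff);        /* 0xc0 */
	printf("umask match       = 0x%llx\n", (e >> 8) & 0xff); /* 0x01 */
	printf("event select high = 0x%llx\n", (e >> 32) & 0xf); /* 0x1  */
	printf("exclude           = %llu\n",  (e >> 55) & 1);    /* 1    */
	printf("umask mask        = 0x%llx\n", e >> 56);         /* 0xff */
	return 0;
}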
+4 -6
arch/x86/kvm/hyperv.c
··· 1448 1448 case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: 1449 1449 return syndbg_set_msr(vcpu, msr, data, host); 1450 1450 default: 1451 - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", 1452 - msr, data); 1451 + kvm_pr_unimpl_wrmsr(vcpu, msr, data); 1453 1452 return 1; 1454 1453 } 1455 1454 return 0; ··· 1569 1570 return 1; 1570 1571 break; 1571 1572 default: 1572 - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", 1573 - msr, data); 1573 + kvm_pr_unimpl_wrmsr(vcpu, msr, data); 1574 1574 return 1; 1575 1575 } 1576 1576 ··· 1624 1626 case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: 1625 1627 return syndbg_get_msr(vcpu, msr, pdata, host); 1626 1628 default: 1627 - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1629 + kvm_pr_unimpl_rdmsr(vcpu, msr); 1628 1630 return 1; 1629 1631 } 1630 1632 ··· 1689 1691 data = APIC_BUS_FREQUENCY; 1690 1692 break; 1691 1693 default: 1692 - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); 1694 + kvm_pr_unimpl_rdmsr(vcpu, msr); 1693 1695 return 1; 1694 1696 } 1695 1697 *pdata = data;
+236 -50
arch/x86/kvm/pmu.c
··· 29 29 struct x86_pmu_capability __read_mostly kvm_pmu_cap; 30 30 EXPORT_SYMBOL_GPL(kvm_pmu_cap); 31 31 32 - static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { 32 + /* Precise Distribution of Instructions Retired (PDIR) */ 33 + static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { 33 34 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), 34 35 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), 36 + /* Instruction-Accurate PDIR (PDIR++) */ 37 + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), 38 + {} 39 + }; 40 + 41 + /* Precise Distribution (PDist) */ 42 + static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { 43 + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), 35 44 {} 36 45 }; 37 46 ··· 165 156 kvm_make_request(KVM_REQ_PMU, pmc->vcpu); 166 157 } 167 158 159 + static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc) 160 + { 161 + /* 162 + * For some model specific pebs counters with special capabilities 163 + * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise 164 + * level to the maximum value (currently 3, backwards compatible) 165 + * so that the perf subsystem would assign specific hardware counter 166 + * with that capability for vPMC. 167 + */ 168 + if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) || 169 + (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu))) 170 + return 3; 171 + 172 + /* 173 + * The non-zero precision level of guest event makes the ordinary 174 + * guest event becomes a guest PEBS event and triggers the host 175 + * PEBS PMI handler to determine whether the PEBS overflow PMI 176 + * comes from the host counters or the guest. 177 + */ 178 + return 1; 179 + } 180 + 168 181 static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, 169 182 bool exclude_user, bool exclude_kernel, 170 183 bool intr) ··· 218 187 } 219 188 if (pebs) { 220 189 /* 221 - * The non-zero precision level of guest event makes the ordinary 222 - * guest event becomes a guest PEBS event and triggers the host 223 - * PEBS PMI handler to determine whether the PEBS overflow PMI 224 - * comes from the host counters or the guest. 225 - * 226 190 * For most PEBS hardware events, the difference in the software 227 191 * precision levels of guest and host PEBS events will not affect 228 192 * the accuracy of the PEBS profiling result, because the "event IP" 229 193 * in the PEBS record is calibrated on the guest side. 230 - * 231 - * On Icelake everything is fine. Other hardware (GLC+, TNT+) that 232 - * could possibly care here is unsupported and needs changes. 233 194 */ 234 - attr.precise_ip = 1; 235 - if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) 236 - attr.precise_ip = 3; 195 + attr.precise_ip = pmc_get_pebs_precise_level(pmc); 237 196 } 238 197 239 198 event = perf_event_create_kernel_counter(&attr, -1, current, ··· 276 255 return true; 277 256 } 278 257 279 - static int cmp_u64(const void *pa, const void *pb) 258 + static int filter_cmp(const void *pa, const void *pb, u64 mask) 280 259 { 281 - u64 a = *(u64 *)pa; 282 - u64 b = *(u64 *)pb; 260 + u64 a = *(u64 *)pa & mask; 261 + u64 b = *(u64 *)pb & mask; 283 262 284 263 return (a > b) - (a < b); 285 264 } 286 265 266 + 267 + static int filter_sort_cmp(const void *pa, const void *pb) 268 + { 269 + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT | 270 + KVM_PMU_MASKED_ENTRY_EXCLUDE)); 271 + } 272 + 273 + /* 274 + * For the event filter, searching is done on the 'includes' list and 275 + * 'excludes' list separately rather than on the 'events' list (which 276 + * has both). 
As a result the exclude bit can be ignored. 277 + */ 278 + static int filter_event_cmp(const void *pa, const void *pb) 279 + { 280 + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT)); 281 + } 282 + 283 + static int find_filter_index(u64 *events, u64 nevents, u64 key) 284 + { 285 + u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]), 286 + filter_event_cmp); 287 + 288 + if (!fe) 289 + return -1; 290 + 291 + return fe - events; 292 + } 293 + 294 + static bool is_filter_entry_match(u64 filter_event, u64 umask) 295 + { 296 + u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8); 297 + u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH; 298 + 299 + BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >> 300 + (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) != 301 + ARCH_PERFMON_EVENTSEL_UMASK); 302 + 303 + return (umask & mask) == match; 304 + } 305 + 306 + static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel) 307 + { 308 + u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT; 309 + u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK; 310 + int i, index; 311 + 312 + index = find_filter_index(events, nevents, event_select); 313 + if (index < 0) 314 + return false; 315 + 316 + /* 317 + * Entries are sorted by the event select. Walk the list in both 318 + * directions to process all entries with the targeted event select. 319 + */ 320 + for (i = index; i < nevents; i++) { 321 + if (filter_event_cmp(&events[i], &event_select)) 322 + break; 323 + 324 + if (is_filter_entry_match(events[i], umask)) 325 + return true; 326 + } 327 + 328 + for (i = index - 1; i >= 0; i--) { 329 + if (filter_event_cmp(&events[i], &event_select)) 330 + break; 331 + 332 + if (is_filter_entry_match(events[i], umask)) 333 + return true; 334 + } 335 + 336 + return false; 337 + } 338 + 339 + static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f, 340 + u64 eventsel) 341 + { 342 + if (filter_contains_match(f->includes, f->nr_includes, eventsel) && 343 + !filter_contains_match(f->excludes, f->nr_excludes, eventsel)) 344 + return f->action == KVM_PMU_EVENT_ALLOW; 345 + 346 + return f->action == KVM_PMU_EVENT_DENY; 347 + } 348 + 349 + static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, 350 + int idx) 351 + { 352 + int fixed_idx = idx - INTEL_PMC_IDX_FIXED; 353 + 354 + if (filter->action == KVM_PMU_EVENT_DENY && 355 + test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) 356 + return false; 357 + if (filter->action == KVM_PMU_EVENT_ALLOW && 358 + !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) 359 + return false; 360 + 361 + return true; 362 + } 363 + 287 364 static bool check_pmu_event_filter(struct kvm_pmc *pmc) 288 365 { 289 - struct kvm_pmu_event_filter *filter; 366 + struct kvm_x86_pmu_event_filter *filter; 290 367 struct kvm *kvm = pmc->vcpu->kvm; 291 - bool allow_event = true; 292 - __u64 key; 293 - int idx; 294 368 295 369 if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) 296 370 return false; 297 371 298 372 filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); 299 373 if (!filter) 300 - goto out; 374 + return true; 301 375 302 - if (pmc_is_gp(pmc)) { 303 - key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; 304 - if (bsearch(&key, filter->events, filter->nevents, 305 - sizeof(__u64), cmp_u64)) 306 - allow_event = filter->action == KVM_PMU_EVENT_ALLOW; 307 - else 308 - allow_event = filter->action == KVM_PMU_EVENT_DENY; 309 - } else { 310 - idx = pmc->idx - INTEL_PMC_IDX_FIXED; 311 
- if (filter->action == KVM_PMU_EVENT_DENY && 312 - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) 313 - allow_event = false; 314 - if (filter->action == KVM_PMU_EVENT_ALLOW && 315 - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) 316 - allow_event = false; 317 - } 376 + if (pmc_is_gp(pmc)) 377 + return is_gp_event_allowed(filter, pmc->eventsel); 318 378 319 - out: 320 - return allow_event; 379 + return is_fixed_event_allowed(filter, pmc->idx); 321 380 } 322 381 323 382 static void reprogram_counter(struct kvm_pmc *pmc) ··· 694 593 } 695 594 EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); 696 595 596 + static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) 597 + { 598 + u64 mask = kvm_pmu_ops.EVENTSEL_EVENT | 599 + KVM_PMU_MASKED_ENTRY_UMASK_MASK | 600 + KVM_PMU_MASKED_ENTRY_UMASK_MATCH | 601 + KVM_PMU_MASKED_ENTRY_EXCLUDE; 602 + int i; 603 + 604 + for (i = 0; i < filter->nevents; i++) { 605 + if (filter->events[i] & ~mask) 606 + return false; 607 + } 608 + 609 + return true; 610 + } 611 + 612 + static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter) 613 + { 614 + int i, j; 615 + 616 + for (i = 0, j = 0; i < filter->nevents; i++) { 617 + /* 618 + * Skip events that are impossible to match against a guest 619 + * event. When filtering, only the event select + unit mask 620 + * of the guest event is used. To maintain backwards 621 + * compatibility, impossible filters can't be rejected :-( 622 + */ 623 + if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT | 624 + ARCH_PERFMON_EVENTSEL_UMASK)) 625 + continue; 626 + /* 627 + * Convert userspace events to a common in-kernel event so 628 + * only one code path is needed to support both events. For 629 + * the in-kernel events use masked events because they are 630 + * flexible enough to handle both cases. To convert to masked 631 + * events all that's needed is to add an "all ones" umask_mask, 632 + * (unmasked filter events don't support EXCLUDE). 633 + */ 634 + filter->events[j++] = filter->events[i] | 635 + (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT); 636 + } 637 + 638 + filter->nevents = j; 639 + } 640 + 641 + static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter) 642 + { 643 + int i; 644 + 645 + if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)) 646 + convert_to_masked_filter(filter); 647 + else if (!is_masked_filter_valid(filter)) 648 + return -EINVAL; 649 + 650 + /* 651 + * Sort entries by event select and includes vs. excludes so that all 652 + * entries for a given event select can be processed efficiently during 653 + * filtering. The EXCLUDE flag uses a more significant bit than the 654 + * event select, and so the sorted list is also effectively split into 655 + * includes and excludes sub-lists. 656 + */ 657 + sort(&filter->events, filter->nevents, sizeof(filter->events[0]), 658 + filter_sort_cmp, NULL); 659 + 660 + i = filter->nevents; 661 + /* Find the first EXCLUDE event (only supported for masked events). 
*/ 662 + if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) { 663 + for (i = 0; i < filter->nevents; i++) { 664 + if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE) 665 + break; 666 + } 667 + } 668 + 669 + filter->nr_includes = i; 670 + filter->nr_excludes = filter->nevents - filter->nr_includes; 671 + filter->includes = filter->events; 672 + filter->excludes = filter->events + filter->nr_includes; 673 + 674 + return 0; 675 + } 676 + 697 677 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) 698 678 { 699 - struct kvm_pmu_event_filter tmp, *filter; 679 + struct kvm_pmu_event_filter __user *user_filter = argp; 680 + struct kvm_x86_pmu_event_filter *filter; 681 + struct kvm_pmu_event_filter tmp; 700 682 struct kvm_vcpu *vcpu; 701 683 unsigned long i; 702 684 size_t size; 703 685 int r; 704 686 705 - if (copy_from_user(&tmp, argp, sizeof(tmp))) 687 + if (copy_from_user(&tmp, user_filter, sizeof(tmp))) 706 688 return -EFAULT; 707 689 708 690 if (tmp.action != KVM_PMU_EVENT_ALLOW && 709 691 tmp.action != KVM_PMU_EVENT_DENY) 710 692 return -EINVAL; 711 693 712 - if (tmp.flags != 0) 694 + if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK) 713 695 return -EINVAL; 714 696 715 697 if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS) 716 698 return -E2BIG; 717 699 718 700 size = struct_size(filter, events, tmp.nevents); 719 - filter = kmalloc(size, GFP_KERNEL_ACCOUNT); 701 + filter = kzalloc(size, GFP_KERNEL_ACCOUNT); 720 702 if (!filter) 721 703 return -ENOMEM; 722 704 705 + filter->action = tmp.action; 706 + filter->nevents = tmp.nevents; 707 + filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap; 708 + filter->flags = tmp.flags; 709 + 723 710 r = -EFAULT; 724 - if (copy_from_user(filter, argp, size)) 711 + if (copy_from_user(filter->events, user_filter->events, 712 + sizeof(filter->events[0]) * filter->nevents)) 725 713 goto cleanup; 726 714 727 - /* Ensure nevents can't be changed between the user copies. */ 728 - *filter = tmp; 729 - 730 - /* 731 - * Sort the in-kernel list so that we can search it with bsearch. 732 - */ 733 - sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); 715 + r = prepare_filter_lists(filter); 716 + if (r) 717 + goto cleanup; 734 718 735 719 mutex_lock(&kvm->lock); 736 720 filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
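Tying the documentation and the new pmu.c logic together, the standalone sketch below (illustrative only, not kernel code) replays the include/exclude decision for the "allow loads and stores, deny loads+stores" scenario also used by the selftest: an allow-action filter with an include entry (event 0xD0, mask 0x7C, match 0x00) and an exclude entry (event 0xD0, mask 0xFF, match 0x83). An event is "filtered" when some include entry matches its unit mask and no exclude entry does; with an allow-action filter, only filtered events may be programmed.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct masked_event { uint8_t mask, match; };

static bool entry_matches(struct masked_event e, uint8_t umask)
{
	return (umask & e.mask) == e.match;
}

int main(void)
{
	/* ALLOW-action filter: one include entry, one exclude entry. */
	struct masked_event include = { .mask = 0x7C, .match = 0x00 };
	struct masked_event exclude = { .mask = 0xFF, .match = 0x83 };
	uint8_t umasks[] = { 0x81 /* loads */, 0x82 /* stores */, 0x83 /* loads+stores */ };

	for (int i = 0; i < 3; i++) {
		bool filtered = entry_matches(include, umasks[i]) &&
				!entry_matches(exclude, umasks[i]);

		/* Prints: 0x81 -> allowed, 0x82 -> allowed, 0x83 -> denied */
		printf("unit mask 0x%02x -> %s\n", umasks[i],
		       filtered ? "allowed" : "denied");
	}
	return 0;
}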
+6 -7
arch/x86/kvm/pmu.h
··· 18 18 #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 19 19 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 20 20 21 - struct kvm_event_hw_type_mapping { 22 - u8 eventsel; 23 - u8 unit_mask; 24 - unsigned event_type; 25 - }; 26 - 27 21 struct kvm_pmu_ops { 28 22 bool (*hw_event_available)(struct kvm_pmc *pmc); 29 23 bool (*pmc_is_enabled)(struct kvm_pmc *pmc); ··· 34 40 void (*reset)(struct kvm_vcpu *vcpu); 35 41 void (*deliver_pmi)(struct kvm_vcpu *vcpu); 36 42 void (*cleanup)(struct kvm_vcpu *vcpu); 43 + 44 + const u64 EVENTSEL_EVENT; 45 + const int MAX_NR_GP_COUNTERS; 37 46 }; 38 47 39 48 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); ··· 158 161 159 162 extern struct x86_pmu_capability kvm_pmu_cap; 160 163 161 - static inline void kvm_init_pmu_capability(void) 164 + static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) 162 165 { 163 166 bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; 164 167 ··· 177 180 } 178 181 179 182 kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); 183 + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, 184 + pmu_ops->MAX_NR_GP_COUNTERS); 180 185 kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, 181 186 KVM_PMC_MAX_FIXED); 182 187 }
+2
arch/x86/kvm/svm/pmu.c
··· 231 231 .refresh = amd_pmu_refresh, 232 232 .init = amd_pmu_init, 233 233 .reset = amd_pmu_reset, 234 + .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, 235 + .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, 234 236 };
+2 -3
arch/x86/kvm/svm/svm.c
··· 3015 3015 break; 3016 3016 case MSR_IA32_DEBUGCTLMSR: 3017 3017 if (!lbrv) { 3018 - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3019 - __func__, data); 3018 + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3020 3019 break; 3021 3020 } 3022 3021 if (data & DEBUGCTL_RESERVED_BITS) ··· 3044 3045 case MSR_VM_CR: 3045 3046 return svm_set_vm_cr(vcpu, data); 3046 3047 case MSR_VM_IGNNE: 3047 - vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3048 + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3048 3049 break; 3049 3050 case MSR_AMD64_DE_CFG: { 3050 3051 struct kvm_msr_entry msr_entry;
+14 -9
arch/x86/kvm/vmx/pmu_intel.c
··· 22 22 23 23 #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) 24 24 25 - static struct kvm_event_hw_type_mapping intel_arch_events[] = { 26 - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, 27 - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, 28 - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, 29 - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, 30 - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, 31 - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 32 - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, 25 + static struct { 26 + u8 eventsel; 27 + u8 unit_mask; 28 + } const intel_arch_events[] = { 29 + [0] = { 0x3c, 0x00 }, 30 + [1] = { 0xc0, 0x00 }, 31 + [2] = { 0x3c, 0x01 }, 32 + [3] = { 0x2e, 0x4f }, 33 + [4] = { 0x2e, 0x41 }, 34 + [5] = { 0xc4, 0x00 }, 35 + [6] = { 0xc5, 0x00 }, 33 36 /* The above index must match CPUID 0x0A.EBX bit vector */ 34 - [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, 37 + [7] = { 0x00, 0x03 }, 35 38 }; 36 39 37 40 /* mapping between fixed pmc index and intel_arch_events array */ ··· 814 811 .reset = intel_pmu_reset, 815 812 .deliver_pmi = intel_pmu_deliver_pmi, 816 813 .cleanup = intel_pmu_cleanup, 814 + .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, 815 + .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, 817 816 };
+1 -3
arch/x86/kvm/vmx/vmx.c
··· 2206 2206 2207 2207 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2208 2208 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2209 - if (report_ignored_msrs) 2210 - vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", 2211 - __func__, data); 2209 + kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); 2212 2210 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2213 2211 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2214 2212 }
+124 -106
arch/x86/kvm/x86.c
··· 1419 1419 * may depend on host virtualization features rather than host cpu features. 1420 1420 */ 1421 1421 1422 - static const u32 msrs_to_save_all[] = { 1422 + static const u32 msrs_to_save_base[] = { 1423 1423 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 1424 1424 MSR_STAR, 1425 1425 #ifdef CONFIG_X86_64 ··· 1436 1436 MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, 1437 1437 MSR_IA32_UMWAIT_CONTROL, 1438 1438 1439 + MSR_IA32_XFD, MSR_IA32_XFD_ERR, 1440 + }; 1441 + 1442 + static const u32 msrs_to_save_pmu[] = { 1439 1443 MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, 1440 1444 MSR_ARCH_PERFMON_FIXED_CTR0 + 2, 1441 1445 MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, ··· 1464 1460 MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, 1465 1461 MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, 1466 1462 MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, 1467 - 1468 - MSR_IA32_XFD, MSR_IA32_XFD_ERR, 1469 1463 }; 1470 1464 1471 - static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; 1465 + static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + 1466 + ARRAY_SIZE(msrs_to_save_pmu)]; 1472 1467 static unsigned num_msrs_to_save; 1473 1468 1474 1469 static const u32 emulated_msrs_all[] = { ··· 3562 3559 mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); 3563 3560 } 3564 3561 3562 + static bool kvm_is_msr_to_save(u32 msr_index) 3563 + { 3564 + unsigned int i; 3565 + 3566 + for (i = 0; i < num_msrs_to_save; i++) { 3567 + if (msrs_to_save[i] == msr_index) 3568 + return true; 3569 + } 3570 + 3571 + return false; 3572 + } 3573 + 3565 3574 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 3566 3575 { 3567 - bool pr = false; 3568 3576 u32 msr = msr_info->index; 3569 3577 u64 data = msr_info->data; 3570 3578 ··· 3621 3607 if (data == BIT_ULL(18)) { 3622 3608 vcpu->arch.msr_hwcr = data; 3623 3609 } else if (data != 0) { 3624 - vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 3625 - data); 3610 + kvm_pr_unimpl_wrmsr(vcpu, msr, data); 3626 3611 return 1; 3627 3612 } 3628 3613 break; 3629 3614 case MSR_FAM10H_MMIO_CONF_BASE: 3630 3615 if (data != 0) { 3631 - vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " 3632 - "0x%llx\n", data); 3616 + kvm_pr_unimpl_wrmsr(vcpu, msr, data); 3633 3617 return 1; 3634 3618 } 3635 3619 break; ··· 3807 3795 3808 3796 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: 3809 3797 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: 3810 - pr = true; 3811 - fallthrough; 3812 3798 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: 3813 3799 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: 3814 3800 if (kvm_pmu_is_valid_msr(vcpu, msr)) 3815 3801 return kvm_pmu_set_msr(vcpu, msr_info); 3816 3802 3817 - if (pr || data != 0) 3818 - vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " 3819 - "0x%x data 0x%llx\n", msr, data); 3803 + if (data) 3804 + kvm_pr_unimpl_wrmsr(vcpu, msr, data); 3820 3805 break; 3821 3806 case MSR_K7_CLK_CTL: 3822 3807 /* ··· 3841 3832 /* Drop writes to this legacy MSR -- see rdmsr 3842 3833 * counterpart for further detail. 
3843 3834 */ 3844 - if (report_ignored_msrs) 3845 - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", 3846 - msr, data); 3835 + kvm_pr_unimpl_wrmsr(vcpu, msr, data); 3847 3836 break; 3848 3837 case MSR_AMD64_OSVW_ID_LENGTH: 3849 3838 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) ··· 3889 3882 vcpu->arch.guest_fpu.xfd_err = data; 3890 3883 break; 3891 3884 #endif 3892 - case MSR_IA32_PEBS_ENABLE: 3893 - case MSR_IA32_DS_AREA: 3894 - case MSR_PEBS_DATA_CFG: 3895 - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: 3885 + default: 3896 3886 if (kvm_pmu_is_valid_msr(vcpu, msr)) 3897 3887 return kvm_pmu_set_msr(vcpu, msr_info); 3888 + 3898 3889 /* 3899 3890 * Userspace is allowed to write '0' to MSRs that KVM reports 3900 3891 * as to-be-saved, even if an MSRs isn't fully supported. 3901 3892 */ 3902 - return !msr_info->host_initiated || data; 3903 - default: 3904 - if (kvm_pmu_is_valid_msr(vcpu, msr)) 3905 - return kvm_pmu_set_msr(vcpu, msr_info); 3893 + if (msr_info->host_initiated && !data && 3894 + kvm_is_msr_to_save(msr)) 3895 + break; 3896 + 3906 3897 return KVM_MSR_RET_INVALID; 3907 3898 } 3908 3899 return 0; ··· 3988 3983 case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ 3989 3984 case MSR_PKG_ENERGY_STATUS: /* Total package */ 3990 3985 case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ 3991 - msr_info->data = 0; 3992 - break; 3993 - case MSR_IA32_PEBS_ENABLE: 3994 - case MSR_IA32_DS_AREA: 3995 - case MSR_PEBS_DATA_CFG: 3996 - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: 3997 - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 3998 - return kvm_pmu_get_msr(vcpu, msr_info); 3999 - /* 4000 - * Userspace is allowed to read MSRs that KVM reports as 4001 - * to-be-saved, even if an MSR isn't fully supported. 4002 - */ 4003 - if (!msr_info->host_initiated) 4004 - return 1; 4005 3986 msr_info->data = 0; 4006 3987 break; 4007 3988 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: ··· 4245 4254 default: 4246 4255 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) 4247 4256 return kvm_pmu_get_msr(vcpu, msr_info); 4257 + 4258 + /* 4259 + * Userspace is allowed to read MSRs that KVM reports as 4260 + * to-be-saved, even if an MSR isn't fully supported. 4261 + */ 4262 + if (msr_info->host_initiated && 4263 + kvm_is_msr_to_save(msr_info->index)) { 4264 + msr_info->data = 0; 4265 + break; 4266 + } 4267 + 4248 4268 return KVM_MSR_RET_INVALID; 4249 4269 } 4250 4270 return 0; ··· 4403 4401 case KVM_CAP_SPLIT_IRQCHIP: 4404 4402 case KVM_CAP_IMMEDIATE_EXIT: 4405 4403 case KVM_CAP_PMU_EVENT_FILTER: 4404 + case KVM_CAP_PMU_EVENT_MASKED_EVENTS: 4406 4405 case KVM_CAP_GET_MSR_FEATURES: 4407 4406 case KVM_CAP_MSR_PLATFORM_INFO: 4408 4407 case KVM_CAP_EXCEPTION_PAYLOAD: ··· 6996 6993 return r; 6997 6994 } 6998 6995 6999 - static void kvm_init_msr_list(void) 6996 + static void kvm_probe_msr_to_save(u32 msr_index) 7000 6997 { 7001 6998 u32 dummy[2]; 6999 + 7000 + if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) 7001 + return; 7002 + 7003 + /* 7004 + * Even MSRs that are valid in the host may not be exposed to guests in 7005 + * some cases. 
7006 + */ 7007 + switch (msr_index) { 7008 + case MSR_IA32_BNDCFGS: 7009 + if (!kvm_mpx_supported()) 7010 + return; 7011 + break; 7012 + case MSR_TSC_AUX: 7013 + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && 7014 + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) 7015 + return; 7016 + break; 7017 + case MSR_IA32_UMWAIT_CONTROL: 7018 + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) 7019 + return; 7020 + break; 7021 + case MSR_IA32_RTIT_CTL: 7022 + case MSR_IA32_RTIT_STATUS: 7023 + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) 7024 + return; 7025 + break; 7026 + case MSR_IA32_RTIT_CR3_MATCH: 7027 + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || 7028 + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) 7029 + return; 7030 + break; 7031 + case MSR_IA32_RTIT_OUTPUT_BASE: 7032 + case MSR_IA32_RTIT_OUTPUT_MASK: 7033 + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || 7034 + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && 7035 + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) 7036 + return; 7037 + break; 7038 + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 7039 + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || 7040 + (msr_index - MSR_IA32_RTIT_ADDR0_A >= 7041 + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) 7042 + return; 7043 + break; 7044 + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: 7045 + if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= 7046 + kvm_pmu_cap.num_counters_gp) 7047 + return; 7048 + break; 7049 + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: 7050 + if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= 7051 + kvm_pmu_cap.num_counters_gp) 7052 + return; 7053 + break; 7054 + case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX: 7055 + if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= 7056 + kvm_pmu_cap.num_counters_fixed) 7057 + return; 7058 + break; 7059 + case MSR_IA32_XFD: 7060 + case MSR_IA32_XFD_ERR: 7061 + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) 7062 + return; 7063 + break; 7064 + default: 7065 + break; 7066 + } 7067 + 7068 + msrs_to_save[num_msrs_to_save++] = msr_index; 7069 + } 7070 + 7071 + static void kvm_init_msr_list(void) 7072 + { 7002 7073 unsigned i; 7003 7074 7004 7075 BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, 7005 - "Please update the fixed PMCs in msrs_to_saved_all[]"); 7076 + "Please update the fixed PMCs in msrs_to_save_pmu[]"); 7006 7077 7007 7078 num_msrs_to_save = 0; 7008 7079 num_emulated_msrs = 0; 7009 7080 num_msr_based_features = 0; 7010 7081 7011 - for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { 7012 - if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) 7013 - continue; 7082 + for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) 7083 + kvm_probe_msr_to_save(msrs_to_save_base[i]); 7014 7084 7015 - /* 7016 - * Even MSRs that are valid in the host may not be exposed 7017 - * to the guests in some cases. 
7018 - */ 7019 - switch (msrs_to_save_all[i]) { 7020 - case MSR_IA32_BNDCFGS: 7021 - if (!kvm_mpx_supported()) 7022 - continue; 7023 - break; 7024 - case MSR_TSC_AUX: 7025 - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && 7026 - !kvm_cpu_cap_has(X86_FEATURE_RDPID)) 7027 - continue; 7028 - break; 7029 - case MSR_IA32_UMWAIT_CONTROL: 7030 - if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) 7031 - continue; 7032 - break; 7033 - case MSR_IA32_RTIT_CTL: 7034 - case MSR_IA32_RTIT_STATUS: 7035 - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) 7036 - continue; 7037 - break; 7038 - case MSR_IA32_RTIT_CR3_MATCH: 7039 - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || 7040 - !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) 7041 - continue; 7042 - break; 7043 - case MSR_IA32_RTIT_OUTPUT_BASE: 7044 - case MSR_IA32_RTIT_OUTPUT_MASK: 7045 - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || 7046 - (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && 7047 - !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) 7048 - continue; 7049 - break; 7050 - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 7051 - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || 7052 - msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= 7053 - intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) 7054 - continue; 7055 - break; 7056 - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: 7057 - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= 7058 - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) 7059 - continue; 7060 - break; 7061 - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: 7062 - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= 7063 - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) 7064 - continue; 7065 - break; 7066 - case MSR_IA32_XFD: 7067 - case MSR_IA32_XFD_ERR: 7068 - if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) 7069 - continue; 7070 - break; 7071 - default: 7072 - break; 7073 - } 7074 - 7075 - msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; 7085 + if (enable_pmu) { 7086 + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) 7087 + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); 7076 7088 } 7077 7089 7078 7090 for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { ··· 9395 9377 if (boot_cpu_has(X86_FEATURE_XSAVES)) 9396 9378 rdmsrl(MSR_IA32_XSS, host_xss); 9397 9379 9398 - kvm_init_pmu_capability(); 9380 + kvm_init_pmu_capability(ops->pmu_ops); 9399 9381 9400 9382 r = ops->hardware_setup(); 9401 9383 if (r != 0)
+12
arch/x86/kvm/x86.h
··· 331 331 332 332 extern bool eager_page_split; 333 333 334 + static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 335 + { 336 + if (report_ignored_msrs) 337 + vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data); 338 + } 339 + 340 + static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr) 341 + { 342 + if (report_ignored_msrs) 343 + vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr); 344 + } 345 + 334 346 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) 335 347 { 336 348 return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+1
include/uapi/linux/kvm.h
··· 1175 1175 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 1176 1176 #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224 1177 1177 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 1178 + #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 1178 1179 1179 1180 #ifdef KVM_CAP_IRQ_ROUTING 1180 1181
+376 -5
tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
··· 198 198 199 199 200 200 static struct kvm_pmu_event_filter * 201 - create_pmu_event_filter(const uint64_t event_list[], 202 - int nevents, uint32_t action) 201 + create_pmu_event_filter(const uint64_t event_list[], int nevents, 202 + uint32_t action, uint32_t flags) 203 203 { 204 204 struct kvm_pmu_event_filter *f; 205 205 int i; 206 206 207 207 f = alloc_pmu_event_filter(nevents); 208 208 f->action = action; 209 + f->flags = flags; 209 210 for (i = 0; i < nevents; i++) 210 211 f->events[i] = event_list[i]; 211 212 ··· 217 216 { 218 217 return create_pmu_event_filter(event_list, 219 218 ARRAY_SIZE(event_list), 220 - action); 219 + action, 0); 221 220 } 222 221 223 222 /* ··· 264 263 struct kvm_pmu_event_filter *f; 265 264 uint64_t count; 266 265 267 - f = create_pmu_event_filter(&event, 1, KVM_PMU_EVENT_DENY); 266 + f = create_pmu_event_filter(&event, 1, KVM_PMU_EVENT_DENY, 0); 268 267 count = test_with_filter(vcpu, f); 269 268 270 269 free(f); ··· 404 403 is_zen3(family, model)); 405 404 } 406 405 406 + /* 407 + * "MEM_INST_RETIRED.ALL_LOADS", "MEM_INST_RETIRED.ALL_STORES", and 408 + * "MEM_INST_RETIRED.ANY" from https://perfmon-events.intel.com/ 409 + * supported on Intel Xeon processors: 410 + * - Sapphire Rapids, Ice Lake, Cascade Lake, Skylake. 411 + */ 412 + #define MEM_INST_RETIRED 0xD0 413 + #define MEM_INST_RETIRED_LOAD EVENT(MEM_INST_RETIRED, 0x81) 414 + #define MEM_INST_RETIRED_STORE EVENT(MEM_INST_RETIRED, 0x82) 415 + #define MEM_INST_RETIRED_LOAD_STORE EVENT(MEM_INST_RETIRED, 0x83) 416 + 417 + static bool supports_event_mem_inst_retired(void) 418 + { 419 + uint32_t eax, ebx, ecx, edx; 420 + 421 + cpuid(1, &eax, &ebx, &ecx, &edx); 422 + if (x86_family(eax) == 0x6) { 423 + switch (x86_model(eax)) { 424 + /* Sapphire Rapids */ 425 + case 0x8F: 426 + /* Ice Lake */ 427 + case 0x6A: 428 + /* Skylake */ 429 + /* Cascade Lake */ 430 + case 0x55: 431 + return true; 432 + } 433 + } 434 + 435 + return false; 436 + } 437 + 438 + /* 439 + * "LS Dispatch", from Processor Programming Reference 440 + * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors, 441 + * Preliminary Processor Programming Reference (PPR) for AMD Family 442 + * 17h Model 31h, Revision B0 Processors, and Preliminary Processor 443 + * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision 444 + * B1 Processors Volume 1 of 2. 445 + */ 446 + #define LS_DISPATCH 0x29 447 + #define LS_DISPATCH_LOAD EVENT(LS_DISPATCH, BIT(0)) 448 + #define LS_DISPATCH_STORE EVENT(LS_DISPATCH, BIT(1)) 449 + #define LS_DISPATCH_LOAD_STORE EVENT(LS_DISPATCH, BIT(2)) 450 + 451 + #define INCLUDE_MASKED_ENTRY(event_select, mask, match) \ 452 + KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, false) 453 + #define EXCLUDE_MASKED_ENTRY(event_select, mask, match) \ 454 + KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, true) 455 + 456 + struct perf_counter { 457 + union { 458 + uint64_t raw; 459 + struct { 460 + uint64_t loads:22; 461 + uint64_t stores:22; 462 + uint64_t loads_stores:20; 463 + }; 464 + }; 465 + }; 466 + 467 + static uint64_t masked_events_guest_test(uint32_t msr_base) 468 + { 469 + uint64_t ld0, ld1, st0, st1, ls0, ls1; 470 + struct perf_counter c; 471 + int val; 472 + 473 + /* 474 + * The acutal value of the counters don't determine the outcome of 475 + * the test. Only that they are zero or non-zero. 
476 + */ 477 + ld0 = rdmsr(msr_base + 0); 478 + st0 = rdmsr(msr_base + 1); 479 + ls0 = rdmsr(msr_base + 2); 480 + 481 + __asm__ __volatile__("movl $0, %[v];" 482 + "movl %[v], %%eax;" 483 + "incl %[v];" 484 + : [v]"+m"(val) :: "eax"); 485 + 486 + ld1 = rdmsr(msr_base + 0); 487 + st1 = rdmsr(msr_base + 1); 488 + ls1 = rdmsr(msr_base + 2); 489 + 490 + c.loads = ld1 - ld0; 491 + c.stores = st1 - st0; 492 + c.loads_stores = ls1 - ls0; 493 + 494 + return c.raw; 495 + } 496 + 497 + static void intel_masked_events_guest_code(void) 498 + { 499 + uint64_t r; 500 + 501 + for (;;) { 502 + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); 503 + 504 + wrmsr(MSR_P6_EVNTSEL0 + 0, ARCH_PERFMON_EVENTSEL_ENABLE | 505 + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD); 506 + wrmsr(MSR_P6_EVNTSEL0 + 1, ARCH_PERFMON_EVENTSEL_ENABLE | 507 + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_STORE); 508 + wrmsr(MSR_P6_EVNTSEL0 + 2, ARCH_PERFMON_EVENTSEL_ENABLE | 509 + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD_STORE); 510 + 511 + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x7); 512 + 513 + r = masked_events_guest_test(MSR_IA32_PMC0); 514 + 515 + GUEST_SYNC(r); 516 + } 517 + } 518 + 519 + static void amd_masked_events_guest_code(void) 520 + { 521 + uint64_t r; 522 + 523 + for (;;) { 524 + wrmsr(MSR_K7_EVNTSEL0, 0); 525 + wrmsr(MSR_K7_EVNTSEL1, 0); 526 + wrmsr(MSR_K7_EVNTSEL2, 0); 527 + 528 + wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | 529 + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD); 530 + wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | 531 + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_STORE); 532 + wrmsr(MSR_K7_EVNTSEL2, ARCH_PERFMON_EVENTSEL_ENABLE | 533 + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD_STORE); 534 + 535 + r = masked_events_guest_test(MSR_K7_PERFCTR0); 536 + 537 + GUEST_SYNC(r); 538 + } 539 + } 540 + 541 + static struct perf_counter run_masked_events_test(struct kvm_vcpu *vcpu, 542 + const uint64_t masked_events[], 543 + const int nmasked_events) 544 + { 545 + struct kvm_pmu_event_filter *f; 546 + struct perf_counter r; 547 + 548 + f = create_pmu_event_filter(masked_events, nmasked_events, 549 + KVM_PMU_EVENT_ALLOW, 550 + KVM_PMU_EVENT_FLAG_MASKED_EVENTS); 551 + r.raw = test_with_filter(vcpu, f); 552 + free(f); 553 + 554 + return r; 555 + } 556 + 557 + /* Matches KVM_PMU_EVENT_FILTER_MAX_EVENTS in pmu.c */ 558 + #define MAX_FILTER_EVENTS 300 559 + #define MAX_TEST_EVENTS 10 560 + 561 + #define ALLOW_LOADS BIT(0) 562 + #define ALLOW_STORES BIT(1) 563 + #define ALLOW_LOADS_STORES BIT(2) 564 + 565 + struct masked_events_test { 566 + uint64_t intel_events[MAX_TEST_EVENTS]; 567 + uint64_t intel_event_end; 568 + uint64_t amd_events[MAX_TEST_EVENTS]; 569 + uint64_t amd_event_end; 570 + const char *msg; 571 + uint32_t flags; 572 + }; 573 + 574 + /* 575 + * These are the test cases for the masked events tests. 576 + * 577 + * For each test, the guest enables 3 PMU counters (loads, stores, 578 + * loads + stores). The filter is then set in KVM with the masked events 579 + * provided. The test then verifies that the counters agree with which 580 + * ones should be counting and which ones should be filtered. 
581 + */ 582 + const struct masked_events_test test_cases[] = { 583 + { 584 + .intel_events = { 585 + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x81), 586 + }, 587 + .amd_events = { 588 + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), 589 + }, 590 + .msg = "Only allow loads.", 591 + .flags = ALLOW_LOADS, 592 + }, { 593 + .intel_events = { 594 + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), 595 + }, 596 + .amd_events = { 597 + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), 598 + }, 599 + .msg = "Only allow stores.", 600 + .flags = ALLOW_STORES, 601 + }, { 602 + .intel_events = { 603 + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), 604 + }, 605 + .amd_events = { 606 + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(2)), 607 + }, 608 + .msg = "Only allow loads + stores.", 609 + .flags = ALLOW_LOADS_STORES, 610 + }, { 611 + .intel_events = { 612 + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), 613 + EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), 614 + }, 615 + .amd_events = { 616 + INCLUDE_MASKED_ENTRY(LS_DISPATCH, ~(BIT(0) | BIT(1)), 0), 617 + }, 618 + .msg = "Only allow loads and stores.", 619 + .flags = ALLOW_LOADS | ALLOW_STORES, 620 + }, { 621 + .intel_events = { 622 + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), 623 + EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), 624 + }, 625 + .amd_events = { 626 + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), 627 + EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), 628 + }, 629 + .msg = "Only allow loads and loads + stores.", 630 + .flags = ALLOW_LOADS | ALLOW_LOADS_STORES 631 + }, { 632 + .intel_events = { 633 + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFE, 0x82), 634 + }, 635 + .amd_events = { 636 + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), 637 + EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), 638 + }, 639 + .msg = "Only allow stores and loads + stores.", 640 + .flags = ALLOW_STORES | ALLOW_LOADS_STORES 641 + }, { 642 + .intel_events = { 643 + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), 644 + }, 645 + .amd_events = { 646 + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), 647 + }, 648 + .msg = "Only allow loads, stores, and loads + stores.", 649 + .flags = ALLOW_LOADS | ALLOW_STORES | ALLOW_LOADS_STORES 650 + }, 651 + }; 652 + 653 + static int append_test_events(const struct masked_events_test *test, 654 + uint64_t *events, int nevents) 655 + { 656 + const uint64_t *evts; 657 + int i; 658 + 659 + evts = use_intel_pmu() ? test->intel_events : test->amd_events; 660 + for (i = 0; i < MAX_TEST_EVENTS; i++) { 661 + if (evts[i] == 0) 662 + break; 663 + 664 + events[nevents + i] = evts[i]; 665 + } 666 + 667 + return nevents + i; 668 + } 669 + 670 + static bool bool_eq(bool a, bool b) 671 + { 672 + return a == b; 673 + } 674 + 675 + static void run_masked_events_tests(struct kvm_vcpu *vcpu, uint64_t *events, 676 + int nevents) 677 + { 678 + int ntests = ARRAY_SIZE(test_cases); 679 + struct perf_counter c; 680 + int i, n; 681 + 682 + for (i = 0; i < ntests; i++) { 683 + const struct masked_events_test *test = &test_cases[i]; 684 + 685 + /* Do any test case events overflow MAX_TEST_EVENTS? 
*/ 686 + assert(test->intel_event_end == 0); 687 + assert(test->amd_event_end == 0); 688 + 689 + n = append_test_events(test, events, nevents); 690 + 691 + c = run_masked_events_test(vcpu, events, n); 692 + TEST_ASSERT(bool_eq(c.loads, test->flags & ALLOW_LOADS) && 693 + bool_eq(c.stores, test->flags & ALLOW_STORES) && 694 + bool_eq(c.loads_stores, 695 + test->flags & ALLOW_LOADS_STORES), 696 + "%s loads: %u, stores: %u, loads + stores: %u", 697 + test->msg, c.loads, c.stores, c.loads_stores); 698 + } 699 + } 700 + 701 + static void add_dummy_events(uint64_t *events, int nevents) 702 + { 703 + int i; 704 + 705 + for (i = 0; i < nevents; i++) { 706 + int event_select = i % 0xFF; 707 + bool exclude = ((i % 4) == 0); 708 + 709 + if (event_select == MEM_INST_RETIRED || 710 + event_select == LS_DISPATCH) 711 + event_select++; 712 + 713 + events[i] = KVM_PMU_ENCODE_MASKED_ENTRY(event_select, 0, 714 + 0, exclude); 715 + } 716 + } 717 + 718 + static void test_masked_events(struct kvm_vcpu *vcpu) 719 + { 720 + int nevents = MAX_FILTER_EVENTS - MAX_TEST_EVENTS; 721 + uint64_t events[MAX_FILTER_EVENTS]; 722 + 723 + /* Run the test cases against a sparse PMU event filter. */ 724 + run_masked_events_tests(vcpu, events, 0); 725 + 726 + /* Run the test cases against a dense PMU event filter. */ 727 + add_dummy_events(events, MAX_FILTER_EVENTS); 728 + run_masked_events_tests(vcpu, events, nevents); 729 + } 730 + 731 + static int run_filter_test(struct kvm_vcpu *vcpu, const uint64_t *events, 732 + int nevents, uint32_t flags) 733 + { 734 + struct kvm_pmu_event_filter *f; 735 + int r; 736 + 737 + f = create_pmu_event_filter(events, nevents, KVM_PMU_EVENT_ALLOW, flags); 738 + r = __vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f); 739 + free(f); 740 + 741 + return r; 742 + } 743 + 744 + static void test_filter_ioctl(struct kvm_vcpu *vcpu) 745 + { 746 + uint64_t e = ~0ul; 747 + int r; 748 + 749 + /* 750 + * Unfortunately having invalid bits set in event data is expected to 751 + * pass when flags == 0 (bits other than eventsel+umask). 752 + */ 753 + r = run_filter_test(vcpu, &e, 1, 0); 754 + TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); 755 + 756 + r = run_filter_test(vcpu, &e, 1, KVM_PMU_EVENT_FLAG_MASKED_EVENTS); 757 + TEST_ASSERT(r != 0, "Invalid PMU Event Filter is expected to fail"); 758 + 759 + e = KVM_PMU_ENCODE_MASKED_ENTRY(0xff, 0xff, 0xff, 0xf); 760 + r = run_filter_test(vcpu, &e, 1, KVM_PMU_EVENT_FLAG_MASKED_EVENTS); 761 + TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); 762 + } 763 + 407 764 int main(int argc, char *argv[]) 408 765 { 409 766 void (*guest_code)(void); 410 - struct kvm_vcpu *vcpu; 767 + struct kvm_vcpu *vcpu, *vcpu2 = NULL; 411 768 struct kvm_vm *vm; 412 769 413 770 TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER)); 771 + TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_MASKED_EVENTS)); 414 772 415 773 TEST_REQUIRE(use_intel_pmu() || use_amd_pmu()); 416 774 guest_code = use_intel_pmu() ? intel_guest_code : amd_guest_code; ··· 789 429 test_member_allow_list(vcpu); 790 430 test_not_member_deny_list(vcpu); 791 431 test_not_member_allow_list(vcpu); 432 + 433 + if (use_intel_pmu() && 434 + supports_event_mem_inst_retired() && 435 + kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) >= 3) 436 + vcpu2 = vm_vcpu_add(vm, 2, intel_masked_events_guest_code); 437 + else if (use_amd_pmu()) 438 + vcpu2 = vm_vcpu_add(vm, 2, amd_masked_events_guest_code); 439 + 440 + if (vcpu2) 441 + test_masked_events(vcpu2); 442 + test_filter_ioctl(vcpu); 792 443 793 444 kvm_vm_free(vm); 794 445