Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
"x86 PMU driver fixes plus a core code race fix"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/x86/intel: Fix incorrect lbr_sel_mask value
perf/x86/intel/pt: Don't die on VMXON
perf/core: Fix perf_event_open() vs. execve() race
perf/x86/amd: Set the size of event map array to PERF_COUNT_HW_MAX
perf/core: Make sysctl_perf_cpu_time_max_percent conform to documentation
perf/x86/intel/rapl: Add missing Haswell model
perf/x86/intel: Add model number for Skylake Server to perf

+120 -31
+1 -1
arch/x86/events/amd/core.c
··· 115 115 /* 116 116 * AMD Performance Monitor K7 and later. 117 117 */ 118 - static const u64 amd_perfmon_event_map[] = 118 + static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = 119 119 { 120 120 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, 121 121 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+1
arch/x86/events/intel/core.c
··· 3639 3639 3640 3640 case 78: /* 14nm Skylake Mobile */ 3641 3641 case 94: /* 14nm Skylake Desktop */ 3642 + case 85: /* 14nm Skylake Server */ 3642 3643 x86_pmu.late_ack = true; 3643 3644 memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); 3644 3645 memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+4 -2
arch/x86/events/intel/lbr.c
··· 63 63 64 64 #define LBR_PLM (LBR_KERNEL | LBR_USER) 65 65 66 - #define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ 66 + #define LBR_SEL_MASK 0x3ff /* valid bits in LBR_SELECT */ 67 67 #define LBR_NOT_SUPP -1 /* LBR filter not supported */ 68 68 #define LBR_IGN 0 /* ignored */ 69 69 ··· 610 610 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate 611 611 * in suppress mode. So LBR_SELECT should be set to 612 612 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) 613 + * But the 10th bit LBR_CALL_STACK does not operate 614 + * in suppress mode. 613 615 */ 614 - reg->config = mask ^ x86_pmu.lbr_sel_mask; 616 + reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK); 615 617 616 618 if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && 617 619 (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+64 -11
arch/x86/events/intel/pt.c
··· 136 136 struct dev_ext_attribute *de_attrs; 137 137 struct attribute **attrs; 138 138 size_t size; 139 + u64 reg; 139 140 int ret; 140 141 long i; 142 + 143 + if (boot_cpu_has(X86_FEATURE_VMX)) { 144 + /* 145 + * Intel SDM, 36.5 "Tracing post-VMXON" says that 146 + * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace 147 + * post-VMXON. 148 + */ 149 + rdmsrl(MSR_IA32_VMX_MISC, reg); 150 + if (reg & BIT(14)) 151 + pt_pmu.vmx = true; 152 + } 141 153 142 154 attrs = NULL; 143 155 ··· 281 269 282 270 reg |= (event->attr.config & PT_CONFIG_MASK); 283 271 272 + event->hw.config = reg; 284 273 wrmsrl(MSR_IA32_RTIT_CTL, reg); 285 274 } 286 275 287 - static void pt_config_start(bool start) 276 + static void pt_config_stop(struct perf_event *event) 288 277 { 289 - u64 ctl; 278 + u64 ctl = READ_ONCE(event->hw.config); 290 279 291 - rdmsrl(MSR_IA32_RTIT_CTL, ctl); 292 - if (start) 293 - ctl |= RTIT_CTL_TRACEEN; 294 - else 295 - ctl &= ~RTIT_CTL_TRACEEN; 280 + /* may be already stopped by a PMI */ 281 + if (!(ctl & RTIT_CTL_TRACEEN)) 282 + return; 283 + 284 + ctl &= ~RTIT_CTL_TRACEEN; 296 285 wrmsrl(MSR_IA32_RTIT_CTL, ctl); 286 + 287 + WRITE_ONCE(event->hw.config, ctl); 297 288 298 289 /* 299 290 * A wrmsr that disables trace generation serializes other PT ··· 306 291 * The below WMB, separating data store and aux_head store matches 307 292 * the consumer's RMB that separates aux_head load and data load. 308 293 */ 309 - if (!start) 310 - wmb(); 294 + wmb(); 311 295 } 312 296 313 297 static void pt_config_buffer(void *buf, unsigned int topa_idx, ··· 956 942 if (!ACCESS_ONCE(pt->handle_nmi)) 957 943 return; 958 944 959 - pt_config_start(false); 945 + /* 946 + * If VMX is on and PT does not support it, don't touch anything. 947 + */ 948 + if (READ_ONCE(pt->vmx_on)) 949 + return; 960 950 961 951 if (!event) 962 952 return; 953 + 954 + pt_config_stop(event); 963 955 964 956 buf = perf_get_aux(&pt->handle); 965 957 if (!buf) ··· 1003 983 } 1004 984 } 1005 985 986 + void intel_pt_handle_vmx(int on) 987 + { 988 + struct pt *pt = this_cpu_ptr(&pt_ctx); 989 + struct perf_event *event; 990 + unsigned long flags; 991 + 992 + /* PT plays nice with VMX, do nothing */ 993 + if (pt_pmu.vmx) 994 + return; 995 + 996 + /* 997 + * VMXON will clear RTIT_CTL.TraceEn; we need to make 998 + * sure to not try to set it while VMX is on. Disable 999 + * interrupts to avoid racing with pmu callbacks; 1000 + * concurrent PMI should be handled fine. 1001 + */ 1002 + local_irq_save(flags); 1003 + WRITE_ONCE(pt->vmx_on, on); 1004 + 1005 + if (on) { 1006 + /* prevent pt_config_stop() from writing RTIT_CTL */ 1007 + event = pt->handle.event; 1008 + if (event) 1009 + event->hw.config = 0; 1010 + } 1011 + local_irq_restore(flags); 1012 + } 1013 + EXPORT_SYMBOL_GPL(intel_pt_handle_vmx); 1014 + 1006 1015 /* 1007 1016 * PMU callbacks 1008 1017 */ ··· 1040 991 { 1041 992 struct pt *pt = this_cpu_ptr(&pt_ctx); 1042 993 struct pt_buffer *buf = perf_get_aux(&pt->handle); 994 + 995 + if (READ_ONCE(pt->vmx_on)) 996 + return; 1043 997 1044 998 if (!buf || pt_buffer_is_full(buf, pt)) { 1045 999 event->hw.state = PERF_HES_STOPPED; ··· 1066 1014 * see comment in intel_pt_interrupt(). 1067 1015 */ 1068 1016 ACCESS_ONCE(pt->handle_nmi) = 0; 1069 - pt_config_start(false); 1017 + 1018 + pt_config_stop(event); 1070 1019 1071 1020 if (event->hw.state == PERF_HES_STOPPED) 1072 1021 return;
+3
arch/x86/events/intel/pt.h
··· 65 65 struct pt_pmu { 66 66 struct pmu pmu; 67 67 u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; 68 + bool vmx; 68 69 }; 69 70 70 71 /** ··· 108 107 * struct pt - per-cpu pt context 109 108 * @handle: perf output handle 110 109 * @handle_nmi: do handle PT PMI on this cpu, there's an active event 110 + * @vmx_on: 1 if VMX is ON on this cpu 111 111 */ 112 112 struct pt { 113 113 struct perf_output_handle handle; 114 114 int handle_nmi; 115 + int vmx_on; 115 116 }; 116 117 117 118 #endif /* __INTEL_PT_H__ */
+1
arch/x86/events/intel/rapl.c
··· 718 718 break; 719 719 case 60: /* Haswell */ 720 720 case 69: /* Haswell-Celeron */ 721 + case 70: /* Haswell GT3e */ 721 722 case 61: /* Broadwell */ 722 723 case 71: /* Broadwell-H */ 723 724 rapl_cntr_mask = RAPL_IDX_HSW;
+4
arch/x86/include/asm/perf_event.h
··· 285 285 static inline void perf_check_microcode(void) { } 286 286 #endif 287 287 288 + #ifdef CONFIG_CPU_SUP_INTEL 289 + extern void intel_pt_handle_vmx(int on); 290 + #endif 291 + 288 292 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 289 293 extern void amd_pmu_enable_virt(void); 290 294 extern void amd_pmu_disable_virt(void);
+4
arch/x86/kvm/vmx.c
··· 3103 3103 3104 3104 static void kvm_cpu_vmxon(u64 addr) 3105 3105 { 3106 + intel_pt_handle_vmx(1); 3107 + 3106 3108 asm volatile (ASM_VMX_VMXON_RAX 3107 3109 : : "a"(&addr), "m"(addr) 3108 3110 : "memory", "cc"); ··· 3174 3172 static void kvm_cpu_vmxoff(void) 3175 3173 { 3176 3174 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 3175 + 3176 + intel_pt_handle_vmx(0); 3177 3177 } 3178 3178 3179 3179 static void hardware_disable(void)
+38 -17
kernel/events/core.c
··· 412 412 if (ret || !write) 413 413 return ret; 414 414 415 - if (sysctl_perf_cpu_time_max_percent == 100) { 415 + if (sysctl_perf_cpu_time_max_percent == 100 || 416 + sysctl_perf_cpu_time_max_percent == 0) { 416 417 printk(KERN_WARNING 417 418 "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); 418 419 WRITE_ONCE(perf_sample_allowed_ns, 0); ··· 1106 1105 * function. 1107 1106 * 1108 1107 * Lock order: 1108 + * cred_guard_mutex 1109 1109 * task_struct::perf_event_mutex 1110 1110 * perf_event_context::mutex 1111 1111 * perf_event::child_mutex; ··· 3422 3420 find_lively_task_by_vpid(pid_t vpid) 3423 3421 { 3424 3422 struct task_struct *task; 3425 - int err; 3426 3423 3427 3424 rcu_read_lock(); 3428 3425 if (!vpid) ··· 3435 3434 if (!task) 3436 3435 return ERR_PTR(-ESRCH); 3437 3436 3438 - /* Reuse ptrace permission checks for now. */ 3439 - err = -EACCES; 3440 - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) 3441 - goto errout; 3442 - 3443 3437 return task; 3444 - errout: 3445 - put_task_struct(task); 3446 - return ERR_PTR(err); 3447 - 3448 3438 } 3449 3439 3450 3440 /* ··· 8405 8413 8406 8414 get_online_cpus(); 8407 8415 8416 + if (task) { 8417 + err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); 8418 + if (err) 8419 + goto err_cpus; 8420 + 8421 + /* 8422 + * Reuse ptrace permission checks for now. 8423 + * 8424 + * We must hold cred_guard_mutex across this and any potential 8425 + * perf_install_in_context() call for this new event to 8426 + * serialize against exec() altering our credentials (and the 8427 + * perf_event_exit_task() that could imply). 8428 + */ 8429 + err = -EACCES; 8430 + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) 8431 + goto err_cred; 8432 + } 8433 + 8408 8434 if (flags & PERF_FLAG_PID_CGROUP) 8409 8435 cgroup_fd = pid; 8410 8436 ··· 8430 8420 NULL, NULL, cgroup_fd); 8431 8421 if (IS_ERR(event)) { 8432 8422 err = PTR_ERR(event); 8433 - goto err_cpus; 8423 + goto err_cred; 8434 8424 } 8435 8425 8436 8426 if (is_sampling_event(event)) { ··· 8487 8477 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { 8488 8478 err = -EBUSY; 8489 8479 goto err_context; 8490 - } 8491 - 8492 - if (task) { 8493 - put_task_struct(task); 8494 - task = NULL; 8495 8480 } 8496 8481 8497 8482 /* ··· 8586 8581 8587 8582 WARN_ON_ONCE(ctx->parent_ctx); 8588 8583 8584 + /* 8585 + * This is the point on no return; we cannot fail hereafter. This is 8586 + * where we start modifying current state. 8587 + */ 8588 + 8589 8589 if (move_group) { 8590 8590 /* 8591 8591 * See perf_event_ctx_lock() for comments on the details ··· 8662 8652 mutex_unlock(&gctx->mutex); 8663 8653 mutex_unlock(&ctx->mutex); 8664 8654 8655 + if (task) { 8656 + mutex_unlock(&task->signal->cred_guard_mutex); 8657 + put_task_struct(task); 8658 + } 8659 + 8665 8660 put_online_cpus(); 8666 8661 8667 8662 mutex_lock(&current->perf_event_mutex); ··· 8699 8684 */ 8700 8685 if (!event_file) 8701 8686 free_event(event); 8687 + err_cred: 8688 + if (task) 8689 + mutex_unlock(&task->signal->cred_guard_mutex); 8702 8690 err_cpus: 8703 8691 put_online_cpus(); 8704 8692 err_task: ··· 8986 8968 8987 8969 /* 8988 8970 * When a child task exits, feed back event values to parent events. 8971 + * 8972 + * Can be called with cred_guard_mutex held when called from 8973 + * install_exec_creds(). 8989 8974 */ 8990 8975 void perf_event_exit_task(struct task_struct *child) 8991 8976 {