Merge tag 'perf-urgent-2021-08-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Thomas Gleixner:
"A set of perf fixes:

 - Correct the permission checks for perf events which send SIGTRAP
   to a different process, and clean up that code to be more readable
   (see the illustrative userspace snippet after this list).

 - Prevent an out-of-bounds MSR access in the x86 perf code, which
   happened due to an incomplete limit check against the actually
   available hardware counters.

 - Prevent access to the AMD64_EVENTSEL_HOSTONLY bit when running
   inside a guest.

 - Handle counter re-enabling on small cores correctly by issuing the
   ACK right before re-enabling the counter, to prevent a stale PEBS
   record from being kept around"

* tag 'perf-urgent-2021-08-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf/x86/intel: Apply mid ACK for small core
perf/x86/amd: Don't touch the AMD64_EVENTSEL_HOSTONLY bit inside the guest
perf/x86: Fix out of bound MSR access
perf: Refactor permissions check into perf_check_permission()
perf: Fix required permissions if sigtrap is requested

Changed files
4 files changed, 71 insertions(+), 17 deletions(-)

arch/x86/events/core.c (+7 -5)
···
 		return;

 	for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
-		/* Metrics and fake events don't have corresponding HW counters. */
-		if (is_metric_idx(i) || (i == INTEL_PMC_IDX_FIXED_VLBR))
-			continue;
-		else if (i >= INTEL_PMC_IDX_FIXED)
+		if (i >= INTEL_PMC_IDX_FIXED) {
+			/* Metrics and fake events don't have corresponding HW counters. */
+			if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed))
+				continue;
+
 			wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
-		else
+		} else {
 			wrmsrl(x86_pmu_event_addr(i), 0);
+		}
 	}

 	bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);

arch/x86/events/intel/core.c (+15 -8)
···
  */
 static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
-	struct cpu_hw_events *cpuc;
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	bool late_ack = hybrid_bit(cpuc->pmu, late_ack);
+	bool mid_ack = hybrid_bit(cpuc->pmu, mid_ack);
 	int loops;
 	u64 status;
 	int handled;
 	int pmu_enabled;
-
-	cpuc = this_cpu_ptr(&cpu_hw_events);

 	/*
 	 * Save the PMU state.
···
 	 */
 	pmu_enabled = cpuc->enabled;
 	/*
-	 * No known reason to not always do late ACK,
-	 * but just in case do it opt-in.
+	 * In general, the early ACK is only applied for old platforms.
+	 * For the big core starts from Haswell, the late ACK should be
+	 * applied.
+	 * For the small core after Tremont, we have to do the ACK right
+	 * before re-enabling counters, which is in the middle of the
+	 * NMI handler.
 	 */
-	if (!x86_pmu.late_ack)
+	if (!late_ack && !mid_ack)
 		apic_write(APIC_LVTPC, APIC_DM_NMI);
 	intel_bts_disable_local();
 	cpuc->enabled = 0;
···
 		goto again;

 done:
+	if (mid_ack)
+		apic_write(APIC_LVTPC, APIC_DM_NMI);
 	/* Only restore PMU state when it's active. See x86_pmu_disable(). */
 	cpuc->enabled = pmu_enabled;
 	if (pmu_enabled)
···
 	 * have been reset. This avoids spurious NMIs on
 	 * Haswell CPUs.
 	 */
-	if (x86_pmu.late_ack)
+	if (late_ack)
 		apic_write(APIC_LVTPC, APIC_DM_NMI);
 	return handled;
 }
···
 		static_branch_enable(&perf_is_hybrid);
 		x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS;

-		x86_pmu.late_ack = true;
 		x86_pmu.pebs_aliases = NULL;
 		x86_pmu.pebs_prec_dist = true;
 		x86_pmu.pebs_block = true;
···
 		pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
 		pmu->name = "cpu_core";
 		pmu->cpu_type = hybrid_big;
+		pmu->late_ack = true;
 		if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
 			pmu->num_counters = x86_pmu.num_counters + 2;
 			pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
···
 		pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
 		pmu->name = "cpu_atom";
 		pmu->cpu_type = hybrid_small;
+		pmu->mid_ack = true;
 		pmu->num_counters = x86_pmu.num_counters;
 		pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
 		pmu->max_pebs_events = x86_pmu.max_pebs_events;

arch/x86/events/perf_event.h (+17 -1)
···
 	struct event_constraint		*event_constraints;
 	struct event_constraint		*pebs_constraints;
 	struct extra_reg		*extra_regs;
+
+	unsigned int			late_ack	:1,
+					mid_ack		:1,
+					enabled_ack	:1;
 };

 static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
···
 									\
 	__Fp;								\
 }))
+
+#define hybrid_bit(_pmu, _field)					\
+({									\
+	bool __Fp = x86_pmu._field;					\
+									\
+	if (is_hybrid() && (_pmu))					\
+		__Fp = hybrid_pmu(_pmu)->_field;			\
+									\
+	__Fp;								\
+})

 enum hybrid_pmu_type {
 	hybrid_big		= 0x40,
···

 	/* PMI handler bits */
 	unsigned int	late_ack		:1,
+			mid_ack			:1,
 			enabled_ack		:1;
 	/*
 	 * sysfs attrs
···

 static inline void x86_pmu_disable_event(struct perf_event *event)
 {
+	u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
 	struct hw_perf_event *hwc = &event->hw;

-	wrmsrl(hwc->config_base, hwc->config);
+	wrmsrl(hwc->config_base, hwc->config & ~disable_mask);

 	if (is_counter_pair(hwc))
 		wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0);

kernel/events/core.c (+32 -3)
···
 	return gctx;
 }

+static bool
+perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
+{
+	unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
+	bool is_capable = perfmon_capable();
+
+	if (attr->sigtrap) {
+		/*
+		 * perf_event_attr::sigtrap sends signals to the other task.
+		 * Require the current task to also have CAP_KILL.
+		 */
+		rcu_read_lock();
+		is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
+		rcu_read_unlock();
+
+		/*
+		 * If the required capabilities aren't available, checks for
+		 * ptrace permissions: upgrade to ATTACH, since sending signals
+		 * can effectively change the target task.
+		 */
+		ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
+	}
+
+	/*
+	 * Preserve ptrace permission check for backwards compatibility. The
+	 * ptrace check also includes checks that the current task and other
+	 * task have matching uids, and is therefore not done here explicitly.
+	 */
+	return is_capable || ptrace_may_access(task, ptrace_mode);
+}
+
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
···
 		goto err_file;

 	/*
-	 * Preserve ptrace permission check for backwards compatibility.
-	 *
 	 * We must hold exec_update_lock across this and any potential
 	 * perf_install_in_context() call for this new event to
 	 * serialize against exec() altering our credentials (and the
 	 * perf_event_exit_task() that could imply).
 	 */
 	err = -EACCES;
-	if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+	if (!perf_check_permission(&attr, task))
 		goto err_cred;
 	}