Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'perf-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf event updates from Ingo Molnar:
"HW support updates:

- Add uncore support for Intel Comet Lake

- Add RAPL support for Hygon Fam18h

- Add Intel "IIO stack to PMON mapping" support on Skylake-SP CPUs,
which enumerates per device performance counters via sysfs and
enables the perf stat --iiostat functionality

- Add support for Intel "Architectural LBRs", which generalized the
model-specific LBR hardware tracing feature into a
model-independent, architected performance monitoring feature.

Usage is mostly seamless to tooling, as the pre-existing LBR
features are kept, but there are a couple of advantages under the
hood, such as faster context-switching, faster LBR reads, cleaner
exposure of LBR features to guest kernels, etc.

( Since architectural LBRs are supported via XSAVE, there's related
changes to the x86 FPU code as well. )

ftrace/perf updates:

- Add support to add a text poke event to record changes to kernel
text (i.e. self-modifying code) in order to support tracers like
Intel PT decoding through jump labels, kprobes and ftrace
trampolines.

Misc cleanups, smaller fixes..."

* tag 'perf-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (47 commits)
perf/x86/rapl: Add Hygon Fam18h RAPL support
kprobes: Remove unnecessary module_mutex locking from kprobe_optimizer()
x86/perf: Fix a typo
perf: <linux/perf_event.h>: drop a duplicated word
perf/x86/intel/lbr: Support XSAVES for arch LBR read
perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch
x86/fpu/xstate: Add helpers for LBR dynamic supervisor feature
x86/fpu/xstate: Support dynamic supervisor feature for LBR
x86/fpu: Use proper mask to replace full instruction mask
perf/x86: Remove task_ctx_size
perf/x86/intel/lbr: Create kmem_cache for the LBR context data
perf/core: Use kmem_cache to allocate the PMU specific data
perf/core: Factor out functions to allocate/free the task_ctx_data
perf/x86/intel/lbr: Support Architectural LBR
perf/x86/intel/lbr: Factor out intel_pmu_store_lbr
perf/x86/intel/lbr: Factor out rdlbr_all() and wrlbr_all()
perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from} wrappers __always_inline
perf/x86/intel/lbr: Unify the stored format of LBR information
perf/x86/intel/lbr: Support LBR_CTL
perf/x86: Expose CPUID enumeration bits for arch LBR
...

+1943 -280
+33
Documentation/ABI/testing/sysfs-devices-mapping
··· 1 + What: /sys/devices/uncore_iio_x/dieX 2 + Date: February 2020 3 + Contact: Roman Sudarikov <roman.sudarikov@linux.intel.com> 4 + Description: 5 + Each IIO stack (PCIe root port) has its own IIO PMON block, so 6 + each dieX file (where X is die number) holds "Segment:Root Bus" 7 + for PCIe root port, which can be monitored by that IIO PMON 8 + block. 9 + For example, on 4-die Xeon platform with up to 6 IIO stacks per 10 + die and, therefore, 6 IIO PMON blocks per die, the mapping of 11 + IIO PMON block 0 exposes as the following: 12 + 13 + $ ls /sys/devices/uncore_iio_0/die* 14 + -r--r--r-- /sys/devices/uncore_iio_0/die0 15 + -r--r--r-- /sys/devices/uncore_iio_0/die1 16 + -r--r--r-- /sys/devices/uncore_iio_0/die2 17 + -r--r--r-- /sys/devices/uncore_iio_0/die3 18 + 19 + $ tail /sys/devices/uncore_iio_0/die* 20 + ==> /sys/devices/uncore_iio_0/die0 <== 21 + 0000:00 22 + ==> /sys/devices/uncore_iio_0/die1 <== 23 + 0000:40 24 + ==> /sys/devices/uncore_iio_0/die2 <== 25 + 0000:80 26 + ==> /sys/devices/uncore_iio_0/die3 <== 27 + 0000:c0 28 + 29 + Which means: 30 + IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000 31 + IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000 32 + IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000 33 + IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
+18 -10
arch/x86/events/core.c
··· 71 71 struct hw_perf_event *hwc = &event->hw; 72 72 int shift = 64 - x86_pmu.cntval_bits; 73 73 u64 prev_raw_count, new_raw_count; 74 - int idx = hwc->idx; 75 74 u64 delta; 76 75 77 - if (idx == INTEL_PMC_IDX_FIXED_BTS) 76 + if (unlikely(!hwc->event_base)) 78 77 return 0; 79 78 80 79 /* ··· 358 359 if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { 359 360 release_pmc_hardware(); 360 361 release_ds_buffers(); 362 + release_lbr_buffers(); 361 363 mutex_unlock(&pmc_reserve_mutex); 362 364 } 363 365 } ··· 1097 1097 struct cpu_hw_events *cpuc, int i) 1098 1098 { 1099 1099 struct hw_perf_event *hwc = &event->hw; 1100 + int idx; 1100 1101 1101 - hwc->idx = cpuc->assign[i]; 1102 + idx = hwc->idx = cpuc->assign[i]; 1102 1103 hwc->last_cpu = smp_processor_id(); 1103 1104 hwc->last_tag = ++cpuc->tags[i]; 1104 1105 1105 - if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { 1106 + switch (hwc->idx) { 1107 + case INTEL_PMC_IDX_FIXED_BTS: 1108 + case INTEL_PMC_IDX_FIXED_VLBR: 1106 1109 hwc->config_base = 0; 1107 1110 hwc->event_base = 0; 1108 - } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { 1111 + break; 1112 + 1113 + case INTEL_PMC_IDX_FIXED ... 
INTEL_PMC_IDX_FIXED_BTS-1: 1109 1114 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 1110 - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); 1111 - hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; 1112 - } else { 1115 + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + 1116 + (idx - INTEL_PMC_IDX_FIXED); 1117 + hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | 1<<30; 1118 + break; 1119 + 1120 + default: 1113 1121 hwc->config_base = x86_pmu_config_addr(hwc->idx); 1114 1122 hwc->event_base = x86_pmu_event_addr(hwc->idx); 1115 1123 hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); 1124 + break; 1116 1125 } 1117 1126 } 1118 1127 ··· 1242 1233 s64 period = hwc->sample_period; 1243 1234 int ret = 0, idx = hwc->idx; 1244 1235 1245 - if (idx == INTEL_PMC_IDX_FIXED_BTS) 1236 + if (unlikely(!hwc->event_base)) 1246 1237 return 0; 1247 1238 1248 1239 /* ··· 2372 2363 2373 2364 .event_idx = x86_pmu_event_idx, 2374 2365 .sched_task = x86_pmu_sched_task, 2375 - .task_ctx_size = sizeof(struct x86_perf_task_context), 2376 2366 .swap_task_ctx = x86_pmu_swap_task_ctx, 2377 2367 .check_period = x86_pmu_check_period, 2378 2368
+86 -41
arch/x86/events/intel/core.c
··· 2136 2136 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 2137 2137 } 2138 2138 2139 - static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) 2139 + static inline bool event_is_checkpointed(struct perf_event *event) 2140 2140 { 2141 + return unlikely(event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; 2142 + } 2143 + 2144 + static inline void intel_set_masks(struct perf_event *event, int idx) 2145 + { 2146 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2147 + 2148 + if (event->attr.exclude_host) 2149 + __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask); 2150 + if (event->attr.exclude_guest) 2151 + __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask); 2152 + if (event_is_checkpointed(event)) 2153 + __set_bit(idx, (unsigned long *)&cpuc->intel_cp_status); 2154 + } 2155 + 2156 + static inline void intel_clear_masks(struct perf_event *event, int idx) 2157 + { 2158 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2159 + 2160 + __clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask); 2161 + __clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask); 2162 + __clear_bit(idx, (unsigned long *)&cpuc->intel_cp_status); 2163 + } 2164 + 2165 + static void intel_pmu_disable_fixed(struct perf_event *event) 2166 + { 2167 + struct hw_perf_event *hwc = &event->hw; 2141 2168 int idx = hwc->idx - INTEL_PMC_IDX_FIXED; 2142 2169 u64 ctrl_val, mask; 2143 2170 ··· 2175 2148 wrmsrl(hwc->config_base, ctrl_val); 2176 2149 } 2177 2150 2178 - static inline bool event_is_checkpointed(struct perf_event *event) 2179 - { 2180 - return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; 2181 - } 2182 - 2183 2151 static void intel_pmu_disable_event(struct perf_event *event) 2184 2152 { 2185 2153 struct hw_perf_event *hwc = &event->hw; 2186 - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2154 + int idx = hwc->idx; 2187 2155 2188 - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { 2156 + if (idx < INTEL_PMC_IDX_FIXED) { 2157 + 
intel_clear_masks(event, idx); 2158 + x86_pmu_disable_event(event); 2159 + } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { 2160 + intel_clear_masks(event, idx); 2161 + intel_pmu_disable_fixed(event); 2162 + } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { 2189 2163 intel_pmu_disable_bts(); 2190 2164 intel_pmu_drain_bts_buffer(); 2191 - return; 2192 - } 2193 - 2194 - cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); 2195 - cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); 2196 - cpuc->intel_cp_status &= ~(1ull << hwc->idx); 2197 - 2198 - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) 2199 - intel_pmu_disable_fixed(hwc); 2200 - else 2201 - x86_pmu_disable_event(event); 2165 + } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) 2166 + intel_clear_masks(event, idx); 2202 2167 2203 2168 /* 2204 2169 * Needs to be called after x86_pmu_disable_event, ··· 2257 2238 static void intel_pmu_enable_event(struct perf_event *event) 2258 2239 { 2259 2240 struct hw_perf_event *hwc = &event->hw; 2260 - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2261 - 2262 - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { 2263 - if (!__this_cpu_read(cpu_hw_events.enabled)) 2264 - return; 2265 - 2266 - intel_pmu_enable_bts(hwc->config); 2267 - return; 2268 - } 2269 - 2270 - if (event->attr.exclude_host) 2271 - cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); 2272 - if (event->attr.exclude_guest) 2273 - cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); 2274 - 2275 - if (unlikely(event_is_checkpointed(event))) 2276 - cpuc->intel_cp_status |= (1ull << hwc->idx); 2241 + int idx = hwc->idx; 2277 2242 2278 2243 if (unlikely(event->attr.precise_ip)) 2279 2244 intel_pmu_pebs_enable(event); 2280 2245 2281 - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 2246 + if (idx < INTEL_PMC_IDX_FIXED) { 2247 + intel_set_masks(event, idx); 2248 + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); 2249 + } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { 2250 + 
intel_set_masks(event, idx); 2282 2251 intel_pmu_enable_fixed(event); 2283 - return; 2284 - } 2285 - 2286 - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); 2252 + } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { 2253 + if (!__this_cpu_read(cpu_hw_events.enabled)) 2254 + return; 2255 + intel_pmu_enable_bts(hwc->config); 2256 + } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) 2257 + intel_set_masks(event, idx); 2287 2258 } 2288 2259 2289 2260 static void intel_pmu_add_event(struct perf_event *event) ··· 2623 2614 return NULL; 2624 2615 } 2625 2616 2617 + /* 2618 + * Note: matches a fake event, like Fixed2. 2619 + */ 2620 + static struct event_constraint * 2621 + intel_vlbr_constraints(struct perf_event *event) 2622 + { 2623 + struct event_constraint *c = &vlbr_constraint; 2624 + 2625 + if (unlikely(constraint_match(c, event->hw.config))) 2626 + return c; 2627 + 2628 + return NULL; 2629 + } 2630 + 2626 2631 static int intel_alt_er(int idx, u64 config) 2627 2632 { 2628 2633 int alt_idx = idx; ··· 2826 2803 struct perf_event *event) 2827 2804 { 2828 2805 struct event_constraint *c; 2806 + 2807 + c = intel_vlbr_constraints(event); 2808 + if (c) 2809 + return c; 2829 2810 2830 2811 c = intel_bts_constraints(event); 2831 2812 if (c) ··· 3978 3951 .cpu_dead = intel_pmu_cpu_dead, 3979 3952 3980 3953 .check_period = intel_pmu_check_period, 3954 + 3955 + .lbr_reset = intel_pmu_lbr_reset_64, 3956 + .lbr_read = intel_pmu_lbr_read_64, 3957 + .lbr_save = intel_pmu_lbr_save, 3958 + .lbr_restore = intel_pmu_lbr_restore, 3981 3959 }; 3982 3960 3983 3961 static __initconst const struct x86_pmu intel_pmu = { ··· 4028 3996 .check_period = intel_pmu_check_period, 4029 3997 4030 3998 .aux_output_match = intel_pmu_aux_output_match, 3999 + 4000 + .lbr_reset = intel_pmu_lbr_reset_64, 4001 + .lbr_read = intel_pmu_lbr_read_64, 4002 + .lbr_save = intel_pmu_lbr_save, 4003 + .lbr_restore = intel_pmu_lbr_restore, 4031 4004 }; 4032 4005 4033 4006 static __init void intel_clovertown_quirk(void) 
··· 4658 4621 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); 4659 4622 x86_pmu.intel_cap.capabilities = capabilities; 4660 4623 } 4624 + 4625 + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) { 4626 + x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; 4627 + x86_pmu.lbr_read = intel_pmu_lbr_read_32; 4628 + } 4629 + 4630 + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) 4631 + intel_pmu_arch_lbr_init(); 4661 4632 4662 4633 intel_ds_init(); 4663 4634
+3 -3
arch/x86/events/intel/ds.c
··· 954 954 if (pebs_data_cfg & PEBS_DATACFG_XMMS) 955 955 sz += sizeof(struct pebs_xmm); 956 956 if (pebs_data_cfg & PEBS_DATACFG_LBRS) 957 - sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); 957 + sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); 958 958 959 959 cpuc->pebs_record_size = sz; 960 960 } ··· 1595 1595 } 1596 1596 1597 1597 if (format_size & PEBS_DATACFG_LBRS) { 1598 - struct pebs_lbr *lbr = next_record; 1598 + struct lbr_entry *lbr = next_record; 1599 1599 int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) 1600 1600 & 0xff) + 1; 1601 - next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); 1601 + next_record = next_record + num_lbr * sizeof(struct lbr_entry); 1602 1602 1603 1603 if (has_branch_stack(event)) { 1604 1604 intel_pmu_store_pebs_lbrs(lbr);
+618 -115
arch/x86/events/intel/lbr.c
··· 8 8 9 9 #include "../perf_event.h" 10 10 11 - enum { 12 - LBR_FORMAT_32 = 0x00, 13 - LBR_FORMAT_LIP = 0x01, 14 - LBR_FORMAT_EIP = 0x02, 15 - LBR_FORMAT_EIP_FLAGS = 0x03, 16 - LBR_FORMAT_EIP_FLAGS2 = 0x04, 17 - LBR_FORMAT_INFO = 0x05, 18 - LBR_FORMAT_TIME = 0x06, 19 - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, 20 - }; 21 - 22 11 static const enum { 23 12 LBR_EIP_FLAGS = 1, 24 13 LBR_TSX = 2, ··· 132 143 X86_BR_IRQ |\ 133 144 X86_BR_INT) 134 145 146 + /* 147 + * Intel LBR_CTL bits 148 + * 149 + * Hardware branch filter for Arch LBR 150 + */ 151 + #define ARCH_LBR_KERNEL_BIT 1 /* capture at ring0 */ 152 + #define ARCH_LBR_USER_BIT 2 /* capture at ring > 0 */ 153 + #define ARCH_LBR_CALL_STACK_BIT 3 /* enable call stack */ 154 + #define ARCH_LBR_JCC_BIT 16 /* capture conditional branches */ 155 + #define ARCH_LBR_REL_JMP_BIT 17 /* capture relative jumps */ 156 + #define ARCH_LBR_IND_JMP_BIT 18 /* capture indirect jumps */ 157 + #define ARCH_LBR_REL_CALL_BIT 19 /* capture relative calls */ 158 + #define ARCH_LBR_IND_CALL_BIT 20 /* capture indirect calls */ 159 + #define ARCH_LBR_RETURN_BIT 21 /* capture near returns */ 160 + #define ARCH_LBR_OTHER_BRANCH_BIT 22 /* capture other branches */ 161 + 162 + #define ARCH_LBR_KERNEL (1ULL << ARCH_LBR_KERNEL_BIT) 163 + #define ARCH_LBR_USER (1ULL << ARCH_LBR_USER_BIT) 164 + #define ARCH_LBR_CALL_STACK (1ULL << ARCH_LBR_CALL_STACK_BIT) 165 + #define ARCH_LBR_JCC (1ULL << ARCH_LBR_JCC_BIT) 166 + #define ARCH_LBR_REL_JMP (1ULL << ARCH_LBR_REL_JMP_BIT) 167 + #define ARCH_LBR_IND_JMP (1ULL << ARCH_LBR_IND_JMP_BIT) 168 + #define ARCH_LBR_REL_CALL (1ULL << ARCH_LBR_REL_CALL_BIT) 169 + #define ARCH_LBR_IND_CALL (1ULL << ARCH_LBR_IND_CALL_BIT) 170 + #define ARCH_LBR_RETURN (1ULL << ARCH_LBR_RETURN_BIT) 171 + #define ARCH_LBR_OTHER_BRANCH (1ULL << ARCH_LBR_OTHER_BRANCH_BIT) 172 + 173 + #define ARCH_LBR_ANY \ 174 + (ARCH_LBR_JCC |\ 175 + ARCH_LBR_REL_JMP |\ 176 + ARCH_LBR_IND_JMP |\ 177 + ARCH_LBR_REL_CALL |\ 178 + ARCH_LBR_IND_CALL |\ 179 
+ ARCH_LBR_RETURN |\ 180 + ARCH_LBR_OTHER_BRANCH) 181 + 182 + #define ARCH_LBR_CTL_MASK 0x7f000e 183 + 135 184 static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); 185 + 186 + static __always_inline bool is_lbr_call_stack_bit_set(u64 config) 187 + { 188 + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) 189 + return !!(config & ARCH_LBR_CALL_STACK); 190 + 191 + return !!(config & LBR_CALL_STACK); 192 + } 136 193 137 194 /* 138 195 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI ··· 203 168 */ 204 169 if (cpuc->lbr_sel) 205 170 lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; 206 - if (!pmi && cpuc->lbr_sel) 171 + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel) 207 172 wrmsrl(MSR_LBR_SELECT, lbr_select); 208 173 209 174 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 210 175 orig_debugctl = debugctl; 211 - debugctl |= DEBUGCTLMSR_LBR; 176 + 177 + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) 178 + debugctl |= DEBUGCTLMSR_LBR; 212 179 /* 213 180 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. 214 181 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions 215 182 * may cause superfluous increase/decrease of LBR_TOS. 
216 183 */ 217 - if (!(lbr_select & LBR_CALL_STACK)) 184 + if (is_lbr_call_stack_bit_set(lbr_select)) 185 + debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 186 + else 218 187 debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 188 + 219 189 if (orig_debugctl != debugctl) 220 190 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 191 + 192 + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) 193 + wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); 221 194 } 222 195 223 196 static void __intel_pmu_lbr_disable(void) 224 197 { 225 198 u64 debugctl; 226 199 200 + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { 201 + wrmsrl(MSR_ARCH_LBR_CTL, 0); 202 + return; 203 + } 204 + 227 205 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 228 206 debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 229 207 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 230 208 } 231 209 232 - static void intel_pmu_lbr_reset_32(void) 210 + void intel_pmu_lbr_reset_32(void) 233 211 { 234 212 int i; 235 213 ··· 250 202 wrmsrl(x86_pmu.lbr_from + i, 0); 251 203 } 252 204 253 - static void intel_pmu_lbr_reset_64(void) 205 + void intel_pmu_lbr_reset_64(void) 254 206 { 255 207 int i; 256 208 ··· 258 210 wrmsrl(x86_pmu.lbr_from + i, 0); 259 211 wrmsrl(x86_pmu.lbr_to + i, 0); 260 212 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) 261 - wrmsrl(MSR_LBR_INFO_0 + i, 0); 213 + wrmsrl(x86_pmu.lbr_info + i, 0); 262 214 } 215 + } 216 + 217 + static void intel_pmu_arch_lbr_reset(void) 218 + { 219 + /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */ 220 + wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); 263 221 } 264 222 265 223 void intel_pmu_lbr_reset(void) ··· 275 221 if (!x86_pmu.lbr_nr) 276 222 return; 277 223 278 - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) 279 - intel_pmu_lbr_reset_32(); 280 - else 281 - intel_pmu_lbr_reset_64(); 224 + x86_pmu.lbr_reset(); 282 225 283 226 cpuc->last_task_ctx = NULL; 284 227 cpuc->last_log_id = 0; ··· 359 308 return val; 360 309 } 361 310 362 - static inline void wrlbr_from(unsigned 
int idx, u64 val) 311 + static __always_inline void wrlbr_from(unsigned int idx, u64 val) 363 312 { 364 313 val = lbr_from_signext_quirk_wr(val); 365 314 wrmsrl(x86_pmu.lbr_from + idx, val); 366 315 } 367 316 368 - static inline void wrlbr_to(unsigned int idx, u64 val) 317 + static __always_inline void wrlbr_to(unsigned int idx, u64 val) 369 318 { 370 319 wrmsrl(x86_pmu.lbr_to + idx, val); 371 320 } 372 321 373 - static inline u64 rdlbr_from(unsigned int idx) 322 + static __always_inline void wrlbr_info(unsigned int idx, u64 val) 323 + { 324 + wrmsrl(x86_pmu.lbr_info + idx, val); 325 + } 326 + 327 + static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) 374 328 { 375 329 u64 val; 330 + 331 + if (lbr) 332 + return lbr->from; 376 333 377 334 rdmsrl(x86_pmu.lbr_from + idx, val); 378 335 379 336 return lbr_from_signext_quirk_rd(val); 380 337 } 381 338 382 - static inline u64 rdlbr_to(unsigned int idx) 339 + static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr) 383 340 { 384 341 u64 val; 342 + 343 + if (lbr) 344 + return lbr->to; 385 345 386 346 rdmsrl(x86_pmu.lbr_to + idx, val); 387 347 388 348 return val; 389 349 } 390 350 391 - static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) 351 + static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) 392 352 { 353 + u64 val; 354 + 355 + if (lbr) 356 + return lbr->info; 357 + 358 + rdmsrl(x86_pmu.lbr_info + idx, val); 359 + 360 + return val; 361 + } 362 + 363 + static inline void 364 + wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) 365 + { 366 + wrlbr_from(idx, lbr->from); 367 + wrlbr_to(idx, lbr->to); 368 + if (need_info) 369 + wrlbr_info(idx, lbr->info); 370 + } 371 + 372 + static inline bool 373 + rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) 374 + { 375 + u64 from = rdlbr_from(idx, NULL); 376 + 377 + /* Don't read invalid entry */ 378 + if (!from) 379 + return false; 380 + 381 + lbr->from = 
from; 382 + lbr->to = rdlbr_to(idx, NULL); 383 + if (need_info) 384 + lbr->info = rdlbr_info(idx, NULL); 385 + 386 + return true; 387 + } 388 + 389 + void intel_pmu_lbr_restore(void *ctx) 390 + { 391 + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; 393 392 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 393 + struct x86_perf_task_context *task_ctx = ctx; 394 394 int i; 395 395 unsigned lbr_idx, mask; 396 - u64 tos; 397 - 398 - if (task_ctx->lbr_callstack_users == 0 || 399 - task_ctx->lbr_stack_state == LBR_NONE) { 400 - intel_pmu_lbr_reset(); 401 - return; 402 - } 403 - 404 - tos = task_ctx->tos; 405 - /* 406 - * Does not restore the LBR registers, if 407 - * - No one else touched them, and 408 - * - Did not enter C6 409 - */ 410 - if ((task_ctx == cpuc->last_task_ctx) && 411 - (task_ctx->log_id == cpuc->last_log_id) && 412 - rdlbr_from(tos)) { 413 - task_ctx->lbr_stack_state = LBR_NONE; 414 - return; 415 - } 396 + u64 tos = task_ctx->tos; 416 397 417 398 mask = x86_pmu.lbr_nr - 1; 418 399 for (i = 0; i < task_ctx->valid_lbrs; i++) { 419 400 lbr_idx = (tos - i) & mask; 420 - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); 421 - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); 422 - 423 - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) 424 - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); 401 + wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info); 425 402 } 426 403 427 404 for (; i < x86_pmu.lbr_nr; i++) { ··· 457 378 wrlbr_from(lbr_idx, 0); 458 379 wrlbr_to(lbr_idx, 0); 459 380 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) 460 - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); 381 + wrlbr_info(lbr_idx, 0); 461 382 } 462 383 463 384 wrmsrl(x86_pmu.lbr_tos, tos); 464 - task_ctx->lbr_stack_state = LBR_NONE; 385 + 386 + if (cpuc->lbr_select) 387 + wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); 465 388 } 466 389 467 - static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) 390 + static void intel_pmu_arch_lbr_restore(void *ctx) 468 
391 { 469 - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 470 - unsigned lbr_idx, mask; 471 - u64 tos, from; 392 + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; 393 + struct lbr_entry *entries = task_ctx->entries; 472 394 int i; 473 395 474 - if (task_ctx->lbr_callstack_users == 0) { 475 - task_ctx->lbr_stack_state = LBR_NONE; 396 + /* Fast reset the LBRs before restore if the call stack is not full. */ 397 + if (!entries[x86_pmu.lbr_nr - 1].from) 398 + intel_pmu_arch_lbr_reset(); 399 + 400 + for (i = 0; i < x86_pmu.lbr_nr; i++) { 401 + if (!entries[i].from) 402 + break; 403 + wrlbr_all(&entries[i], i, true); 404 + } 405 + } 406 + 407 + /* 408 + * Restore the Architecture LBR state from the xsave area in the perf 409 + * context data for the task via the XRSTORS instruction. 410 + */ 411 + static void intel_pmu_arch_lbr_xrstors(void *ctx) 412 + { 413 + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; 414 + 415 + copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR); 416 + } 417 + 418 + static __always_inline bool lbr_is_reset_in_cstate(void *ctx) 419 + { 420 + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) 421 + return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL); 422 + 423 + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); 424 + } 425 + 426 + static void __intel_pmu_lbr_restore(void *ctx) 427 + { 428 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 429 + 430 + if (task_context_opt(ctx)->lbr_callstack_users == 0 || 431 + task_context_opt(ctx)->lbr_stack_state == LBR_NONE) { 432 + intel_pmu_lbr_reset(); 476 433 return; 477 434 } 435 + 436 + /* 437 + * Does not restore the LBR registers, if 438 + * - No one else touched them, and 439 + * - Was not cleared in Cstate 440 + */ 441 + if ((ctx == cpuc->last_task_ctx) && 442 + (task_context_opt(ctx)->log_id == cpuc->last_log_id) && 443 + !lbr_is_reset_in_cstate(ctx)) { 444 + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; 445 + return; 
446 + } 447 + 448 + x86_pmu.lbr_restore(ctx); 449 + 450 + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; 451 + } 452 + 453 + void intel_pmu_lbr_save(void *ctx) 454 + { 455 + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; 456 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 457 + struct x86_perf_task_context *task_ctx = ctx; 458 + unsigned lbr_idx, mask; 459 + u64 tos; 460 + int i; 478 461 479 462 mask = x86_pmu.lbr_nr - 1; 480 463 tos = intel_pmu_lbr_tos(); 481 464 for (i = 0; i < x86_pmu.lbr_nr; i++) { 482 465 lbr_idx = (tos - i) & mask; 483 - from = rdlbr_from(lbr_idx); 484 - if (!from) 466 + if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info)) 485 467 break; 486 - task_ctx->lbr_from[i] = from; 487 - task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); 488 - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) 489 - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); 490 468 } 491 469 task_ctx->valid_lbrs = i; 492 470 task_ctx->tos = tos; 493 - task_ctx->lbr_stack_state = LBR_VALID; 494 471 495 - cpuc->last_task_ctx = task_ctx; 496 - cpuc->last_log_id = ++task_ctx->log_id; 472 + if (cpuc->lbr_select) 473 + rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); 474 + } 475 + 476 + static void intel_pmu_arch_lbr_save(void *ctx) 477 + { 478 + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; 479 + struct lbr_entry *entries = task_ctx->entries; 480 + int i; 481 + 482 + for (i = 0; i < x86_pmu.lbr_nr; i++) { 483 + if (!rdlbr_all(&entries[i], i, true)) 484 + break; 485 + } 486 + 487 + /* LBR call stack is not full. Reset is required in restore. */ 488 + if (i < x86_pmu.lbr_nr) 489 + entries[x86_pmu.lbr_nr - 1].from = 0; 490 + } 491 + 492 + /* 493 + * Save the Architecture LBR state to the xsave area in the perf 494 + * context data for the task via the XSAVES instruction. 
495 + */ 496 + static void intel_pmu_arch_lbr_xsaves(void *ctx) 497 + { 498 + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; 499 + 500 + copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR); 501 + } 502 + 503 + static void __intel_pmu_lbr_save(void *ctx) 504 + { 505 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 506 + 507 + if (task_context_opt(ctx)->lbr_callstack_users == 0) { 508 + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; 509 + return; 510 + } 511 + 512 + x86_pmu.lbr_save(ctx); 513 + 514 + task_context_opt(ctx)->lbr_stack_state = LBR_VALID; 515 + 516 + cpuc->last_task_ctx = ctx; 517 + cpuc->last_log_id = ++task_context_opt(ctx)->log_id; 497 518 } 498 519 499 520 void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, 500 521 struct perf_event_context *next) 501 522 { 502 - struct x86_perf_task_context *prev_ctx_data, *next_ctx_data; 523 + void *prev_ctx_data, *next_ctx_data; 503 524 504 525 swap(prev->task_ctx_data, next->task_ctx_data); 505 526 ··· 615 436 if (!prev_ctx_data || !next_ctx_data) 616 437 return; 617 438 618 - swap(prev_ctx_data->lbr_callstack_users, 619 - next_ctx_data->lbr_callstack_users); 439 + swap(task_context_opt(prev_ctx_data)->lbr_callstack_users, 440 + task_context_opt(next_ctx_data)->lbr_callstack_users); 620 441 } 621 442 622 443 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) 623 444 { 624 445 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 625 - struct x86_perf_task_context *task_ctx; 446 + void *task_ctx; 626 447 627 448 if (!cpuc->lbr_users) 628 449 return; ··· 658 479 659 480 void intel_pmu_lbr_add(struct perf_event *event) 660 481 { 482 + struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache; 661 483 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 662 - struct x86_perf_task_context *task_ctx; 663 484 664 485 if (!x86_pmu.lbr_nr) 665 486 return; 666 487 488 + if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) 489 + 
cpuc->lbr_select = 1; 490 + 667 491 cpuc->br_sel = event->hw.branch_reg.reg; 668 492 669 - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { 670 - task_ctx = event->ctx->task_ctx_data; 671 - task_ctx->lbr_callstack_users++; 672 - } 493 + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) 494 + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; 673 495 674 496 /* 675 497 * Request pmu::sched_task() callback, which will fire inside the ··· 696 516 perf_sched_cb_inc(event->ctx->pmu); 697 517 if (!cpuc->lbr_users++ && !event->total_time_running) 698 518 intel_pmu_lbr_reset(); 519 + 520 + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && 521 + kmem_cache && !cpuc->lbr_xsave && 522 + (cpuc->lbr_users != cpuc->lbr_pebs_users)) 523 + cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL); 524 + } 525 + 526 + void release_lbr_buffers(void) 527 + { 528 + struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache; 529 + struct cpu_hw_events *cpuc; 530 + int cpu; 531 + 532 + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) 533 + return; 534 + 535 + for_each_possible_cpu(cpu) { 536 + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); 537 + if (kmem_cache && cpuc->lbr_xsave) { 538 + kmem_cache_free(kmem_cache, cpuc->lbr_xsave); 539 + cpuc->lbr_xsave = NULL; 540 + } 541 + } 699 542 } 700 543 701 544 void intel_pmu_lbr_del(struct perf_event *event) 702 545 { 703 546 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 704 - struct x86_perf_task_context *task_ctx; 705 547 706 548 if (!x86_pmu.lbr_nr) 707 549 return; 708 550 709 551 if (branch_user_callstack(cpuc->br_sel) && 710 - event->ctx->task_ctx_data) { 711 - task_ctx = event->ctx->task_ctx_data; 712 - task_ctx->lbr_callstack_users--; 713 - } 552 + event->ctx->task_ctx_data) 553 + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; 554 + 555 + if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) 556 + cpuc->lbr_select = 0; 714 557 715 558 if 
(x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) 716 559 cpuc->lbr_pebs_users--; ··· 743 540 perf_sched_cb_dec(event->ctx->pmu); 744 541 } 745 542 543 + static inline bool vlbr_exclude_host(void) 544 + { 545 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 546 + 547 + return test_bit(INTEL_PMC_IDX_FIXED_VLBR, 548 + (unsigned long *)&cpuc->intel_ctrl_guest_mask); 549 + } 550 + 746 551 void intel_pmu_lbr_enable_all(bool pmi) 747 552 { 748 553 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 749 554 750 - if (cpuc->lbr_users) 555 + if (cpuc->lbr_users && !vlbr_exclude_host()) 751 556 __intel_pmu_lbr_enable(pmi); 752 557 } 753 558 ··· 763 552 { 764 553 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 765 554 766 - if (cpuc->lbr_users) 555 + if (cpuc->lbr_users && !vlbr_exclude_host()) 767 556 __intel_pmu_lbr_disable(); 768 557 } 769 558 770 - static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) 559 + void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) 771 560 { 772 561 unsigned long mask = x86_pmu.lbr_nr - 1; 773 562 u64 tos = intel_pmu_lbr_tos(); ··· 804 593 * is the same as the linear address, allowing us to merge the LIP and EIP 805 594 * LBR formats. 
806 595 */ 807 - static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) 596 + void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) 808 597 { 809 598 bool need_info = false, call_stack = false; 810 599 unsigned long mask = x86_pmu.lbr_nr - 1; ··· 827 616 u16 cycles = 0; 828 617 int lbr_flags = lbr_desc[lbr_format]; 829 618 830 - from = rdlbr_from(lbr_idx); 831 - to = rdlbr_to(lbr_idx); 619 + from = rdlbr_from(lbr_idx, NULL); 620 + to = rdlbr_to(lbr_idx, NULL); 832 621 833 622 /* 834 623 * Read LBR call stack entries ··· 840 629 if (lbr_format == LBR_FORMAT_INFO && need_info) { 841 630 u64 info; 842 631 843 - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); 632 + info = rdlbr_info(lbr_idx, NULL); 844 633 mis = !!(info & LBR_INFO_MISPRED); 845 634 pred = !mis; 846 635 in_tx = !!(info & LBR_INFO_IN_TX); ··· 895 684 cpuc->lbr_stack.hw_idx = tos; 896 685 } 897 686 687 + static __always_inline int get_lbr_br_type(u64 info) 688 + { 689 + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type) 690 + return 0; 691 + 692 + return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; 693 + } 694 + 695 + static __always_inline bool get_lbr_mispred(u64 info) 696 + { 697 + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) 698 + return 0; 699 + 700 + return !!(info & LBR_INFO_MISPRED); 701 + } 702 + 703 + static __always_inline bool get_lbr_predicted(u64 info) 704 + { 705 + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) 706 + return 0; 707 + 708 + return !(info & LBR_INFO_MISPRED); 709 + } 710 + 711 + static __always_inline bool get_lbr_cycles(u64 info) 712 + { 713 + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && 714 + !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) 715 + return 0; 716 + 717 + return info & LBR_INFO_CYCLES; 718 + } 719 + 720 + static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, 721 + struct lbr_entry *entries) 722 + { 723 + struct perf_branch_entry *e; 724 + struct lbr_entry *lbr; 725 + u64 from, to, 
info; 726 + int i; 727 + 728 + for (i = 0; i < x86_pmu.lbr_nr; i++) { 729 + lbr = entries ? &entries[i] : NULL; 730 + e = &cpuc->lbr_entries[i]; 731 + 732 + from = rdlbr_from(i, lbr); 733 + /* 734 + * Read LBR entries until invalid entry (0s) is detected. 735 + */ 736 + if (!from) 737 + break; 738 + 739 + to = rdlbr_to(i, lbr); 740 + info = rdlbr_info(i, lbr); 741 + 742 + e->from = from; 743 + e->to = to; 744 + e->mispred = get_lbr_mispred(info); 745 + e->predicted = get_lbr_predicted(info); 746 + e->in_tx = !!(info & LBR_INFO_IN_TX); 747 + e->abort = !!(info & LBR_INFO_ABORT); 748 + e->cycles = get_lbr_cycles(info); 749 + e->type = get_lbr_br_type(info); 750 + e->reserved = 0; 751 + } 752 + 753 + cpuc->lbr_stack.nr = i; 754 + } 755 + 756 + static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) 757 + { 758 + intel_pmu_store_lbr(cpuc, NULL); 759 + } 760 + 761 + static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc) 762 + { 763 + struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave; 764 + 765 + if (!xsave) { 766 + intel_pmu_store_lbr(cpuc, NULL); 767 + return; 768 + } 769 + copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR); 770 + 771 + intel_pmu_store_lbr(cpuc, xsave->lbr.entries); 772 + } 773 + 898 774 void intel_pmu_lbr_read(void) 899 775 { 900 776 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); ··· 992 694 * This could be smarter and actually check the event, 993 695 * but this simple approach seems to work for now. 
994 696 */ 995 - if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users) 697 + if (!cpuc->lbr_users || vlbr_exclude_host() || 698 + cpuc->lbr_users == cpuc->lbr_pebs_users) 996 699 return; 997 700 998 - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) 999 - intel_pmu_lbr_read_32(cpuc); 1000 - else 1001 - intel_pmu_lbr_read_64(cpuc); 701 + x86_pmu.lbr_read(cpuc); 1002 702 1003 703 intel_pmu_lbr_filter(cpuc); 1004 704 } ··· 1095 799 1096 800 reg = &event->hw.branch_reg; 1097 801 reg->idx = EXTRA_REG_LBR; 802 + 803 + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { 804 + reg->config = mask; 805 + return 0; 806 + } 1098 807 1099 808 /* 1100 809 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate ··· 1357 1056 return PERF_BR_UNKNOWN; 1358 1057 } 1359 1058 1059 + enum { 1060 + ARCH_LBR_BR_TYPE_JCC = 0, 1061 + ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, 1062 + ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2, 1063 + ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3, 1064 + ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4, 1065 + ARCH_LBR_BR_TYPE_NEAR_RET = 5, 1066 + ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET, 1067 + 1068 + ARCH_LBR_BR_TYPE_MAP_MAX = 16, 1069 + }; 1070 + 1071 + static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = { 1072 + [ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC, 1073 + [ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP, 1074 + [ARCH_LBR_BR_TYPE_NEAR_REL_JMP] = X86_BR_JMP, 1075 + [ARCH_LBR_BR_TYPE_NEAR_IND_CALL] = X86_BR_IND_CALL, 1076 + [ARCH_LBR_BR_TYPE_NEAR_REL_CALL] = X86_BR_CALL, 1077 + [ARCH_LBR_BR_TYPE_NEAR_RET] = X86_BR_RET, 1078 + }; 1079 + 1360 1080 /* 1361 1081 * implement actual branch filter based on user demand. 
1362 1082 * Hardware may not exactly satisfy that request, thus ··· 1390 1068 { 1391 1069 u64 from, to; 1392 1070 int br_sel = cpuc->br_sel; 1393 - int i, j, type; 1071 + int i, j, type, to_plm; 1394 1072 bool compress = false; 1395 1073 1396 1074 /* if sampling all branches, then nothing to filter */ ··· 1402 1080 1403 1081 from = cpuc->lbr_entries[i].from; 1404 1082 to = cpuc->lbr_entries[i].to; 1083 + type = cpuc->lbr_entries[i].type; 1405 1084 1406 - type = branch_type(from, to, cpuc->lbr_entries[i].abort); 1085 + /* 1086 + * Parse the branch type recorded in LBR_x_INFO MSR. 1087 + * Doesn't support OTHER_BRANCH decoding for now. 1088 + * OTHER_BRANCH branch type still rely on software decoding. 1089 + */ 1090 + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && 1091 + type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) { 1092 + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; 1093 + type = arch_lbr_br_type_map[type] | to_plm; 1094 + } else 1095 + type = branch_type(from, to, cpuc->lbr_entries[i].abort); 1407 1096 if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { 1408 1097 if (cpuc->lbr_entries[i].in_tx) 1409 1098 type |= X86_BR_IN_TX; ··· 1449 1116 } 1450 1117 } 1451 1118 1452 - void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) 1119 + void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) 1453 1120 { 1454 1121 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1455 - int i; 1456 1122 1457 - cpuc->lbr_stack.nr = x86_pmu.lbr_nr; 1458 - 1459 - /* Cannot get TOS for large PEBS */ 1460 - if (cpuc->n_pebs == cpuc->n_large_pebs) 1123 + /* Cannot get TOS for large PEBS and Arch LBR */ 1124 + if (static_cpu_has(X86_FEATURE_ARCH_LBR) || 1125 + (cpuc->n_pebs == cpuc->n_large_pebs)) 1461 1126 cpuc->lbr_stack.hw_idx = -1ULL; 1462 1127 else 1463 1128 cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); 1464 1129 1465 - for (i = 0; i < x86_pmu.lbr_nr; i++) { 1466 - u64 info = lbr->lbr[i].info; 1467 - struct perf_branch_entry *e = &cpuc->lbr_entries[i]; 1468 - 1469 - e->from = 
lbr->lbr[i].from; 1470 - e->to = lbr->lbr[i].to; 1471 - e->mispred = !!(info & LBR_INFO_MISPRED); 1472 - e->predicted = !(info & LBR_INFO_MISPRED); 1473 - e->in_tx = !!(info & LBR_INFO_IN_TX); 1474 - e->abort = !!(info & LBR_INFO_ABORT); 1475 - e->cycles = info & LBR_INFO_CYCLES; 1476 - e->reserved = 0; 1477 - } 1130 + intel_pmu_store_lbr(cpuc, lbr); 1478 1131 intel_pmu_lbr_filter(cpuc); 1479 1132 } 1480 1133 ··· 1517 1198 [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, 1518 1199 }; 1519 1200 1201 + static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { 1202 + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = ARCH_LBR_ANY, 1203 + [PERF_SAMPLE_BRANCH_USER_SHIFT] = ARCH_LBR_USER, 1204 + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = ARCH_LBR_KERNEL, 1205 + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, 1206 + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = ARCH_LBR_RETURN | 1207 + ARCH_LBR_OTHER_BRANCH, 1208 + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = ARCH_LBR_REL_CALL | 1209 + ARCH_LBR_IND_CALL | 1210 + ARCH_LBR_OTHER_BRANCH, 1211 + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = ARCH_LBR_IND_CALL, 1212 + [PERF_SAMPLE_BRANCH_COND_SHIFT] = ARCH_LBR_JCC, 1213 + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = ARCH_LBR_REL_CALL | 1214 + ARCH_LBR_IND_CALL | 1215 + ARCH_LBR_RETURN | 1216 + ARCH_LBR_CALL_STACK, 1217 + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = ARCH_LBR_IND_JMP, 1218 + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = ARCH_LBR_REL_CALL, 1219 + }; 1220 + 1520 1221 /* core */ 1521 1222 void __init intel_pmu_lbr_init_core(void) 1522 1223 { ··· 1590 1251 */ 1591 1252 } 1592 1253 1254 + static inline struct kmem_cache * 1255 + create_lbr_kmem_cache(size_t size, size_t align) 1256 + { 1257 + return kmem_cache_create("x86_lbr", size, align, 0, NULL); 1258 + } 1259 + 1593 1260 /* haswell */ 1594 1261 void intel_pmu_lbr_init_hsw(void) 1595 1262 { 1263 + size_t size = sizeof(struct x86_perf_task_context); 1264 + 1596 1265 x86_pmu.lbr_nr = 16; 1597 1266 x86_pmu.lbr_tos = MSR_LBR_TOS; 1598 1267 x86_pmu.lbr_from = MSR_LBR_NHM_FROM; ··· 1609 1262 
x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 1610 1263 x86_pmu.lbr_sel_map = hsw_lbr_sel_map; 1611 1264 1265 + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); 1266 + 1612 1267 if (lbr_from_signext_quirk_needed()) 1613 1268 static_branch_enable(&lbr_from_quirk_key); 1614 1269 } ··· 1618 1269 /* skylake */ 1619 1270 __init void intel_pmu_lbr_init_skl(void) 1620 1271 { 1272 + size_t size = sizeof(struct x86_perf_task_context); 1273 + 1621 1274 x86_pmu.lbr_nr = 32; 1622 1275 x86_pmu.lbr_tos = MSR_LBR_TOS; 1623 1276 x86_pmu.lbr_from = MSR_LBR_NHM_FROM; 1624 1277 x86_pmu.lbr_to = MSR_LBR_NHM_TO; 1278 + x86_pmu.lbr_info = MSR_LBR_INFO_0; 1625 1279 1626 1280 x86_pmu.lbr_sel_mask = LBR_SEL_MASK; 1627 1281 x86_pmu.lbr_sel_map = hsw_lbr_sel_map; 1282 + 1283 + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); 1628 1284 1629 1285 /* 1630 1286 * SW branch filter usage: ··· 1697 1343 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP) 1698 1344 x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; 1699 1345 } 1346 + 1347 + /* 1348 + * LBR state size is variable based on the max number of registers. 1349 + * This calculates the expected state size, which should match 1350 + * what the hardware enumerates for the size of XFEATURE_LBR. 1351 + */ 1352 + static inline unsigned int get_lbr_state_size(void) 1353 + { 1354 + return sizeof(struct arch_lbr_state) + 1355 + x86_pmu.lbr_nr * sizeof(struct lbr_entry); 1356 + } 1357 + 1358 + static bool is_arch_lbr_xsave_available(void) 1359 + { 1360 + if (!boot_cpu_has(X86_FEATURE_XSAVES)) 1361 + return false; 1362 + 1363 + /* 1364 + * Check the LBR state with the corresponding software structure. 1365 + * Disable LBR XSAVES support if the size doesn't match. 
1366 + */ 1367 + if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size())) 1368 + return false; 1369 + 1370 + return true; 1371 + } 1372 + 1373 + void __init intel_pmu_arch_lbr_init(void) 1374 + { 1375 + struct pmu *pmu = x86_get_pmu(); 1376 + union cpuid28_eax eax; 1377 + union cpuid28_ebx ebx; 1378 + union cpuid28_ecx ecx; 1379 + unsigned int unused_edx; 1380 + bool arch_lbr_xsave; 1381 + size_t size; 1382 + u64 lbr_nr; 1383 + 1384 + /* Arch LBR Capabilities */ 1385 + cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx); 1386 + 1387 + lbr_nr = fls(eax.split.lbr_depth_mask) * 8; 1388 + if (!lbr_nr) 1389 + goto clear_arch_lbr; 1390 + 1391 + /* Apply the max depth of Arch LBR */ 1392 + if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) 1393 + goto clear_arch_lbr; 1394 + 1395 + x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask; 1396 + x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset; 1397 + x86_pmu.lbr_lip = eax.split.lbr_lip; 1398 + x86_pmu.lbr_cpl = ebx.split.lbr_cpl; 1399 + x86_pmu.lbr_filter = ebx.split.lbr_filter; 1400 + x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack; 1401 + x86_pmu.lbr_mispred = ecx.split.lbr_mispred; 1402 + x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; 1403 + x86_pmu.lbr_br_type = ecx.split.lbr_br_type; 1404 + x86_pmu.lbr_nr = lbr_nr; 1405 + 1406 + 1407 + arch_lbr_xsave = is_arch_lbr_xsave_available(); 1408 + if (arch_lbr_xsave) { 1409 + size = sizeof(struct x86_perf_task_context_arch_lbr_xsave) + 1410 + get_lbr_state_size(); 1411 + pmu->task_ctx_cache = create_lbr_kmem_cache(size, 1412 + XSAVE_ALIGNMENT); 1413 + } 1414 + 1415 + if (!pmu->task_ctx_cache) { 1416 + arch_lbr_xsave = false; 1417 + 1418 + size = sizeof(struct x86_perf_task_context_arch_lbr) + 1419 + lbr_nr * sizeof(struct lbr_entry); 1420 + pmu->task_ctx_cache = create_lbr_kmem_cache(size, 0); 1421 + } 1422 + 1423 + x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; 1424 + x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; 1425 + x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0; 1426 + 1427 + /* LBR 
callstack requires both CPL and Branch Filtering support */ 1428 + if (!x86_pmu.lbr_cpl || 1429 + !x86_pmu.lbr_filter || 1430 + !x86_pmu.lbr_call_stack) 1431 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP; 1432 + 1433 + if (!x86_pmu.lbr_cpl) { 1434 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP; 1435 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP; 1436 + } else if (!x86_pmu.lbr_filter) { 1437 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP; 1438 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP; 1439 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP; 1440 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP; 1441 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP; 1442 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP; 1443 + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP; 1444 + } 1445 + 1446 + x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK; 1447 + x86_pmu.lbr_ctl_map = arch_lbr_ctl_map; 1448 + 1449 + if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter) 1450 + x86_pmu.lbr_ctl_map = NULL; 1451 + 1452 + x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; 1453 + if (arch_lbr_xsave) { 1454 + x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves; 1455 + x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors; 1456 + x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave; 1457 + pr_cont("XSAVE "); 1458 + } else { 1459 + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; 1460 + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; 1461 + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; 1462 + } 1463 + 1464 + pr_cont("Architectural LBR, "); 1465 + 1466 + return; 1467 + 1468 + clear_arch_lbr: 1469 + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); 1470 + } 1471 + 1472 + /** 1473 + * x86_perf_get_lbr - get the LBR records information 1474 + * 1475 + * @lbr: the caller's memory to store the LBR records information 1476 + * 1477 + * Returns: 0 
indicates the LBR info has been successfully obtained 1478 + */ 1479 + int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) 1480 + { 1481 + int lbr_fmt = x86_pmu.intel_cap.lbr_format; 1482 + 1483 + lbr->nr = x86_pmu.lbr_nr; 1484 + lbr->from = x86_pmu.lbr_from; 1485 + lbr->to = x86_pmu.lbr_to; 1486 + lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; 1487 + 1488 + return 0; 1489 + } 1490 + EXPORT_SYMBOL_GPL(x86_perf_get_lbr); 1491 + 1492 + struct event_constraint vlbr_constraint = 1493 + __EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR), 1494 + FIXED_EVENT_FLAGS, 1, 0, PERF_X86_EVENT_LBR_SELECT);
+20 -6
arch/x86/events/intel/uncore.c
··· 16 16 DEFINE_RAW_SPINLOCK(pci2phy_map_lock); 17 17 struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); 18 18 struct pci_extra_dev *uncore_extra_pci_dev; 19 - static int max_dies; 19 + int __uncore_max_dies; 20 20 21 21 /* mask of cpus that collect uncore events */ 22 22 static cpumask_t uncore_cpu_mask; ··· 108 108 * The unsigned check also catches the '-1' return value for non 109 109 * existent mappings in the topology map. 110 110 */ 111 - return dieid < max_dies ? pmu->boxes[dieid] : NULL; 111 + return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL; 112 112 } 113 113 114 114 u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) ··· 130 130 struct perf_event *event) 131 131 { 132 132 if (!box->io_addr) 133 + return 0; 134 + 135 + if (!uncore_mmio_is_valid_offset(box, event->hw.event_base)) 133 136 return 0; 134 137 135 138 return readq(box->io_addr + event->hw.event_base); ··· 846 843 .read = uncore_pmu_event_read, 847 844 .module = THIS_MODULE, 848 845 .capabilities = PERF_PMU_CAP_NO_EXCLUDE, 846 + .attr_update = pmu->type->attr_update, 849 847 }; 850 848 } else { 851 849 pmu->pmu = *pmu->type->pmu; 852 850 pmu->pmu.attr_groups = pmu->type->attr_groups; 851 + pmu->pmu.attr_update = pmu->type->attr_update; 853 852 } 854 853 855 854 if (pmu->type->num_boxes == 1) { ··· 882 877 { 883 878 int die; 884 879 885 - for (die = 0; die < max_dies; die++) 880 + for (die = 0; die < uncore_max_dies(); die++) 886 881 kfree(pmu->boxes[die]); 887 882 kfree(pmu->boxes); 888 883 } ··· 891 886 { 892 887 struct intel_uncore_pmu *pmu = type->pmus; 893 888 int i; 889 + 890 + if (type->cleanup_mapping) 891 + type->cleanup_mapping(type); 894 892 895 893 if (pmu) { 896 894 for (i = 0; i < type->num_boxes; i++, pmu++) { ··· 923 915 if (!pmus) 924 916 return -ENOMEM; 925 917 926 - size = max_dies * sizeof(struct intel_uncore_box *); 918 + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); 927 919 928 920 for (i = 0; i < 
type->num_boxes; i++) { 929 921 pmus[i].func_id = setid ? i : -1; ··· 961 953 } 962 954 963 955 type->pmu_group = &uncore_pmu_attr_group; 956 + 957 + if (type->set_mapping) 958 + type->set_mapping(type); 964 959 965 960 return 0; 966 961 ··· 1123 1112 size_t size; 1124 1113 int ret; 1125 1114 1126 - size = max_dies * sizeof(struct pci_extra_dev); 1115 + size = uncore_max_dies() * sizeof(struct pci_extra_dev); 1127 1116 uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); 1128 1117 if (!uncore_extra_pci_dev) { 1129 1118 ret = -ENOMEM; ··· 1525 1514 X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init), 1526 1515 X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init), 1527 1516 X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init), 1517 + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init), 1518 + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init), 1528 1519 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init), 1529 1520 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init), 1530 1521 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init), ··· 1552 1539 if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) 1553 1540 return -ENODEV; 1554 1541 1555 - max_dies = topology_max_packages() * topology_max_die_per_package(); 1542 + __uncore_max_dies = 1543 + topology_max_packages() * topology_max_die_per_package(); 1556 1544 1557 1545 uncore_init = (struct intel_uncore_init_fun *)id->driver_data; 1558 1546 if (uncore_init->pci_init) {
+37
arch/x86/events/intel/uncore.h
··· 61 61 unsigned msr_offset; 62 62 unsigned mmio_offset; 63 63 }; 64 + unsigned mmio_map_size; 64 65 unsigned num_shared_regs:8; 65 66 unsigned single_fixed:1; 66 67 unsigned pair_ctr_ctl:1; ··· 73 72 struct uncore_event_desc *event_descs; 74 73 struct freerunning_counters *freerunning; 75 74 const struct attribute_group *attr_groups[4]; 75 + const struct attribute_group **attr_update; 76 76 struct pmu *pmu; /* for custom pmu ops */ 77 + /* 78 + * Uncore PMU would store relevant platform topology configuration here 79 + * to identify which platform component each PMON block of that type is 80 + * supposed to monitor. 81 + */ 82 + u64 *topology; 83 + /* 84 + * Optional callbacks for managing mapping of Uncore units to PMONs 85 + */ 86 + int (*set_mapping)(struct intel_uncore_type *type); 87 + void (*cleanup_mapping)(struct intel_uncore_type *type); 77 88 }; 78 89 79 90 #define pmu_group attr_groups[0] ··· 182 169 ssize_t uncore_event_show(struct kobject *kobj, 183 170 struct kobj_attribute *attr, char *buf); 184 171 172 + static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) 173 + { 174 + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); 175 + } 176 + 177 + #define to_device_attribute(n) container_of(n, struct device_attribute, attr) 178 + #define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr) 179 + #define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n)) 180 + 181 + extern int __uncore_max_dies; 182 + #define uncore_max_dies() (__uncore_max_dies) 183 + 185 184 #define INTEL_UNCORE_EVENT_DESC(_name, _config) \ 186 185 { \ 187 186 .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ ··· 219 194 static inline bool uncore_pmc_freerunning(int idx) 220 195 { 221 196 return idx == UNCORE_PMC_IDX_FREERUNNING; 197 + } 198 + 199 + static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box, 200 + unsigned long offset) 201 + { 202 + if (offset < 
box->pmu->type->mmio_map_size) 203 + return true; 204 + 205 + pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n", 206 + offset, box->pmu->type->name); 207 + 208 + return false; 222 209 } 223 210 224 211 static inline
+78 -2
arch/x86/events/intel/uncore_snb.c
··· 42 42 #define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0 43 43 #define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34 44 44 #define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35 45 + #define PCI_DEVICE_ID_INTEL_CML_H1_IMC 0x9b44 46 + #define PCI_DEVICE_ID_INTEL_CML_H2_IMC 0x9b54 47 + #define PCI_DEVICE_ID_INTEL_CML_H3_IMC 0x9b64 48 + #define PCI_DEVICE_ID_INTEL_CML_U1_IMC 0x9b51 49 + #define PCI_DEVICE_ID_INTEL_CML_U2_IMC 0x9b61 50 + #define PCI_DEVICE_ID_INTEL_CML_U3_IMC 0x9b71 51 + #define PCI_DEVICE_ID_INTEL_CML_S1_IMC 0x9b33 52 + #define PCI_DEVICE_ID_INTEL_CML_S2_IMC 0x9b43 53 + #define PCI_DEVICE_ID_INTEL_CML_S3_IMC 0x9b53 54 + #define PCI_DEVICE_ID_INTEL_CML_S4_IMC 0x9b63 55 + #define PCI_DEVICE_ID_INTEL_CML_S5_IMC 0x9b73 45 56 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 46 57 #define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 47 58 #define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02 ··· 426 415 427 416 static void snb_uncore_imc_init_box(struct intel_uncore_box *box) 428 417 { 418 + struct intel_uncore_type *type = box->pmu->type; 429 419 struct pci_dev *pdev = box->pci_dev; 430 420 int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; 431 421 resource_size_t addr; ··· 442 430 443 431 addr &= ~(PAGE_SIZE - 1); 444 432 445 - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); 433 + box->io_addr = ioremap(addr, type->mmio_map_size); 434 + if (!box->io_addr) 435 + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); 436 + 446 437 box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; 447 438 } 448 439 ··· 601 586 .num_counters = 2, 602 587 .num_boxes = 1, 603 588 .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, 589 + .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE, 604 590 .freerunning = snb_uncore_imc_freerunning, 605 591 .event_descs = snb_uncore_imc_events, 606 592 .format_group = &snb_uncore_imc_format_group, ··· 787 771 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), 788 772 .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 789 773 
}, 774 + { /* IMC */ 775 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC), 776 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 777 + }, 778 + { /* IMC */ 779 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC), 780 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 781 + }, 782 + { /* IMC */ 783 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC), 784 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 785 + }, 786 + { /* IMC */ 787 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC), 788 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 789 + }, 790 + { /* IMC */ 791 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC), 792 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 793 + }, 794 + { /* IMC */ 795 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC), 796 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 797 + }, 798 + { /* IMC */ 799 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC), 800 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 801 + }, 802 + { /* IMC */ 803 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC), 804 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 805 + }, 806 + { /* IMC */ 807 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC), 808 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 809 + }, 810 + { /* IMC */ 811 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC), 812 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 813 + }, 814 + { /* IMC */ 815 + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC), 816 + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), 817 + }, 790 818 { /* end: all zeroes */ }, 791 819 }; 792 820 ··· 923 863 IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ 924 864 IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), 
/* 8th Gen Core U Mobile Quad Core */ 925 865 IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */ 866 + IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver), 867 + IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver), 868 + IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver), 869 + IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver), 870 + IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver), 871 + IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver), 872 + IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver), 873 + IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver), 874 + IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver), 875 + IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver), 876 + IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver), 926 877 IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ 927 878 IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ 928 879 { /* end marker */ } ··· 1156 1085 } 1157 1086 1158 1087 #define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 1088 + #define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000 1159 1089 1160 1090 static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) 1161 1091 { 1162 1092 struct pci_dev *pdev = tgl_uncore_get_mc_dev(); 1163 1093 struct intel_uncore_pmu *pmu = box->pmu; 1094 + struct intel_uncore_type *type = pmu->type; 1164 1095 resource_size_t addr; 1165 1096 u32 mch_bar; 1166 1097 ··· 1185 1112 addr |= ((resource_size_t)mch_bar << 32); 1186 1113 #endif 1187 1114 1188 - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); 1115 + box->io_addr = ioremap(addr, type->mmio_map_size); 1116 + if (!box->io_addr) 1117 + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); 1189 1118 } 1190 1119 1191 1120 static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { ··· 1213 1138 .num_counters = 3, 1214 1139 .num_boxes = 2, 1215 1140 .num_freerunning_types = TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, 1141 + .mmio_map_size = TGL_UNCORE_PCI_IMC_MAP_SIZE, 1216 1142 .freerunning = tgl_uncore_imc_freerunning, 1217 
1143 .ops = &tgl_uncore_imc_freerunning_ops, 1218 1144 .event_descs = tgl_uncore_imc_events,
+206 -2
arch/x86/events/intel/uncore_snbep.c
··· 273 273 #define SKX_CPUNODEID 0xc0 274 274 #define SKX_GIDNIDMAP 0xd4 275 275 276 + /* 277 + * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR 278 + * that BIOS programmed. MSR has package scope. 279 + * | Bit | Default | Description 280 + * | [63] | 00h | VALID - When set, indicates the CPU bus 281 + * numbers have been initialized. (RO) 282 + * |[62:48]| --- | Reserved 283 + * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned 284 + * CPUBUSNO(5). (RO) 285 + * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned 286 + * CPUBUSNO(4). (RO) 287 + * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned 288 + * CPUBUSNO(3). (RO) 289 + * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned 290 + * CPUBUSNO(2). (RO) 291 + * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned 292 + * CPUBUSNO(1). (RO) 293 + * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned 294 + * CPUBUSNO(0). (RO) 295 + */ 296 + #define SKX_MSR_CPU_BUS_NUMBER 0x300 297 + #define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63) 298 + #define BUS_NUM_STRIDE 8 299 + 276 300 /* SKX CHA */ 277 301 #define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) 278 302 #define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) ··· 3636 3612 .read_counter = uncore_msr_read_counter, 3637 3613 }; 3638 3614 3615 + static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) 3616 + { 3617 + return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); 3618 + } 3619 + 3620 + static umode_t 3621 + skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) 3622 + { 3623 + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); 3624 + 3625 + /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */ 3626 + return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 
0 : attr->mode; 3627 + } 3628 + 3629 + static ssize_t skx_iio_mapping_show(struct device *dev, 3630 + struct device_attribute *attr, char *buf) 3631 + { 3632 + struct pci_bus *bus = pci_find_next_bus(NULL); 3633 + struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); 3634 + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); 3635 + long die = (long)ea->var; 3636 + 3637 + /* 3638 + * Current implementation is for single segment configuration hence it's 3639 + * safe to take the segment value from the first available root bus. 3640 + */ 3641 + return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus), 3642 + skx_iio_stack(uncore_pmu, die)); 3643 + } 3644 + 3645 + static int skx_msr_cpu_bus_read(int cpu, u64 *topology) 3646 + { 3647 + u64 msr_value; 3648 + 3649 + if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || 3650 + !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT)) 3651 + return -ENXIO; 3652 + 3653 + *topology = msr_value; 3654 + 3655 + return 0; 3656 + } 3657 + 3658 + static int die_to_cpu(int die) 3659 + { 3660 + int res = 0, cpu, current_die; 3661 + /* 3662 + * Using cpus_read_lock() to ensure cpu is not going down between 3663 + * looking at cpu_online_mask. 3664 + */ 3665 + cpus_read_lock(); 3666 + for_each_online_cpu(cpu) { 3667 + current_die = topology_logical_die_id(cpu); 3668 + if (current_die == die) { 3669 + res = cpu; 3670 + break; 3671 + } 3672 + } 3673 + cpus_read_unlock(); 3674 + return res; 3675 + } 3676 + 3677 + static int skx_iio_get_topology(struct intel_uncore_type *type) 3678 + { 3679 + int i, ret; 3680 + struct pci_bus *bus = NULL; 3681 + 3682 + /* 3683 + * Verified single-segment environments only; disabled for multiple 3684 + * segment topologies for now except VMD domains. 3685 + * VMD domains start at 0x10000 to not clash with ACPI _SEG domains. 
3686 + */ 3687 + while ((bus = pci_find_next_bus(bus)) 3688 + && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff)) 3689 + ; 3690 + if (bus) 3691 + return -EPERM; 3692 + 3693 + type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL); 3694 + if (!type->topology) 3695 + return -ENOMEM; 3696 + 3697 + for (i = 0; i < uncore_max_dies(); i++) { 3698 + ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]); 3699 + if (ret) { 3700 + kfree(type->topology); 3701 + type->topology = NULL; 3702 + return ret; 3703 + } 3704 + } 3705 + 3706 + return 0; 3707 + } 3708 + 3709 + static struct attribute_group skx_iio_mapping_group = { 3710 + .is_visible = skx_iio_mapping_visible, 3711 + }; 3712 + 3713 + static const struct attribute_group *skx_iio_attr_update[] = { 3714 + &skx_iio_mapping_group, 3715 + NULL, 3716 + }; 3717 + 3718 + static int skx_iio_set_mapping(struct intel_uncore_type *type) 3719 + { 3720 + char buf[64]; 3721 + int ret; 3722 + long die = -1; 3723 + struct attribute **attrs = NULL; 3724 + struct dev_ext_attribute *eas = NULL; 3725 + 3726 + ret = skx_iio_get_topology(type); 3727 + if (ret) 3728 + return ret; 3729 + 3730 + /* One more for NULL. 
*/ 3731 + attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); 3732 + if (!attrs) 3733 + goto err; 3734 + 3735 + eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL); 3736 + if (!eas) 3737 + goto err; 3738 + 3739 + for (die = 0; die < uncore_max_dies(); die++) { 3740 + sprintf(buf, "die%ld", die); 3741 + sysfs_attr_init(&eas[die].attr.attr); 3742 + eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); 3743 + if (!eas[die].attr.attr.name) 3744 + goto err; 3745 + eas[die].attr.attr.mode = 0444; 3746 + eas[die].attr.show = skx_iio_mapping_show; 3747 + eas[die].attr.store = NULL; 3748 + eas[die].var = (void *)die; 3749 + attrs[die] = &eas[die].attr.attr; 3750 + } 3751 + skx_iio_mapping_group.attrs = attrs; 3752 + 3753 + return 0; 3754 + err: 3755 + for (; die >= 0; die--) 3756 + kfree(eas[die].attr.attr.name); 3757 + kfree(eas); 3758 + kfree(attrs); 3759 + kfree(type->topology); 3760 + type->attr_update = NULL; 3761 + return -ENOMEM; 3762 + } 3763 + 3764 + static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) 3765 + { 3766 + struct attribute **attr = skx_iio_mapping_group.attrs; 3767 + 3768 + if (!attr) 3769 + return; 3770 + 3771 + for (; *attr; attr++) 3772 + kfree((*attr)->name); 3773 + kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs)); 3774 + kfree(skx_iio_mapping_group.attrs); 3775 + skx_iio_mapping_group.attrs = NULL; 3776 + kfree(type->topology); 3777 + } 3778 + 3639 3779 static struct intel_uncore_type skx_uncore_iio = { 3640 3780 .name = "iio", 3641 3781 .num_counters = 4, ··· 3814 3626 .constraints = skx_uncore_iio_constraints, 3815 3627 .ops = &skx_uncore_iio_ops, 3816 3628 .format_group = &skx_uncore_iio_format_group, 3629 + .attr_update = skx_iio_attr_update, 3630 + .set_mapping = skx_iio_set_mapping, 3631 + .cleanup_mapping = skx_iio_cleanup_mapping, 3817 3632 }; 3818 3633 3819 3634 enum perf_uncore_iio_freerunning_type_id { ··· 4612 4421 unsigned int box_ctl, int mem_offset) 4613 4422 { 4614 4423 struct pci_dev *pdev 
= snr_uncore_get_mc_dev(box->dieid); 4424 + struct intel_uncore_type *type = box->pmu->type; 4615 4425 resource_size_t addr; 4616 4426 u32 pci_dword; 4617 4427 ··· 4627 4435 4628 4436 addr += box_ctl; 4629 4437 4630 - box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); 4631 - if (!box->io_addr) 4438 + box->io_addr = ioremap(addr, type->mmio_map_size); 4439 + if (!box->io_addr) { 4440 + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); 4632 4441 return; 4442 + } 4633 4443 4634 4444 writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); 4635 4445 } ··· 4674 4480 if (!box->io_addr) 4675 4481 return; 4676 4482 4483 + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) 4484 + return; 4485 + 4677 4486 writel(hwc->config | SNBEP_PMON_CTL_EN, 4678 4487 box->io_addr + hwc->config_base); 4679 4488 } ··· 4687 4490 struct hw_perf_event *hwc = &event->hw; 4688 4491 4689 4492 if (!box->io_addr) 4493 + return; 4494 + 4495 + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) 4690 4496 return; 4691 4497 4692 4498 writel(hwc->config, box->io_addr + hwc->config_base); ··· 4730 4530 .event_mask = SNBEP_PMON_RAW_EVENT_MASK, 4731 4531 .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, 4732 4532 .mmio_offset = SNR_IMC_MMIO_OFFSET, 4533 + .mmio_map_size = SNR_IMC_MMIO_SIZE, 4733 4534 .ops = &snr_uncore_mmio_ops, 4734 4535 .format_group = &skx_uncore_format_group, 4735 4536 }; ··· 4771 4570 .num_counters = 3, 4772 4571 .num_boxes = 1, 4773 4572 .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, 4573 + .mmio_map_size = SNR_IMC_MMIO_SIZE, 4774 4574 .freerunning = snr_imc_freerunning, 4775 4575 .ops = &snr_uncore_imc_freerunning_ops, 4776 4576 .event_descs = snr_uncore_imc_freerunning_events, ··· 5189 4987 .event_mask = SNBEP_PMON_RAW_EVENT_MASK, 5190 4988 .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, 5191 4989 .mmio_offset = SNR_IMC_MMIO_OFFSET, 4990 + .mmio_map_size = SNR_IMC_MMIO_SIZE, 5192 4991 .ops = &icx_uncore_mmio_ops, 5193 4992 .format_group = &skx_uncore_format_group, 5194 4993 }; ··· 
5247 5044 .num_counters = 5, 5248 5045 .num_boxes = 4, 5249 5046 .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX, 5047 + .mmio_map_size = SNR_IMC_MMIO_SIZE, 5250 5048 .freerunning = icx_imc_freerunning, 5251 5049 .ops = &icx_uncore_imc_freerunning_ops, 5252 5050 .event_descs = icx_uncore_imc_freerunning_events,
+112 -13
arch/x86/events/perf_event.h
··· 78 78 #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ 79 79 #define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ 80 80 #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ 81 + #define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ 81 82 82 83 struct amd_nb { 83 84 int nb_id; /* NorthBridge id */ ··· 180 179 #define MAX_LBR_ENTRIES 32 181 180 182 181 enum { 182 + LBR_FORMAT_32 = 0x00, 183 + LBR_FORMAT_LIP = 0x01, 184 + LBR_FORMAT_EIP = 0x02, 185 + LBR_FORMAT_EIP_FLAGS = 0x03, 186 + LBR_FORMAT_EIP_FLAGS2 = 0x04, 187 + LBR_FORMAT_INFO = 0x05, 188 + LBR_FORMAT_TIME = 0x06, 189 + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, 190 + }; 191 + 192 + enum { 183 193 X86_PERF_KFREE_SHARED = 0, 184 194 X86_PERF_KFREE_EXCL = 1, 185 195 X86_PERF_KFREE_MAX ··· 245 233 int lbr_pebs_users; 246 234 struct perf_branch_stack lbr_stack; 247 235 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 248 - struct er_account *lbr_sel; 236 + union { 237 + struct er_account *lbr_sel; 238 + struct er_account *lbr_ctl; 239 + }; 249 240 u64 br_sel; 250 - struct x86_perf_task_context *last_task_ctx; 241 + void *last_task_ctx; 251 242 int last_log_id; 243 + int lbr_select; 244 + void *lbr_xsave; 252 245 253 246 /* 254 247 * Intel host/guest exclude bits ··· 690 673 /* 691 674 * Intel LBR 692 675 */ 693 - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 694 - int lbr_nr; /* hardware stack size */ 695 - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ 696 - const int *lbr_sel_map; /* lbr_select mappings */ 676 + unsigned int lbr_tos, lbr_from, lbr_to, 677 + lbr_info, lbr_nr; /* LBR base regs and size */ 678 + union { 679 + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ 680 + u64 lbr_ctl_mask; /* LBR_CTL valid bits */ 681 + }; 682 + union { 683 + const int *lbr_sel_map; /* lbr_select mappings */ 684 + int *lbr_ctl_map; /* LBR_CTL mappings */ 685 + }; 697 686 bool lbr_double_abort; /* duplicated lbr aborts */ 698 687 bool 
lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ 688 + 689 + /* 690 + * Intel Architectural LBR CPUID Enumeration 691 + */ 692 + unsigned int lbr_depth_mask:8; 693 + unsigned int lbr_deep_c_reset:1; 694 + unsigned int lbr_lip:1; 695 + unsigned int lbr_cpl:1; 696 + unsigned int lbr_filter:1; 697 + unsigned int lbr_call_stack:1; 698 + unsigned int lbr_mispred:1; 699 + unsigned int lbr_timed_lbr:1; 700 + unsigned int lbr_br_type:1; 701 + 702 + void (*lbr_reset)(void); 703 + void (*lbr_read)(struct cpu_hw_events *cpuc); 704 + void (*lbr_save)(void *ctx); 705 + void (*lbr_restore)(void *ctx); 699 706 700 707 /* 701 708 * Intel PT/LBR/BTS are exclusive ··· 759 718 int (*aux_output_match) (struct perf_event *event); 760 719 }; 761 720 762 - struct x86_perf_task_context { 763 - u64 lbr_from[MAX_LBR_ENTRIES]; 764 - u64 lbr_to[MAX_LBR_ENTRIES]; 765 - u64 lbr_info[MAX_LBR_ENTRIES]; 766 - int tos; 767 - int valid_lbrs; 721 + struct x86_perf_task_context_opt { 768 722 int lbr_callstack_users; 769 723 int lbr_stack_state; 770 724 int log_id; 725 + }; 726 + 727 + struct x86_perf_task_context { 728 + u64 lbr_sel; 729 + int tos; 730 + int valid_lbrs; 731 + struct x86_perf_task_context_opt opt; 732 + struct lbr_entry lbr[MAX_LBR_ENTRIES]; 733 + }; 734 + 735 + struct x86_perf_task_context_arch_lbr { 736 + struct x86_perf_task_context_opt opt; 737 + struct lbr_entry entries[]; 738 + }; 739 + 740 + /* 741 + * Add padding to guarantee the 64-byte alignment of the state buffer. 742 + * 743 + * The structure is dynamically allocated. The size of the LBR state may vary 744 + * based on the number of LBR registers. 745 + * 746 + * Do not put anything after the LBR state. 
747 + */ 748 + struct x86_perf_task_context_arch_lbr_xsave { 749 + struct x86_perf_task_context_opt opt; 750 + 751 + union { 752 + struct xregs_state xsave; 753 + struct { 754 + struct fxregs_state i387; 755 + struct xstate_header header; 756 + struct arch_lbr_state lbr; 757 + } __attribute__ ((packed, aligned (XSAVE_ALIGNMENT))); 758 + }; 771 759 }; 772 760 773 761 #define x86_add_quirk(func_) \ ··· 846 776 847 777 struct pmu *x86_get_pmu(void); 848 778 extern struct x86_pmu x86_pmu __read_mostly; 779 + 780 + static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) 781 + { 782 + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) 783 + return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt; 784 + 785 + return &((struct x86_perf_task_context *)ctx)->opt; 786 + } 849 787 850 788 static inline bool x86_pmu_has_lbr_callstack(void) 851 789 { ··· 1067 989 1068 990 void reserve_ds_buffers(void); 1069 991 992 + void release_lbr_buffers(void); 993 + 1070 994 extern struct event_constraint bts_constraint; 995 + extern struct event_constraint vlbr_constraint; 1071 996 1072 997 void intel_pmu_enable_bts(u64 config); 1073 998 ··· 1122 1041 1123 1042 void intel_pmu_auto_reload_read(struct perf_event *event); 1124 1043 1125 - void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); 1044 + void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); 1126 1045 1127 1046 void intel_ds_init(void); 1128 1047 ··· 1135 1054 1136 1055 void intel_pmu_lbr_reset(void); 1137 1056 1057 + void intel_pmu_lbr_reset_32(void); 1058 + 1059 + void intel_pmu_lbr_reset_64(void); 1060 + 1138 1061 void intel_pmu_lbr_add(struct perf_event *event); 1139 1062 1140 1063 void intel_pmu_lbr_del(struct perf_event *event); ··· 1148 1063 void intel_pmu_lbr_disable_all(void); 1149 1064 1150 1065 void intel_pmu_lbr_read(void); 1066 + 1067 + void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); 1068 + 1069 + void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); 1070 + 1071 + void 
intel_pmu_lbr_save(void *ctx); 1072 + 1073 + void intel_pmu_lbr_restore(void *ctx); 1151 1074 1152 1075 void intel_pmu_lbr_init_core(void); 1153 1076 ··· 1172 1079 void intel_pmu_lbr_init_skl(void); 1173 1080 1174 1081 void intel_pmu_lbr_init_knl(void); 1082 + 1083 + void intel_pmu_arch_lbr_init(void); 1175 1084 1176 1085 void intel_pmu_pebs_data_source_nhm(void); 1177 1086 ··· 1207 1112 } 1208 1113 1209 1114 static inline void release_ds_buffers(void) 1115 + { 1116 + } 1117 + 1118 + static inline void release_lbr_buffers(void) 1210 1119 { 1211 1120 } 1212 1121
+2 -1
arch/x86/events/rapl.c
··· 787 787 X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), 788 788 X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), 789 789 X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), 790 - X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), 790 + X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), 791 + X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h), 791 792 {}, 792 793 }; 793 794 MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
+1 -1
arch/x86/events/zhaoxin/core.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 - * Zhoaxin PMU; like Intel Architectural PerfMon-v2 3 + * Zhaoxin PMU; like Intel Architectural PerfMon-v2 4 4 */ 5 5 6 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+1
arch/x86/include/asm/cpufeatures.h
··· 366 366 #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ 367 367 #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ 368 368 #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ 369 + #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ 369 370 #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ 370 371 #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ 371 372 #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
+7 -40
arch/x86/include/asm/fpu/internal.h
··· 274 274 */ 275 275 static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) 276 276 { 277 - u64 mask = -1; 277 + u64 mask = xfeatures_mask_all; 278 278 u32 lmask = mask; 279 279 u32 hmask = mask >> 32; 280 280 int err; ··· 320 320 */ 321 321 static inline void copy_xregs_to_kernel(struct xregs_state *xstate) 322 322 { 323 - u64 mask = -1; 323 + u64 mask = xfeatures_mask_all; 324 324 u32 lmask = mask; 325 325 u32 hmask = mask >> 32; 326 326 int err; ··· 356 356 */ 357 357 static inline int copy_xregs_to_user(struct xregs_state __user *buf) 358 358 { 359 + u64 mask = xfeatures_mask_user(); 360 + u32 lmask = mask; 361 + u32 hmask = mask >> 32; 359 362 int err; 360 363 361 364 /* ··· 370 367 return -EFAULT; 371 368 372 369 stac(); 373 - XSTATE_OP(XSAVE, buf, -1, -1, err); 370 + XSTATE_OP(XSAVE, buf, lmask, hmask, err); 374 371 clac(); 375 372 376 373 return err; ··· 411 408 return err; 412 409 } 413 410 414 - /* 415 - * These must be called with preempt disabled. Returns 416 - * 'true' if the FPU state is still intact and we can 417 - * keep registers active. 418 - * 419 - * The legacy FNSAVE instruction cleared all FPU state 420 - * unconditionally, so registers are essentially destroyed. 421 - * Modern FPU state can be kept in registers, if there are 422 - * no pending FP exceptions. 423 - */ 424 - static inline int copy_fpregs_to_fpstate(struct fpu *fpu) 425 - { 426 - if (likely(use_xsave())) { 427 - copy_xregs_to_kernel(&fpu->state.xsave); 428 - 429 - /* 430 - * AVX512 state is tracked here because its use is 431 - * known to slow the max clock speed of the core. 
432 - */ 433 - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) 434 - fpu->avx512_timestamp = jiffies; 435 - return 1; 436 - } 437 - 438 - if (likely(use_fxsr())) { 439 - copy_fxregs_to_kernel(fpu); 440 - return 1; 441 - } 442 - 443 - /* 444 - * Legacy FPU register saving, FNSAVE always clears FPU registers, 445 - * so we have to mark them inactive: 446 - */ 447 - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); 448 - 449 - return 0; 450 - } 411 + extern int copy_fpregs_to_fpstate(struct fpu *fpu); 451 412 452 413 static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) 453 414 {
+27
arch/x86/include/asm/fpu/types.h
··· 114 114 XFEATURE_Hi16_ZMM, 115 115 XFEATURE_PT_UNIMPLEMENTED_SO_FAR, 116 116 XFEATURE_PKRU, 117 + XFEATURE_RSRVD_COMP_10, 118 + XFEATURE_RSRVD_COMP_11, 119 + XFEATURE_RSRVD_COMP_12, 120 + XFEATURE_RSRVD_COMP_13, 121 + XFEATURE_RSRVD_COMP_14, 122 + XFEATURE_LBR, 117 123 118 124 XFEATURE_MAX, 119 125 }; ··· 134 128 #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) 135 129 #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) 136 130 #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) 131 + #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) 137 132 138 133 #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) 139 134 #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ ··· 234 227 struct pkru_state { 235 228 u32 pkru; 236 229 u32 pad; 230 + } __packed; 231 + 232 + /* 233 + * State component 15: Architectural LBR configuration state. 234 + * The size of Arch LBR state depends on the number of LBRs (lbr_depth). 235 + */ 236 + 237 + struct lbr_entry { 238 + u64 from; 239 + u64 to; 240 + u64 info; 241 + }; 242 + 243 + struct arch_lbr_state { 244 + u64 lbr_ctl; 245 + u64 lbr_depth; 246 + u64 ler_from; 247 + u64 ler_to; 248 + u64 ler_info; 249 + struct lbr_entry entries[]; 237 250 } __packed; 238 251 239 252 struct xstate_header {
+36
arch/x86/include/asm/fpu/xstate.h
··· 21 21 #define XSAVE_YMM_SIZE 256 22 22 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) 23 23 24 + #define XSAVE_ALIGNMENT 64 25 + 24 26 /* All currently supported user features */ 25 27 #define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ 26 28 XFEATURE_MASK_SSE | \ ··· 38 36 #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) 39 37 40 38 /* 39 + * A supervisor state component may not always contain valuable information, 40 + * and its size may be huge. Saving/restoring such supervisor state components 41 + * at each context switch can cause high CPU and space overhead, which should 42 + * be avoided. Such supervisor state components should only be saved/restored 43 + * on demand. The on-demand dynamic supervisor features are set in this mask. 44 + * 45 + * Unlike the existing supported supervisor features, a dynamic supervisor 46 + * feature does not allocate a buffer in task->fpu, and the corresponding 47 + * supervisor state component cannot be saved/restored at each context switch. 48 + * 49 + * To support a dynamic supervisor feature, a developer should follow the 50 + * dos and don'ts as below: 51 + * - Do dynamically allocate a buffer for the supervisor state component. 52 + * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the 53 + * state component to/from the buffer. 54 + * - Don't set the bit corresponding to the dynamic supervisor feature in 55 + * IA32_XSS at run time, since it has been set at boot time. 56 + */ 57 + #define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR) 58 + 59 + /* 41 60 * Unsupported supervisor features. When a supervisor feature in this mask is 42 61 * supported in the future, move it to the supported supervisor feature mask. 43 62 */ ··· 66 43 67 44 /* All supervisor states including supported and unsupported states. 
*/ 68 45 #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ 46 + XFEATURE_MASK_DYNAMIC | \ 69 47 XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) 70 48 71 49 #ifdef CONFIG_X86_64 ··· 87 63 return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; 88 64 } 89 65 66 + static inline u64 xfeatures_mask_dynamic(void) 67 + { 68 + if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) 69 + return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR; 70 + 71 + return XFEATURE_MASK_DYNAMIC; 72 + } 73 + 90 74 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; 91 75 92 76 extern void __init update_regset_xstate_info(unsigned int size, ··· 103 71 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); 104 72 const void *get_xsave_field_ptr(int xfeature_nr); 105 73 int using_compacted_format(void); 74 + int xfeature_size(int xfeature_nr); 106 75 int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); 107 76 int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); 108 77 int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); 109 78 int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); 110 79 void copy_supervisor_to_kernel(struct xregs_state *xsave); 80 + void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); 81 + void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); 82 + 111 83 112 84 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ 113 85 int validate_user_xstate_header(const struct xstate_header *hdr);
+2
arch/x86/include/asm/kprobes.h
··· 66 66 */ 67 67 bool boostable; 68 68 bool if_modifier; 69 + /* Number of bytes of text poked */ 70 + int tp_len; 69 71 }; 70 72 71 73 struct arch_optimized_insn {
+16
arch/x86/include/asm/msr-index.h
··· 158 158 #define LBR_INFO_MISPRED BIT_ULL(63) 159 159 #define LBR_INFO_IN_TX BIT_ULL(62) 160 160 #define LBR_INFO_ABORT BIT_ULL(61) 161 + #define LBR_INFO_CYC_CNT_VALID BIT_ULL(60) 161 162 #define LBR_INFO_CYCLES 0xffff 163 + #define LBR_INFO_BR_TYPE_OFFSET 56 164 + #define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) 165 + 166 + #define MSR_ARCH_LBR_CTL 0x000014ce 167 + #define ARCH_LBR_CTL_LBREN BIT(0) 168 + #define ARCH_LBR_CTL_CPL_OFFSET 1 169 + #define ARCH_LBR_CTL_CPL (0x3ull << ARCH_LBR_CTL_CPL_OFFSET) 170 + #define ARCH_LBR_CTL_STACK_OFFSET 3 171 + #define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET) 172 + #define ARCH_LBR_CTL_FILTER_OFFSET 16 173 + #define ARCH_LBR_CTL_FILTER (0x7full << ARCH_LBR_CTL_FILTER_OFFSET) 174 + #define MSR_ARCH_LBR_DEPTH 0x000014cf 175 + #define MSR_ARCH_LBR_FROM_0 0x00001500 176 + #define MSR_ARCH_LBR_TO_0 0x00001600 177 + #define MSR_ARCH_LBR_INFO_0 0x00001200 162 178 163 179 #define MSR_IA32_PEBS_ENABLE 0x000003f1 164 180 #define MSR_PEBS_DATA_CFG 0x000003f2
+73 -9
arch/x86/include/asm/perf_event.h
··· 142 142 unsigned int full; 143 143 }; 144 144 145 + /* 146 + * Intel Architectural LBR CPUID detection/enumeration details: 147 + */ 148 + union cpuid28_eax { 149 + struct { 150 + /* Supported LBR depth values */ 151 + unsigned int lbr_depth_mask:8; 152 + unsigned int reserved:22; 153 + /* Deep C-state Reset */ 154 + unsigned int lbr_deep_c_reset:1; 155 + /* IP values contain LIP */ 156 + unsigned int lbr_lip:1; 157 + } split; 158 + unsigned int full; 159 + }; 160 + 161 + union cpuid28_ebx { 162 + struct { 163 + /* CPL Filtering Supported */ 164 + unsigned int lbr_cpl:1; 165 + /* Branch Filtering Supported */ 166 + unsigned int lbr_filter:1; 167 + /* Call-stack Mode Supported */ 168 + unsigned int lbr_call_stack:1; 169 + } split; 170 + unsigned int full; 171 + }; 172 + 173 + union cpuid28_ecx { 174 + struct { 175 + /* Mispredict Bit Supported */ 176 + unsigned int lbr_mispred:1; 177 + /* Timed LBRs Supported */ 178 + unsigned int lbr_timed_lbr:1; 179 + /* Branch Type Field Supported */ 180 + unsigned int lbr_br_type:1; 181 + } split; 182 + unsigned int full; 183 + }; 184 + 145 185 struct x86_pmu_capability { 146 186 int version; 147 187 int num_counters_gp; ··· 232 192 #define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) 233 193 #define GLOBAL_STATUS_ASIF BIT_ULL(60) 234 194 #define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) 235 - #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) 195 + #define GLOBAL_STATUS_LBRS_FROZEN_BIT 58 196 + #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) 236 197 #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) 198 + 199 + /* 200 + * We model guest LBR event tracing as another fixed-mode PMC like BTS. 201 + * 202 + * We choose bit 58 because it's used to indicate LBR stack frozen state 203 + * for architectural perfmon v4, also we unconditionally mask that bit in 204 + * the handle_pmi_common(), so it'll never be set in the overflow handling. 
205 + * 206 + * With this fake counter assigned, the guest LBR event user (such as KVM), 207 + * can program the LBR registers on its own, and we don't actually do anything 208 + * with then in the host context. 209 + */ 210 + #define INTEL_PMC_IDX_FIXED_VLBR (GLOBAL_STATUS_LBRS_FROZEN_BIT) 211 + 212 + /* 213 + * Pseudo-encoding the guest LBR event as event=0x00,umask=0x1b, 214 + * since it would claim bit 58 which is effectively Fixed26. 215 + */ 216 + #define INTEL_FIXED_VLBR_EVENT 0x1b00 237 217 238 218 /* 239 219 * Adaptive PEBS v4 ··· 280 220 281 221 struct pebs_xmm { 282 222 u64 xmm[16*2]; /* two entries for each register */ 283 - }; 284 - 285 - struct pebs_lbr_entry { 286 - u64 from, to, info; 287 - }; 288 - 289 - struct pebs_lbr { 290 - struct pebs_lbr_entry lbr[0]; /* Variable length */ 291 223 }; 292 224 293 225 /* ··· 385 333 u64 host, guest; 386 334 }; 387 335 336 + struct x86_pmu_lbr { 337 + unsigned int nr; 338 + unsigned int from; 339 + unsigned int to; 340 + unsigned int info; 341 + }; 342 + 388 343 extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); 389 344 extern void perf_check_microcode(void); 390 345 extern int x86_perf_rdpmc_index(struct perf_event *event); ··· 407 348 408 349 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) 409 350 extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); 351 + extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); 410 352 #else 411 353 static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) 412 354 { 413 355 *nr = 0; 414 356 return NULL; 357 + } 358 + static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) 359 + { 360 + return -1; 415 361 } 416 362 #endif 417 363
+36 -1
arch/x86/kernel/alternative.c
··· 3 3 4 4 #include <linux/module.h> 5 5 #include <linux/sched.h> 6 + #include <linux/perf_event.h> 6 7 #include <linux/mutex.h> 7 8 #include <linux/list.h> 8 9 #include <linux/stringify.h> ··· 1002 1001 s32 rel32; 1003 1002 u8 opcode; 1004 1003 const u8 text[POKE_MAX_OPCODE_SIZE]; 1004 + u8 old; 1005 1005 }; 1006 1006 1007 1007 struct bp_patching_desc { ··· 1170 1168 /* 1171 1169 * First step: add a int3 trap to the address that will be patched. 1172 1170 */ 1173 - for (i = 0; i < nr_entries; i++) 1171 + for (i = 0; i < nr_entries; i++) { 1172 + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); 1174 1173 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); 1174 + } 1175 1175 1176 1176 text_poke_sync(); 1177 1177 ··· 1181 1177 * Second step: update all but the first byte of the patched range. 1182 1178 */ 1183 1179 for (do_sync = 0, i = 0; i < nr_entries; i++) { 1180 + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; 1184 1181 int len = text_opcode_size(tp[i].opcode); 1185 1182 1186 1183 if (len - INT3_INSN_SIZE > 0) { 1184 + memcpy(old + INT3_INSN_SIZE, 1185 + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, 1186 + len - INT3_INSN_SIZE); 1187 1187 text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, 1188 1188 (const char *)tp[i].text + INT3_INSN_SIZE, 1189 1189 len - INT3_INSN_SIZE); 1190 1190 do_sync++; 1191 1191 } 1192 + 1193 + /* 1194 + * Emit a perf event to record the text poke, primarily to 1195 + * support Intel PT decoding which must walk the executable code 1196 + * to reconstruct the trace. The flow up to here is: 1197 + * - write INT3 byte 1198 + * - IPI-SYNC 1199 + * - write instruction tail 1200 + * At this point the actual control flow will be through the 1201 + * INT3 and handler and not hit the old or new instruction. 1202 + * Intel PT outputs FUP/TIP packets for the INT3, so the flow 1203 + * can still be decoded. 
Subsequently: 1204 + * - emit RECORD_TEXT_POKE with the new instruction 1205 + * - IPI-SYNC 1206 + * - write first byte 1207 + * - IPI-SYNC 1208 + * So before the text poke event timestamp, the decoder will see 1209 + * either the old instruction flow or FUP/TIP of INT3. After the 1210 + * text poke event timestamp, the decoder will see either the 1211 + * new instruction flow or FUP/TIP of INT3. Thus decoders can 1212 + * use the timestamp as the point at which to modify the 1213 + * executable code. 1214 + * The old instruction is recorded so that the event can be 1215 + * processed forwards or backwards. 1216 + */ 1217 + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, 1218 + tp[i].text, len); 1192 1219 } 1193 1220 1194 1221 if (do_sync) {
+39
arch/x86/kernel/fpu/core.c
··· 82 82 } 83 83 EXPORT_SYMBOL(irq_fpu_usable); 84 84 85 + /* 86 + * These must be called with preempt disabled. Returns 87 + * 'true' if the FPU state is still intact and we can 88 + * keep registers active. 89 + * 90 + * The legacy FNSAVE instruction cleared all FPU state 91 + * unconditionally, so registers are essentially destroyed. 92 + * Modern FPU state can be kept in registers, if there are 93 + * no pending FP exceptions. 94 + */ 95 + int copy_fpregs_to_fpstate(struct fpu *fpu) 96 + { 97 + if (likely(use_xsave())) { 98 + copy_xregs_to_kernel(&fpu->state.xsave); 99 + 100 + /* 101 + * AVX512 state is tracked here because its use is 102 + * known to slow the max clock speed of the core. 103 + */ 104 + if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) 105 + fpu->avx512_timestamp = jiffies; 106 + return 1; 107 + } 108 + 109 + if (likely(use_fxsr())) { 110 + copy_fxregs_to_kernel(fpu); 111 + return 1; 112 + } 113 + 114 + /* 115 + * Legacy FPU register saving, FNSAVE always clears FPU registers, 116 + * so we have to mark them inactive: 117 + */ 118 + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); 119 + 120 + return 0; 121 + } 122 + EXPORT_SYMBOL(copy_fpregs_to_fpstate); 123 + 85 124 void kernel_fpu_begin(void) 86 125 { 87 126 preempt_disable();
+83 -6
arch/x86/kernel/fpu/xstate.c
··· 233 233 /* 234 234 * MSR_IA32_XSS sets supervisor states managed by XSAVES. 235 235 */ 236 - if (boot_cpu_has(X86_FEATURE_XSAVES)) 237 - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); 236 + if (boot_cpu_has(X86_FEATURE_XSAVES)) { 237 + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 238 + xfeatures_mask_dynamic()); 239 + } 238 240 } 239 241 240 242 static bool xfeature_enabled(enum xfeature xfeature) ··· 488 486 return ebx; 489 487 } 490 488 491 - static int xfeature_size(int xfeature_nr) 489 + int xfeature_size(int xfeature_nr) 492 490 { 493 491 u32 eax, ebx, ecx, edx; 494 492 ··· 600 598 */ 601 599 if ((nr < XFEATURE_YMM) || 602 600 (nr >= XFEATURE_MAX) || 603 - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { 601 + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || 602 + ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) { 604 603 WARN_ONCE(1, "no structure for xstate: %d\n", nr); 605 604 XSTATE_WARN_ON(1); 606 605 } ··· 850 847 * Restore IA32_XSS. The same CPUID bit enumerates support 851 848 * of XSAVES and MSR_IA32_XSS. 852 849 */ 853 - if (boot_cpu_has(X86_FEATURE_XSAVES)) 854 - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); 850 + if (boot_cpu_has(X86_FEATURE_XSAVES)) { 851 + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 852 + xfeatures_mask_dynamic()); 853 + } 855 854 } 856 855 857 856 /* ··· 1359 1354 xbuf + xstate_supervisor_only_offsets[i], 1360 1355 xstate_sizes[i]); 1361 1356 } 1357 + } 1358 + 1359 + /** 1360 + * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to 1361 + * an xsave area 1362 + * @xstate: A pointer to an xsave area 1363 + * @mask: Represent the dynamic supervisor features saved into the xsave area 1364 + * 1365 + * Only the dynamic supervisor states sets in the mask are saved into the xsave 1366 + * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic 1367 + * supervisor feature). 
Besides the dynamic supervisor states, the legacy 1368 + * region and XSAVE header are also saved into the xsave area. The supervisor 1369 + * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and 1370 + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. 1371 + * 1372 + * The xsave area must be 64-bytes aligned. 1373 + */ 1374 + void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) 1375 + { 1376 + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; 1377 + u32 lmask, hmask; 1378 + int err; 1379 + 1380 + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) 1381 + return; 1382 + 1383 + if (WARN_ON_FPU(!dynamic_mask)) 1384 + return; 1385 + 1386 + lmask = dynamic_mask; 1387 + hmask = dynamic_mask >> 32; 1388 + 1389 + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); 1390 + 1391 + /* Should never fault when copying to a kernel buffer */ 1392 + WARN_ON_FPU(err); 1393 + } 1394 + 1395 + /** 1396 + * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from 1397 + * an xsave area 1398 + * @xstate: A pointer to an xsave area 1399 + * @mask: Represent the dynamic supervisor features restored from the xsave area 1400 + * 1401 + * Only the dynamic supervisor states sets in the mask are restored from the 1402 + * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of 1403 + * dynamic supervisor feature). Besides the dynamic supervisor states, the 1404 + * legacy region and XSAVE header are also restored from the xsave area. The 1405 + * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and 1406 + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. 1407 + * 1408 + * The xsave area must be 64-bytes aligned. 
1409 + */ 1410 + void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask) 1411 + { 1412 + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; 1413 + u32 lmask, hmask; 1414 + int err; 1415 + 1416 + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) 1417 + return; 1418 + 1419 + if (WARN_ON_FPU(!dynamic_mask)) 1420 + return; 1421 + 1422 + lmask = dynamic_mask; 1423 + hmask = dynamic_mask >> 32; 1424 + 1425 + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); 1426 + 1427 + /* Should never fault when copying from a kernel buffer */ 1428 + WARN_ON_FPU(err); 1362 1429 } 1363 1430 1364 1431 #ifdef CONFIG_PROC_PID_ARCH_STATUS
+14 -1
arch/x86/kernel/kprobes/core.c
··· 33 33 #include <linux/hardirq.h> 34 34 #include <linux/preempt.h> 35 35 #include <linux/sched/debug.h> 36 + #include <linux/perf_event.h> 36 37 #include <linux/extable.h> 37 38 #include <linux/kdebug.h> 38 39 #include <linux/kallsyms.h> ··· 473 472 /* Also, displacement change doesn't affect the first byte */ 474 473 p->opcode = buf[0]; 475 474 475 + p->ainsn.tp_len = len; 476 + perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len); 477 + 476 478 /* OK, write back the instruction(s) into ROX insn buffer */ 477 479 text_poke(p->ainsn.insn, buf, len); 478 480 ··· 507 503 508 504 void arch_arm_kprobe(struct kprobe *p) 509 505 { 510 - text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); 506 + u8 int3 = INT3_INSN_OPCODE; 507 + 508 + text_poke(p->addr, &int3, 1); 511 509 text_poke_sync(); 510 + perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); 512 511 } 513 512 514 513 void arch_disarm_kprobe(struct kprobe *p) 515 514 { 515 + u8 int3 = INT3_INSN_OPCODE; 516 + 517 + perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); 516 518 text_poke(p->addr, &p->opcode, 1); 517 519 text_poke_sync(); 518 520 } ··· 526 516 void arch_remove_kprobe(struct kprobe *p) 527 517 { 528 518 if (p->ainsn.insn) { 519 + /* Record the perf event before freeing the slot */ 520 + perf_event_text_poke(p->ainsn.insn, p->ainsn.insn, 521 + p->ainsn.tp_len, NULL, 0); 529 522 free_insn_slot(p->ainsn.insn, p->ainsn.boostable); 530 523 p->ainsn.insn = NULL; 531 524 }
+33 -5
arch/x86/kernel/kprobes/opt.c
··· 6 6 * Copyright (C) Hitachi Ltd., 2012 7 7 */ 8 8 #include <linux/kprobes.h> 9 + #include <linux/perf_event.h> 9 10 #include <linux/ptrace.h> 10 11 #include <linux/string.h> 11 12 #include <linux/slab.h> ··· 353 352 static 354 353 void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) 355 354 { 356 - if (op->optinsn.insn) { 357 - free_optinsn_slot(op->optinsn.insn, dirty); 355 + u8 *slot = op->optinsn.insn; 356 + if (slot) { 357 + int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; 358 + 359 + /* Record the perf event before freeing the slot */ 360 + if (dirty) 361 + perf_event_text_poke(slot, slot, len, NULL, 0); 362 + 363 + free_optinsn_slot(slot, dirty); 358 364 op->optinsn.insn = NULL; 359 365 op->optinsn.size = 0; 360 366 } ··· 432 424 (u8 *)op->kp.addr + op->optinsn.size); 433 425 len += JMP32_INSN_SIZE; 434 426 427 + /* 428 + * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also 429 + * used in __arch_remove_optimized_kprobe(). 430 + */ 431 + 435 432 /* We have to use text_poke() for instruction buffer because it is RO */ 433 + perf_event_text_poke(slot, NULL, 0, buf, len); 436 434 text_poke(slot, buf, len); 435 + 437 436 ret = 0; 438 437 out: 439 438 kfree(buf); ··· 492 477 */ 493 478 void arch_unoptimize_kprobe(struct optimized_kprobe *op) 494 479 { 495 - arch_arm_kprobe(&op->kp); 496 - text_poke(op->kp.addr + INT3_INSN_SIZE, 497 - op->optinsn.copied_insn, DISP32_SIZE); 480 + u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; 481 + u8 old[JMP32_INSN_SIZE]; 482 + u8 *addr = op->kp.addr; 483 + 484 + memcpy(old, op->kp.addr, JMP32_INSN_SIZE); 485 + memcpy(new + INT3_INSN_SIZE, 486 + op->optinsn.copied_insn, 487 + JMP32_INSN_SIZE - INT3_INSN_SIZE); 488 + 489 + text_poke(addr, new, INT3_INSN_SIZE); 498 490 text_poke_sync(); 491 + text_poke(addr + INT3_INSN_SIZE, 492 + new + INT3_INSN_SIZE, 493 + JMP32_INSN_SIZE - INT3_INSN_SIZE); 494 + text_poke_sync(); 495 + 496 + perf_event_text_poke(op->kp.addr, old, 
JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); 499 497 } 500 498 501 499 /*
+8 -4
include/linux/ftrace.h
··· 58 58 const char * 59 59 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, 60 60 unsigned long *off, char **modname, char *sym); 61 - int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, 62 - char *type, char *name, 63 - char *module_name, int *exported); 64 61 #else 65 62 static inline const char * 66 63 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, ··· 65 68 { 66 69 return NULL; 67 70 } 71 + #endif 72 + 73 + #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) 74 + int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, 75 + char *type, char *name, 76 + char *module_name, int *exported); 77 + #else 68 78 static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, 69 79 char *type, char *name, 70 80 char *module_name, int *exported) ··· 79 75 return -1; 80 76 } 81 77 #endif 82 - 83 78 84 79 #ifdef CONFIG_FUNCTION_TRACER 85 80 ··· 210 207 struct ftrace_ops_hash old_hash; 211 208 unsigned long trampoline; 212 209 unsigned long trampoline_size; 210 + struct list_head list; 213 211 #endif 214 212 }; 215 213
+15
include/linux/kprobes.h
··· 242 242 struct mutex mutex; 243 243 void *(*alloc)(void); /* allocate insn page */ 244 244 void (*free)(void *); /* free insn page */ 245 + const char *sym; /* symbol for insn pages */ 245 246 struct list_head pages; /* list of kprobe_insn_page */ 246 247 size_t insn_size; /* size of instruction slot */ 247 248 int nr_garbage; ··· 273 272 { \ 274 273 return __is_insn_slot_addr(&kprobe_##__name##_slots, addr); \ 275 274 } 275 + #define KPROBE_INSN_PAGE_SYM "kprobe_insn_page" 276 + #define KPROBE_OPTINSN_PAGE_SYM "kprobe_optinsn_page" 277 + int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, 278 + unsigned long *value, char *type, char *sym); 276 279 #else /* __ARCH_WANT_KPROBES_INSN_SLOT */ 277 280 #define DEFINE_INSN_CACHE_OPS(__name) \ 278 281 static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ ··· 382 377 void *alloc_insn_page(void); 383 378 void free_insn_page(void *page); 384 379 380 + int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, 381 + char *sym); 382 + 383 + int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, 384 + char *type, char *sym); 385 385 #else /* !CONFIG_KPROBES: */ 386 386 387 387 static inline int kprobes_built_in(void) ··· 448 438 static inline bool within_kprobe_blacklist(unsigned long addr) 449 439 { 450 440 return true; 441 + } 442 + static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, 443 + char *type, char *sym) 444 + { 445 + return -ERANGE; 451 446 } 452 447 #endif /* CONFIG_KPROBES */ 453 448 static inline int disable_kretprobe(struct kretprobe *rp)
+12 -3
include/linux/perf_event.h
··· 366 366 * ->stop() with PERF_EF_UPDATE will read the counter and update 367 367 * period/count values like ->read() would. 368 368 * 369 - * ->start() with PERF_EF_RELOAD will reprogram the the counter 369 + * ->start() with PERF_EF_RELOAD will reprogram the counter 370 370 * value, must be preceded by a ->stop() with PERF_EF_UPDATE. 371 371 */ 372 372 void (*start) (struct perf_event *event, int flags); ··· 419 419 */ 420 420 void (*sched_task) (struct perf_event_context *ctx, 421 421 bool sched_in); 422 + 422 423 /* 423 - * PMU specific data size 424 + * Kmem cache of PMU specific data 424 425 */ 425 - size_t task_ctx_size; 426 + struct kmem_cache *task_ctx_cache; 426 427 427 428 /* 428 429 * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) ··· 1233 1232 extern void perf_event_comm(struct task_struct *tsk, bool exec); 1234 1233 extern void perf_event_namespaces(struct task_struct *tsk); 1235 1234 extern void perf_event_fork(struct task_struct *tsk); 1235 + extern void perf_event_text_poke(const void *addr, 1236 + const void *old_bytes, size_t old_len, 1237 + const void *new_bytes, size_t new_len); 1236 1238 1237 1239 /* Callchains */ 1238 1240 DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); ··· 1483 1479 static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } 1484 1480 static inline void perf_event_namespaces(struct task_struct *tsk) { } 1485 1481 static inline void perf_event_fork(struct task_struct *tsk) { } 1482 + static inline void perf_event_text_poke(const void *addr, 1483 + const void *old_bytes, 1484 + size_t old_len, 1485 + const void *new_bytes, 1486 + size_t new_len) { } 1486 1487 static inline void perf_event_init(void) { } 1487 1488 static inline int perf_swevent_get_recursion_context(void) { return -1; } 1488 1489 static inline void perf_swevent_put_recursion_context(int rctx) { }
+25 -1
include/uapi/linux/perf_event.h
··· 383 383 bpf_event : 1, /* include bpf events */ 384 384 aux_output : 1, /* generate AUX records instead of events */ 385 385 cgroup : 1, /* include cgroup events */ 386 - __reserved_1 : 31; 386 + text_poke : 1, /* include text poke events */ 387 + __reserved_1 : 30; 387 388 388 389 union { 389 390 __u32 wakeup_events; /* wakeup every n events */ ··· 1042 1041 */ 1043 1042 PERF_RECORD_CGROUP = 19, 1044 1043 1044 + /* 1045 + * Records changes to kernel text i.e. self-modified code. 'old_len' is 1046 + * the number of old bytes, 'new_len' is the number of new bytes. Either 1047 + * 'old_len' or 'new_len' may be zero to indicate, for example, the 1048 + * addition or removal of a trampoline. 'bytes' contains the old bytes 1049 + * followed immediately by the new bytes. 1050 + * 1051 + * struct { 1052 + * struct perf_event_header header; 1053 + * u64 addr; 1054 + * u16 old_len; 1055 + * u16 new_len; 1056 + * u8 bytes[]; 1057 + * struct sample_id sample_id; 1058 + * }; 1059 + */ 1060 + PERF_RECORD_TEXT_POKE = 20, 1061 + 1045 1062 PERF_RECORD_MAX, /* non-ABI */ 1046 1063 }; 1047 1064 1048 1065 enum perf_record_ksymbol_type { 1049 1066 PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, 1050 1067 PERF_RECORD_KSYMBOL_TYPE_BPF = 1, 1068 + /* 1069 + * Out of line code such as kprobe-replaced instructions or optimized 1070 + * kprobes or ftrace trampolines. 1071 + */ 1072 + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, 1051 1073 PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ 1052 1074 }; 1053 1075
+108 -7
kernel/events/core.c
··· 394 394 static atomic_t nr_ksymbol_events __read_mostly; 395 395 static atomic_t nr_bpf_events __read_mostly; 396 396 static atomic_t nr_cgroup_events __read_mostly; 397 + static atomic_t nr_text_poke_events __read_mostly; 397 398 398 399 static LIST_HEAD(pmus); 399 400 static DEFINE_MUTEX(pmus_lock); ··· 1238 1237 refcount_inc(&ctx->refcount); 1239 1238 } 1240 1239 1240 + static void *alloc_task_ctx_data(struct pmu *pmu) 1241 + { 1242 + if (pmu->task_ctx_cache) 1243 + return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); 1244 + 1245 + return NULL; 1246 + } 1247 + 1248 + static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) 1249 + { 1250 + if (pmu->task_ctx_cache && task_ctx_data) 1251 + kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); 1252 + } 1253 + 1241 1254 static void free_ctx(struct rcu_head *head) 1242 1255 { 1243 1256 struct perf_event_context *ctx; 1244 1257 1245 1258 ctx = container_of(head, struct perf_event_context, rcu_head); 1246 - kfree(ctx->task_ctx_data); 1259 + free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); 1247 1260 kfree(ctx); 1248 1261 } 1249 1262 ··· 4485 4470 goto errout; 4486 4471 4487 4472 if (event->attach_state & PERF_ATTACH_TASK_DATA) { 4488 - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); 4473 + task_ctx_data = alloc_task_ctx_data(pmu); 4489 4474 if (!task_ctx_data) { 4490 4475 err = -ENOMEM; 4491 4476 goto errout; ··· 4543 4528 } 4544 4529 } 4545 4530 4546 - kfree(task_ctx_data); 4531 + free_task_ctx_data(pmu, task_ctx_data); 4547 4532 return ctx; 4548 4533 4549 4534 errout: 4550 - kfree(task_ctx_data); 4535 + free_task_ctx_data(pmu, task_ctx_data); 4551 4536 return ERR_PTR(err); 4552 4537 } 4553 4538 ··· 4590 4575 if (attr->mmap || attr->mmap_data || attr->mmap2 || 4591 4576 attr->comm || attr->comm_exec || 4592 4577 attr->task || attr->ksymbol || 4593 - attr->context_switch || 4578 + attr->context_switch || attr->text_poke || 4594 4579 attr->bpf_event) 4595 4580 return true; 4596 4581 return 
false; ··· 4666 4651 atomic_dec(&nr_ksymbol_events); 4667 4652 if (event->attr.bpf_event) 4668 4653 atomic_dec(&nr_bpf_events); 4654 + if (event->attr.text_poke) 4655 + atomic_dec(&nr_text_poke_events); 4669 4656 4670 4657 if (dec) { 4671 4658 if (!atomic_add_unless(&perf_sched_count, -1, 1)) ··· 8645 8628 perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); 8646 8629 } 8647 8630 8631 + struct perf_text_poke_event { 8632 + const void *old_bytes; 8633 + const void *new_bytes; 8634 + size_t pad; 8635 + u16 old_len; 8636 + u16 new_len; 8637 + 8638 + struct { 8639 + struct perf_event_header header; 8640 + 8641 + u64 addr; 8642 + } event_id; 8643 + }; 8644 + 8645 + static int perf_event_text_poke_match(struct perf_event *event) 8646 + { 8647 + return event->attr.text_poke; 8648 + } 8649 + 8650 + static void perf_event_text_poke_output(struct perf_event *event, void *data) 8651 + { 8652 + struct perf_text_poke_event *text_poke_event = data; 8653 + struct perf_output_handle handle; 8654 + struct perf_sample_data sample; 8655 + u64 padding = 0; 8656 + int ret; 8657 + 8658 + if (!perf_event_text_poke_match(event)) 8659 + return; 8660 + 8661 + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); 8662 + 8663 + ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); 8664 + if (ret) 8665 + return; 8666 + 8667 + perf_output_put(&handle, text_poke_event->event_id); 8668 + perf_output_put(&handle, text_poke_event->old_len); 8669 + perf_output_put(&handle, text_poke_event->new_len); 8670 + 8671 + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); 8672 + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); 8673 + 8674 + if (text_poke_event->pad) 8675 + __output_copy(&handle, &padding, text_poke_event->pad); 8676 + 8677 + perf_event__output_id_sample(event, &handle, &sample); 8678 + 8679 + perf_output_end(&handle); 8680 + } 8681 + 8682 + void perf_event_text_poke(const void 
*addr, const void *old_bytes, 8683 + size_t old_len, const void *new_bytes, size_t new_len) 8684 + { 8685 + struct perf_text_poke_event text_poke_event; 8686 + size_t tot, pad; 8687 + 8688 + if (!atomic_read(&nr_text_poke_events)) 8689 + return; 8690 + 8691 + tot = sizeof(text_poke_event.old_len) + old_len; 8692 + tot += sizeof(text_poke_event.new_len) + new_len; 8693 + pad = ALIGN(tot, sizeof(u64)) - tot; 8694 + 8695 + text_poke_event = (struct perf_text_poke_event){ 8696 + .old_bytes = old_bytes, 8697 + .new_bytes = new_bytes, 8698 + .pad = pad, 8699 + .old_len = old_len, 8700 + .new_len = new_len, 8701 + .event_id = { 8702 + .header = { 8703 + .type = PERF_RECORD_TEXT_POKE, 8704 + .misc = PERF_RECORD_MISC_KERNEL, 8705 + .size = sizeof(text_poke_event.event_id) + tot + pad, 8706 + }, 8707 + .addr = (unsigned long)addr, 8708 + }, 8709 + }; 8710 + 8711 + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); 8712 + } 8713 + 8648 8714 void perf_event_itrace_started(struct perf_event *event) 8649 8715 { 8650 8716 event->attach_state |= PERF_ATTACH_ITRACE; ··· 11045 10945 atomic_inc(&nr_ksymbol_events); 11046 10946 if (event->attr.bpf_event) 11047 10947 atomic_inc(&nr_bpf_events); 10948 + if (event->attr.text_poke) 10949 + atomic_inc(&nr_text_poke_events); 11048 10950 11049 10951 if (inc) { 11050 10952 /* ··· 12511 12409 !child_ctx->task_ctx_data) { 12512 12410 struct pmu *pmu = child_event->pmu; 12513 12411 12514 - child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, 12515 - GFP_KERNEL); 12412 + child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); 12516 12413 if (!child_ctx->task_ctx_data) { 12517 12414 free_event(child_event); 12518 12415 return ERR_PTR(-ENOMEM);
+38 -4
kernel/kallsyms.c
··· 24 24 #include <linux/slab.h> 25 25 #include <linux/filter.h> 26 26 #include <linux/ftrace.h> 27 + #include <linux/kprobes.h> 27 28 #include <linux/compiler.h> 28 29 29 30 /* ··· 438 437 loff_t pos_arch_end; 439 438 loff_t pos_mod_end; 440 439 loff_t pos_ftrace_mod_end; 440 + loff_t pos_bpf_end; 441 441 unsigned long value; 442 442 unsigned int nameoff; /* If iterating in core kernel symbols. */ 443 443 char type; ··· 482 480 return 1; 483 481 } 484 482 483 + /* 484 + * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace 485 + * purposes. In that case "__builtin__ftrace" is used as a module name, even 486 + * though "__builtin__ftrace" is not a module. 487 + */ 485 488 static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) 486 489 { 487 490 int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, ··· 503 496 504 497 static int get_ksymbol_bpf(struct kallsym_iter *iter) 505 498 { 499 + int ret; 500 + 506 501 strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); 507 502 iter->exported = 0; 508 - return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, 509 - &iter->value, &iter->type, 510 - iter->name) < 0 ? 0 : 1; 503 + ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, 504 + &iter->value, &iter->type, 505 + iter->name); 506 + if (ret < 0) { 507 + iter->pos_bpf_end = iter->pos; 508 + return 0; 509 + } 510 + 511 + return 1; 512 + } 513 + 514 + /* 515 + * This uses "__builtin__kprobes" as a module name for symbols for pages 516 + * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a 517 + * module. 518 + */ 519 + static int get_ksymbol_kprobe(struct kallsym_iter *iter) 520 + { 521 + strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); 522 + iter->exported = 0; 523 + return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, 524 + &iter->value, &iter->type, 525 + iter->name) < 0 ? 0 : 1; 511 526 } 512 527 513 528 /* Returns space to next name. 
*/ ··· 556 527 iter->pos_arch_end = 0; 557 528 iter->pos_mod_end = 0; 558 529 iter->pos_ftrace_mod_end = 0; 530 + iter->pos_bpf_end = 0; 559 531 } 560 532 } 561 533 ··· 581 551 get_ksymbol_ftrace_mod(iter)) 582 552 return 1; 583 553 584 - return get_ksymbol_bpf(iter); 554 + if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && 555 + get_ksymbol_bpf(iter)) 556 + return 1; 557 + 558 + return get_ksymbol_kprobe(iter); 585 559 } 586 560 587 561 /* Returns false if pos at or past end of file. */
+57 -3
kernel/kprobes.c
··· 35 35 #include <linux/ftrace.h> 36 36 #include <linux/cpu.h> 37 37 #include <linux/jump_label.h> 38 + #include <linux/perf_event.h> 38 39 39 40 #include <asm/sections.h> 40 41 #include <asm/cacheflush.h> ··· 124 123 .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), 125 124 .alloc = alloc_insn_page, 126 125 .free = free_insn_page, 126 + .sym = KPROBE_INSN_PAGE_SYM, 127 127 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), 128 128 .insn_size = MAX_INSN_SIZE, 129 129 .nr_garbage = 0, ··· 190 188 kip->cache = c; 191 189 list_add_rcu(&kip->list, &c->pages); 192 190 slot = kip->insns; 191 + 192 + /* Record the perf ksymbol register event after adding the page */ 193 + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, 194 + PAGE_SIZE, false, c->sym); 193 195 out: 194 196 mutex_unlock(&c->mutex); 195 197 return slot; ··· 212 206 * next time somebody inserts a probe. 213 207 */ 214 208 if (!list_is_singular(&kip->list)) { 209 + /* 210 + * Record perf ksymbol unregister event before removing 211 + * the page. 
212 + */ 213 + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, 214 + (unsigned long)kip->insns, PAGE_SIZE, true, 215 + kip->cache->sym); 215 216 list_del_rcu(&kip->list); 216 217 synchronize_rcu(); 217 218 kip->cache->free(kip->insns); ··· 308 295 return ret; 309 296 } 310 297 298 + int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, 299 + unsigned long *value, char *type, char *sym) 300 + { 301 + struct kprobe_insn_page *kip; 302 + int ret = -ERANGE; 303 + 304 + rcu_read_lock(); 305 + list_for_each_entry_rcu(kip, &c->pages, list) { 306 + if ((*symnum)--) 307 + continue; 308 + strlcpy(sym, c->sym, KSYM_NAME_LEN); 309 + *type = 't'; 310 + *value = (unsigned long)kip->insns; 311 + ret = 0; 312 + break; 313 + } 314 + rcu_read_unlock(); 315 + 316 + return ret; 317 + } 318 + 311 319 #ifdef CONFIG_OPTPROBES 312 320 /* For optimized_kprobe buffer */ 313 321 struct kprobe_insn_cache kprobe_optinsn_slots = { 314 322 .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), 315 323 .alloc = alloc_insn_page, 316 324 .free = free_insn_page, 325 + .sym = KPROBE_OPTINSN_PAGE_SYM, 317 326 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), 318 327 /* .insn_size is initialized later */ 319 328 .nr_garbage = 0, ··· 598 563 mutex_lock(&kprobe_mutex); 599 564 cpus_read_lock(); 600 565 mutex_lock(&text_mutex); 601 - /* Lock modules while optimizing kprobes */ 602 - mutex_lock(&module_mutex); 603 566 604 567 /* 605 568 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) ··· 622 589 /* Step 4: Free cleaned kprobes after quiesence period */ 623 590 do_free_cleaned_kprobes(); 624 591 625 - mutex_unlock(&module_mutex); 626 592 mutex_unlock(&text_mutex); 627 593 cpus_read_unlock(); 628 594 ··· 2262 2230 static void kprobe_remove_ksym_blacklist(unsigned long entry) 2263 2231 { 2264 2232 kprobe_remove_area_blacklist(entry, entry + 1); 2233 + } 2234 + 2235 + int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, 2236 + 
char *type, char *sym) 2237 + { 2238 + return -ERANGE; 2239 + } 2240 + 2241 + int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, 2242 + char *sym) 2243 + { 2244 + #ifdef __ARCH_WANT_KPROBES_INSN_SLOT 2245 + if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym)) 2246 + return 0; 2247 + #ifdef CONFIG_OPTPROBES 2248 + if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym)) 2249 + return 0; 2250 + #endif 2251 + #endif 2252 + if (!arch_kprobe_get_kallsym(&symnum, value, type, sym)) 2253 + return 0; 2254 + return -ERANGE; 2265 2255 } 2266 2256 2267 2257 int __init __weak arch_populate_kprobe_blacklist(void)
+99 -2
kernel/trace/ftrace.c
··· 2764 2764 { 2765 2765 } 2766 2766 2767 + /* List of trace_ops that have allocated trampolines */ 2768 + static LIST_HEAD(ftrace_ops_trampoline_list); 2769 + 2770 + static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops) 2771 + { 2772 + lockdep_assert_held(&ftrace_lock); 2773 + list_add_rcu(&ops->list, &ftrace_ops_trampoline_list); 2774 + } 2775 + 2776 + static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) 2777 + { 2778 + lockdep_assert_held(&ftrace_lock); 2779 + list_del_rcu(&ops->list); 2780 + } 2781 + 2782 + /* 2783 + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols 2784 + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is 2785 + * not a module. 2786 + */ 2787 + #define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace" 2788 + #define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline" 2789 + 2790 + static void ftrace_trampoline_free(struct ftrace_ops *ops) 2791 + { 2792 + if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && 2793 + ops->trampoline) { 2794 + /* 2795 + * Record the text poke event before the ksymbol unregister 2796 + * event. 
2797 + */ 2798 + perf_event_text_poke((void *)ops->trampoline, 2799 + (void *)ops->trampoline, 2800 + ops->trampoline_size, NULL, 0); 2801 + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, 2802 + ops->trampoline, ops->trampoline_size, 2803 + true, FTRACE_TRAMPOLINE_SYM); 2804 + /* Remove from kallsyms after the perf events */ 2805 + ftrace_remove_trampoline_from_kallsyms(ops); 2806 + } 2807 + 2808 + arch_ftrace_trampoline_free(ops); 2809 + } 2810 + 2767 2811 static void ftrace_startup_enable(int command) 2768 2812 { 2769 2813 if (saved_ftrace_func != ftrace_trace_function) { ··· 2978 2934 synchronize_rcu_tasks(); 2979 2935 2980 2936 free_ops: 2981 - arch_ftrace_trampoline_free(ops); 2937 + ftrace_trampoline_free(ops); 2982 2938 } 2983 2939 2984 2940 return 0; ··· 6222 6178 unsigned int num_funcs; 6223 6179 }; 6224 6180 6181 + static int ftrace_get_trampoline_kallsym(unsigned int symnum, 6182 + unsigned long *value, char *type, 6183 + char *name, char *module_name, 6184 + int *exported) 6185 + { 6186 + struct ftrace_ops *op; 6187 + 6188 + list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) { 6189 + if (!op->trampoline || symnum--) 6190 + continue; 6191 + *value = op->trampoline; 6192 + *type = 't'; 6193 + strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); 6194 + strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); 6195 + *exported = 0; 6196 + return 0; 6197 + } 6198 + 6199 + return -ERANGE; 6200 + } 6201 + 6225 6202 #ifdef CONFIG_MODULES 6226 6203 6227 6204 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) ··· 6579 6514 { 6580 6515 struct ftrace_mod_map *mod_map; 6581 6516 struct ftrace_mod_func *mod_func; 6517 + int ret; 6582 6518 6583 6519 preempt_disable(); 6584 6520 list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { ··· 6606 6540 WARN_ON(1); 6607 6541 break; 6608 6542 } 6543 + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, 6544 + module_name, exported); 6609 6545 preempt_enable(); 6610 - 
return -ERANGE; 6546 + return ret; 6611 6547 } 6612 6548 6613 6549 #else ··· 6620 6552 unsigned long start, unsigned long end) 6621 6553 { 6622 6554 return NULL; 6555 + } 6556 + int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, 6557 + char *type, char *name, char *module_name, 6558 + int *exported) 6559 + { 6560 + int ret; 6561 + 6562 + preempt_disable(); 6563 + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, 6564 + module_name, exported); 6565 + preempt_enable(); 6566 + return ret; 6623 6567 } 6624 6568 #endif /* CONFIG_MODULES */ 6625 6569 ··· 6813 6733 6814 6734 static void ftrace_update_trampoline(struct ftrace_ops *ops) 6815 6735 { 6736 + unsigned long trampoline = ops->trampoline; 6737 + 6816 6738 arch_ftrace_update_trampoline(ops); 6739 + if (ops->trampoline && ops->trampoline != trampoline && 6740 + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { 6741 + /* Add to kallsyms before the perf events */ 6742 + ftrace_add_trampoline_to_kallsyms(ops); 6743 + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, 6744 + ops->trampoline, ops->trampoline_size, false, 6745 + FTRACE_TRAMPOLINE_SYM); 6746 + /* 6747 + * Record the perf text poke event after the ksymbol register 6748 + * event. 6749 + */ 6750 + perf_event_text_poke((void *)ops->trampoline, NULL, 0, 6751 + (void *)ops->trampoline, 6752 + ops->trampoline_size); 6753 + } 6817 6754 } 6818 6755 6819 6756 void ftrace_init_trace_array(struct trace_array *tr)