Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'perf-core-2025-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:
"Core & generic-arch updates:

- Add support for dynamic constraints and propagate them to the Intel
driver (Kan Liang)

- Fix & enhance driver-specific throttling support (Kan Liang)

- Record the sample's last_period before it is updated on the x86 and
PowerPC platforms (Mark Barnett)

- Make perf_pmu_unregister() usable (Peter Zijlstra)

- Unify perf_event_free_task() / perf_event_exit_task_context()
(Peter Zijlstra)

- Simplify perf_event_release_kernel() and perf_event_free_task()
(Peter Zijlstra)

- Allocate non-contiguous AUX pages by default (Yabin Cui)

Uprobes updates:

- Add support to emulate NOP instructions (Jiri Olsa)

- selftests/bpf: Add 5-byte NOP uprobe trigger benchmark (Jiri Olsa)

x86 Intel PMU enhancements:

- Support Intel Auto Counter Reload [ACR] (Kan Liang)

- Add PMU support for Clearwater Forest (Dapeng Mi)

- Arch-PEBS preparatory changes: (Dapeng Mi)
- Parse CPUID archPerfmonExt leaves for non-hybrid CPUs
- Decouple BTS initialization from PEBS initialization
- Introduce pairs of PEBS static calls

x86 AMD PMU enhancements:

- Use hrtimer for handling overflows in the AMD uncore driver
(Sandipan Das)

- Prevent UMC counters from saturating (Sandipan Das)

Fixes and cleanups:

- Fix put_ctx() ordering (Frederic Weisbecker)

- Fix irq work dereferencing garbage (Frederic Weisbecker)

- Misc fixes and cleanups (Changbin Du, Frederic Weisbecker, Ian
Rogers, Ingo Molnar, Kan Liang, Peter Zijlstra, Qing Wang, Sandipan
Das, Thorsten Blum)"

* tag 'perf-core-2025-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
perf/headers: Clean up <linux/perf_event.h> a bit
perf/uapi: Clean up <uapi/linux/perf_event.h> a bit
perf/uapi: Fix PERF_RECORD_SAMPLE comments in <uapi/linux/perf_event.h>
mips/perf: Remove driver-specific throttle support
xtensa/perf: Remove driver-specific throttle support
sparc/perf: Remove driver-specific throttle support
loongarch/perf: Remove driver-specific throttle support
csky/perf: Remove driver-specific throttle support
arc/perf: Remove driver-specific throttle support
alpha/perf: Remove driver-specific throttle support
perf/apple_m1: Remove driver-specific throttle support
perf/arm: Remove driver-specific throttle support
s390/perf: Remove driver-specific throttle support
powerpc/perf: Remove driver-specific throttle support
perf/x86/zhaoxin: Remove driver-specific throttle support
perf/x86/amd: Remove driver-specific throttle support
perf/x86/intel: Remove driver-specific throttle support
perf: Only dump the throttle log for the leader
perf: Fix the throttle logic for a group
perf/core: Add the is_event_in_freq_mode() helper to simplify the code
...

+1978 -1192
+3 -8
arch/alpha/kernel/perf_event.c
··· 852 852 alpha_perf_event_update(event, hwc, idx, alpha_pmu->pmc_max_period[idx]+1); 853 853 perf_sample_data_init(&data, 0, hwc->last_period); 854 854 855 - if (alpha_perf_event_set_period(event, hwc, idx)) { 856 - if (perf_event_overflow(event, &data, regs)) { 857 - /* Interrupts coming too quickly; "throttle" the 858 - * counter, i.e., disable it for a little while. 859 - */ 860 - alpha_pmu_stop(event, 0); 861 - } 862 - } 855 + if (alpha_perf_event_set_period(event, hwc, idx)) 856 + perf_event_overflow(event, &data, regs); 857 + 863 858 wrperfmon(PERFMON_CMD_ENABLE, cpuc->idx_mask); 864 859 865 860 return;
+2 -4
arch/arc/kernel/perf_event.c
··· 599 599 600 600 arc_perf_event_update(event, &event->hw, event->hw.idx); 601 601 perf_sample_data_init(&data, 0, hwc->last_period); 602 - if (arc_pmu_event_set_period(event)) { 603 - if (perf_event_overflow(event, &data, regs)) 604 - arc_pmu_stop(event, 0); 605 - } 602 + if (arc_pmu_event_set_period(event)) 603 + perf_event_overflow(event, &data, regs); 606 604 607 605 active_ints &= ~BIT(idx); 608 606 } while (active_ints);
+1 -2
arch/csky/kernel/perf_event.c
··· 1139 1139 perf_sample_data_init(&data, 0, hwc->last_period); 1140 1140 csky_pmu_event_set_period(event); 1141 1141 1142 - if (perf_event_overflow(event, &data, regs)) 1143 - csky_pmu_stop_event(event); 1142 + perf_event_overflow(event, &data, regs); 1144 1143 } 1145 1144 1146 1145 csky_pmu_enable(&csky_pmu.pmu);
+1 -2
arch/loongarch/kernel/perf_event.c
··· 479 479 if (!loongarch_pmu_event_set_period(event, hwc, idx)) 480 480 return; 481 481 482 - if (perf_event_overflow(event, data, regs)) 483 - loongarch_pmu_disable_event(idx); 482 + perf_event_overflow(event, data, regs); 484 483 } 485 484 486 485 static irqreturn_t pmu_handle_irq(int irq, void *dev)
+1 -2
arch/mips/kernel/perf_event_mipsxx.c
··· 791 791 if (!mipspmu_event_set_period(event, hwc, idx)) 792 792 return; 793 793 794 - if (perf_event_overflow(event, data, regs)) 795 - mipsxx_pmu_disable_event(idx); 794 + perf_event_overflow(event, data, regs); 796 795 } 797 796 798 797
+4 -5
arch/powerpc/perf/core-book3s.c
··· 2239 2239 struct pt_regs *regs) 2240 2240 { 2241 2241 u64 period = event->hw.sample_period; 2242 + const u64 last_period = event->hw.last_period; 2242 2243 s64 prev, delta, left; 2243 2244 int record = 0; 2244 2245 ··· 2321 2320 if (record) { 2322 2321 struct perf_sample_data data; 2323 2322 2324 - perf_sample_data_init(&data, ~0ULL, event->hw.last_period); 2323 + perf_sample_data_init(&data, ~0ULL, last_period); 2325 2324 2326 2325 if (event->attr.sample_type & PERF_SAMPLE_ADDR_TYPE) 2327 2326 perf_get_data_addr(event, regs, &data.addr); ··· 2344 2343 ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type); 2345 2344 data.sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; 2346 2345 } 2347 - if (perf_event_overflow(event, &data, regs)) 2348 - power_pmu_stop(event, 0); 2346 + perf_event_overflow(event, &data, regs); 2349 2347 } else if (period) { 2350 2348 /* Account for interrupt in case of invalid SIAR */ 2351 - if (perf_event_account_interrupt(event)) 2352 - power_pmu_stop(event, 0); 2349 + perf_event_account_interrupt(event); 2353 2350 } 2354 2351 } 2355 2352
+3 -3
arch/powerpc/perf/core-fsl-emb.c
··· 590 590 struct pt_regs *regs) 591 591 { 592 592 u64 period = event->hw.sample_period; 593 + const u64 last_period = event->hw.last_period; 593 594 s64 prev, delta, left; 594 595 int record = 0; 595 596 ··· 633 632 if (record) { 634 633 struct perf_sample_data data; 635 634 636 - perf_sample_data_init(&data, 0, event->hw.last_period); 635 + perf_sample_data_init(&data, 0, last_period); 637 636 638 - if (perf_event_overflow(event, &data, regs)) 639 - fsl_emb_pmu_stop(event, 0); 637 + perf_event_overflow(event, &data, regs); 640 638 } 641 639 } 642 640
-2
arch/s390/kernel/perf_cpum_cf.c
··· 980 980 } 981 981 982 982 overflow = perf_event_overflow(event, &data, &regs); 983 - if (overflow) 984 - event->pmu->stop(event, 0); 985 983 986 984 perf_event_update_userpage(event); 987 985 return overflow;
+1 -4
arch/s390/kernel/perf_cpum_sf.c
··· 1072 1072 overflow = 0; 1073 1073 if (perf_event_exclude(event, &regs, sde_regs)) 1074 1074 goto out; 1075 - if (perf_event_overflow(event, &data, &regs)) { 1076 - overflow = 1; 1077 - event->pmu->stop(event, 0); 1078 - } 1075 + overflow = perf_event_overflow(event, &data, &regs); 1079 1076 perf_event_update_userpage(event); 1080 1077 out: 1081 1078 return overflow;
+1 -2
arch/sparc/kernel/perf_event.c
··· 1668 1668 if (!sparc_perf_event_set_period(event, hwc, idx)) 1669 1669 continue; 1670 1670 1671 - if (perf_event_overflow(event, &data, regs)) 1672 - sparc_pmu_stop(event, 0); 1671 + perf_event_overflow(event, &data, regs); 1673 1672 } 1674 1673 1675 1674 finish_clock = sched_clock();
+1 -2
arch/x86/events/amd/core.c
··· 1003 1003 1004 1004 perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); 1005 1005 1006 - if (perf_event_overflow(event, &data, regs)) 1007 - x86_pmu_stop(event, 0); 1006 + perf_event_overflow(event, &data, regs); 1008 1007 } 1009 1008 1010 1009 /*
+1 -3
arch/x86/events/amd/ibs.c
··· 1373 1373 hwc->sample_period = perf_ibs->min_period; 1374 1374 1375 1375 out: 1376 - if (throttle) { 1377 - perf_ibs_stop(event, 0); 1378 - } else { 1376 + if (!throttle) { 1379 1377 if (perf_ibs == &perf_ibs_op) { 1380 1378 if (ibs_caps & IBS_CAPS_OPCNTEXT) { 1381 1379 new_config = period & IBS_OP_MAX_CNT_EXT_MASK;
+101 -2
arch/x86/events/amd/uncore.c
··· 21 21 #define NUM_COUNTERS_NB 4 22 22 #define NUM_COUNTERS_L2 4 23 23 #define NUM_COUNTERS_L3 6 24 + #define NUM_COUNTERS_MAX 64 24 25 25 26 #define RDPMC_BASE_NB 6 26 27 #define RDPMC_BASE_LLC 10 ··· 39 38 int refcnt; 40 39 int cpu; 41 40 struct perf_event **events; 42 - struct hlist_node node; 41 + unsigned long active_mask[BITS_TO_LONGS(NUM_COUNTERS_MAX)]; 42 + int nr_active; 43 + struct hrtimer hrtimer; 44 + u64 hrtimer_duration; 43 45 }; 44 46 45 47 struct amd_uncore_pmu { ··· 87 83 88 84 static struct amd_uncore uncores[UNCORE_TYPE_MAX]; 89 85 86 + /* Interval for hrtimer, defaults to 60000 milliseconds */ 87 + static unsigned int update_interval = 60 * MSEC_PER_SEC; 88 + module_param(update_interval, uint, 0444); 89 + 90 90 static struct amd_uncore_pmu *event_to_amd_uncore_pmu(struct perf_event *event) 91 91 { 92 92 return container_of(event->pmu, struct amd_uncore_pmu, pmu); 93 + } 94 + 95 + static enum hrtimer_restart amd_uncore_hrtimer(struct hrtimer *hrtimer) 96 + { 97 + struct amd_uncore_ctx *ctx; 98 + struct perf_event *event; 99 + int bit; 100 + 101 + ctx = container_of(hrtimer, struct amd_uncore_ctx, hrtimer); 102 + 103 + if (!ctx->nr_active || ctx->cpu != smp_processor_id()) 104 + return HRTIMER_NORESTART; 105 + 106 + for_each_set_bit(bit, ctx->active_mask, NUM_COUNTERS_MAX) { 107 + event = ctx->events[bit]; 108 + event->pmu->read(event); 109 + } 110 + 111 + hrtimer_forward_now(hrtimer, ns_to_ktime(ctx->hrtimer_duration)); 112 + return HRTIMER_RESTART; 113 + } 114 + 115 + static void amd_uncore_start_hrtimer(struct amd_uncore_ctx *ctx) 116 + { 117 + hrtimer_start(&ctx->hrtimer, ns_to_ktime(ctx->hrtimer_duration), 118 + HRTIMER_MODE_REL_PINNED_HARD); 119 + } 120 + 121 + static void amd_uncore_cancel_hrtimer(struct amd_uncore_ctx *ctx) 122 + { 123 + hrtimer_cancel(&ctx->hrtimer); 124 + } 125 + 126 + static void amd_uncore_init_hrtimer(struct amd_uncore_ctx *ctx) 127 + { 128 + hrtimer_setup(&ctx->hrtimer, amd_uncore_hrtimer, CLOCK_MONOTONIC, 
HRTIMER_MODE_REL_HARD); 93 129 } 94 130 95 131 static void amd_uncore_read(struct perf_event *event) ··· 162 118 163 119 static void amd_uncore_start(struct perf_event *event, int flags) 164 120 { 121 + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); 122 + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); 165 123 struct hw_perf_event *hwc = &event->hw; 124 + 125 + if (!ctx->nr_active++) 126 + amd_uncore_start_hrtimer(ctx); 166 127 167 128 if (flags & PERF_EF_RELOAD) 168 129 wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); 169 130 170 131 hwc->state = 0; 132 + __set_bit(hwc->idx, ctx->active_mask); 171 133 wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); 172 134 perf_event_update_userpage(event); 173 135 } 174 136 175 137 static void amd_uncore_stop(struct perf_event *event, int flags) 176 138 { 139 + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); 140 + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); 177 141 struct hw_perf_event *hwc = &event->hw; 178 142 179 143 wrmsrl(hwc->config_base, hwc->config); ··· 191 139 event->pmu->read(event); 192 140 hwc->state |= PERF_HES_UPTODATE; 193 141 } 142 + 143 + if (!--ctx->nr_active) 144 + amd_uncore_cancel_hrtimer(ctx); 145 + 146 + __clear_bit(hwc->idx, ctx->active_mask); 194 147 } 195 148 196 149 static int amd_uncore_add(struct perf_event *event, int flags) ··· 547 490 kfree(curr); 548 491 goto fail; 549 492 } 493 + 494 + amd_uncore_init_hrtimer(curr); 495 + curr->hrtimer_duration = (u64)update_interval * NSEC_PER_MSEC; 550 496 551 497 cpumask_set_cpu(cpu, &pmu->active_mask); 552 498 } ··· 940 880 941 881 static void amd_uncore_umc_start(struct perf_event *event, int flags) 942 882 { 883 + struct amd_uncore_pmu *pmu = event_to_amd_uncore_pmu(event); 884 + struct amd_uncore_ctx *ctx = *per_cpu_ptr(pmu->ctx, event->cpu); 943 885 struct hw_perf_event *hwc = &event->hw; 886 + 887 + if (!ctx->nr_active++) 888 + 
amd_uncore_start_hrtimer(ctx); 944 889 945 890 if (flags & PERF_EF_RELOAD) 946 891 wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); 947 892 948 893 hwc->state = 0; 894 + __set_bit(hwc->idx, ctx->active_mask); 949 895 wrmsrl(hwc->config_base, (hwc->config | AMD64_PERFMON_V2_ENABLE_UMC)); 950 896 perf_event_update_userpage(event); 897 + } 898 + 899 + static void amd_uncore_umc_read(struct perf_event *event) 900 + { 901 + struct hw_perf_event *hwc = &event->hw; 902 + u64 prev, new, shift; 903 + s64 delta; 904 + 905 + shift = COUNTER_SHIFT + 1; 906 + prev = local64_read(&hwc->prev_count); 907 + 908 + /* 909 + * UMC counters do not have RDPMC assignments. Read counts directly 910 + * from the corresponding PERF_CTR. 911 + */ 912 + rdmsrl(hwc->event_base, new); 913 + 914 + /* 915 + * Unlike the other uncore counters, UMC counters saturate and set the 916 + * Overflow bit (bit 48) on overflow. Since they do not roll over, 917 + * proactively reset the corresponding PERF_CTR when bit 47 is set so 918 + * that the counter never gets a chance to saturate. 919 + */ 920 + if (new & BIT_ULL(63 - COUNTER_SHIFT)) { 921 + wrmsrl(hwc->event_base, 0); 922 + local64_set(&hwc->prev_count, 0); 923 + } else { 924 + local64_set(&hwc->prev_count, new); 925 + } 926 + 927 + delta = (new << shift) - (prev << shift); 928 + delta >>= shift; 929 + local64_add(delta, &event->count); 951 930 } 952 931 953 932 static ··· 1067 968 .del = amd_uncore_del, 1068 969 .start = amd_uncore_umc_start, 1069 970 .stop = amd_uncore_stop, 1070 - .read = amd_uncore_read, 971 + .read = amd_uncore_umc_read, 1071 972 .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, 1072 973 .module = THIS_MODULE, 1073 974 };
+25 -12
arch/x86/events/core.c
··· 95 95 96 96 DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); 97 97 98 + DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); 99 + DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); 100 + DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); 101 + DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); 102 + 98 103 /* 99 104 * This one is magic, it will get called even when PMU init fails (because 100 105 * there is no PMU), in which case it should simply return NULL. ··· 679 674 event->hw.idx = -1; 680 675 event->hw.last_cpu = -1; 681 676 event->hw.last_tag = ~0ULL; 677 + event->hw.dyn_constraint = ~0ULL; 682 678 683 679 /* mark unused */ 684 680 event->hw.extra_reg.idx = EXTRA_REG_NONE; ··· 762 756 763 757 int is_x86_event(struct perf_event *event) 764 758 { 765 - int i; 766 - 767 - if (!is_hybrid()) 768 - return event->pmu == &pmu; 769 - 770 - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { 771 - if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) 772 - return true; 773 - } 759 + /* 760 + * For a non-hybrid platforms, the type of X86 pmu is 761 + * always PERF_TYPE_RAW. 762 + * For a hybrid platform, the PERF_PMU_CAP_EXTENDED_HW_TYPE 763 + * is a unique capability for the X86 PMU. 764 + * Use them to detect a X86 event. 
765 + */ 766 + if (event->pmu->type == PERF_TYPE_RAW || 767 + event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE) 768 + return true; 774 769 775 770 return false; 776 771 } ··· 1690 1683 struct cpu_hw_events *cpuc; 1691 1684 struct perf_event *event; 1692 1685 int idx, handled = 0; 1686 + u64 last_period; 1693 1687 u64 val; 1694 1688 1695 1689 cpuc = this_cpu_ptr(&cpu_hw_events); ··· 1710 1702 continue; 1711 1703 1712 1704 event = cpuc->events[idx]; 1705 + last_period = event->hw.last_period; 1713 1706 1714 1707 val = static_call(x86_pmu_update)(event); 1715 1708 if (val & (1ULL << (x86_pmu.cntval_bits - 1))) ··· 1724 1715 if (!static_call(x86_pmu_set_period)(event)) 1725 1716 continue; 1726 1717 1727 - perf_sample_data_init(&data, 0, event->hw.last_period); 1718 + perf_sample_data_init(&data, 0, last_period); 1728 1719 1729 1720 perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); 1730 1721 1731 - if (perf_event_overflow(event, &data, regs)) 1732 - x86_pmu_stop(event, 0); 1722 + perf_event_overflow(event, &data, regs); 1733 1723 } 1734 1724 1735 1725 if (handled) ··· 2054 2046 static_call_update(x86_pmu_filter, x86_pmu.filter); 2055 2047 2056 2048 static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); 2049 + 2050 + static_call_update(x86_pmu_pebs_enable, x86_pmu.pebs_enable); 2051 + static_call_update(x86_pmu_pebs_disable, x86_pmu.pebs_disable); 2052 + static_call_update(x86_pmu_pebs_enable_all, x86_pmu.pebs_enable_all); 2053 + static_call_update(x86_pmu_pebs_disable_all, x86_pmu.pebs_disable_all); 2057 2054 } 2058 2055 2059 2056 static void _x86_pmu_read(struct perf_event *event)
+77 -73
arch/x86/events/intel/bts.c
··· 80 80 bts_buffer_setup_aux(struct perf_event *event, void **pages, 81 81 int nr_pages, bool overwrite) 82 82 { 83 - struct bts_buffer *buf; 83 + struct bts_buffer *bb; 84 84 struct page *page; 85 85 int cpu = event->cpu; 86 86 int node = (cpu == -1) ? cpu : cpu_to_node(cpu); 87 87 unsigned long offset; 88 88 size_t size = nr_pages << PAGE_SHIFT; 89 - int pg, nbuf, pad; 89 + int pg, nr_buf, pad; 90 90 91 91 /* count all the high order buffers */ 92 - for (pg = 0, nbuf = 0; pg < nr_pages;) { 92 + for (pg = 0, nr_buf = 0; pg < nr_pages;) { 93 93 page = virt_to_page(pages[pg]); 94 94 pg += buf_nr_pages(page); 95 - nbuf++; 95 + nr_buf++; 96 96 } 97 97 98 98 /* 99 99 * to avoid interrupts in overwrite mode, only allow one physical 100 100 */ 101 - if (overwrite && nbuf > 1) 101 + if (overwrite && nr_buf > 1) 102 102 return NULL; 103 103 104 - buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); 105 - if (!buf) 104 + bb = kzalloc_node(struct_size(bb, buf, nr_buf), GFP_KERNEL, node); 105 + if (!bb) 106 106 return NULL; 107 107 108 - buf->nr_pages = nr_pages; 109 - buf->nr_bufs = nbuf; 110 - buf->snapshot = overwrite; 111 - buf->data_pages = pages; 112 - buf->real_size = size - size % BTS_RECORD_SIZE; 108 + bb->nr_pages = nr_pages; 109 + bb->nr_bufs = nr_buf; 110 + bb->snapshot = overwrite; 111 + bb->data_pages = pages; 112 + bb->real_size = size - size % BTS_RECORD_SIZE; 113 113 114 - for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { 114 + for (pg = 0, nr_buf = 0, offset = 0, pad = 0; nr_buf < bb->nr_bufs; nr_buf++) { 115 115 unsigned int __nr_pages; 116 116 117 117 page = virt_to_page(pages[pg]); 118 118 __nr_pages = buf_nr_pages(page); 119 - buf->buf[nbuf].page = page; 120 - buf->buf[nbuf].offset = offset; 121 - buf->buf[nbuf].displacement = (pad ? 
BTS_RECORD_SIZE - pad : 0); 122 - buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; 123 - pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; 124 - buf->buf[nbuf].size -= pad; 119 + bb->buf[nr_buf].page = page; 120 + bb->buf[nr_buf].offset = offset; 121 + bb->buf[nr_buf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); 122 + bb->buf[nr_buf].size = buf_size(page) - bb->buf[nr_buf].displacement; 123 + pad = bb->buf[nr_buf].size % BTS_RECORD_SIZE; 124 + bb->buf[nr_buf].size -= pad; 125 125 126 126 pg += __nr_pages; 127 127 offset += __nr_pages << PAGE_SHIFT; 128 128 } 129 129 130 - return buf; 130 + return bb; 131 131 } 132 132 133 133 static void bts_buffer_free_aux(void *data) ··· 135 135 kfree(data); 136 136 } 137 137 138 - static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) 138 + static unsigned long bts_buffer_offset(struct bts_buffer *bb, unsigned int idx) 139 139 { 140 - return buf->buf[idx].offset + buf->buf[idx].displacement; 140 + return bb->buf[idx].offset + bb->buf[idx].displacement; 141 141 } 142 142 143 143 static void 144 - bts_config_buffer(struct bts_buffer *buf) 144 + bts_config_buffer(struct bts_buffer *bb) 145 145 { 146 146 int cpu = raw_smp_processor_id(); 147 147 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 148 - struct bts_phys *phys = &buf->buf[buf->cur_buf]; 148 + struct bts_phys *phys = &bb->buf[bb->cur_buf]; 149 149 unsigned long index, thresh = 0, end = phys->size; 150 150 struct page *page = phys->page; 151 151 152 - index = local_read(&buf->head); 152 + index = local_read(&bb->head); 153 153 154 - if (!buf->snapshot) { 155 - if (buf->end < phys->offset + buf_size(page)) 156 - end = buf->end - phys->offset - phys->displacement; 154 + if (!bb->snapshot) { 155 + if (bb->end < phys->offset + buf_size(page)) 156 + end = bb->end - phys->offset - phys->displacement; 157 157 158 158 index -= phys->offset + phys->displacement; 159 159 ··· 168 168 ds->bts_buffer_base = 
(u64)(long)page_address(page) + phys->displacement; 169 169 ds->bts_index = ds->bts_buffer_base + index; 170 170 ds->bts_absolute_maximum = ds->bts_buffer_base + end; 171 - ds->bts_interrupt_threshold = !buf->snapshot 171 + ds->bts_interrupt_threshold = !bb->snapshot 172 172 ? ds->bts_buffer_base + thresh 173 173 : ds->bts_absolute_maximum + BTS_RECORD_SIZE; 174 174 } ··· 184 184 { 185 185 int cpu = raw_smp_processor_id(); 186 186 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 187 - struct bts_buffer *buf = perf_get_aux(&bts->handle); 187 + struct bts_buffer *bb = perf_get_aux(&bts->handle); 188 188 unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; 189 189 190 - if (!buf) 190 + if (!bb) 191 191 return; 192 192 193 - head = index + bts_buffer_offset(buf, buf->cur_buf); 194 - old = local_xchg(&buf->head, head); 193 + head = index + bts_buffer_offset(bb, bb->cur_buf); 194 + old = local_xchg(&bb->head, head); 195 195 196 - if (!buf->snapshot) { 196 + if (!bb->snapshot) { 197 197 if (old == head) 198 198 return; 199 199 ··· 205 205 * old and head are always in the same physical buffer, so we 206 206 * can subtract them to get the data size. 
207 207 */ 208 - local_add(head - old, &buf->data_size); 208 + local_add(head - old, &bb->data_size); 209 209 } else { 210 - local_set(&buf->data_size, head); 210 + local_set(&bb->data_size, head); 211 211 } 212 212 213 213 /* ··· 218 218 } 219 219 220 220 static int 221 - bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); 221 + bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle); 222 222 223 223 /* 224 224 * Ordering PMU callbacks wrt themselves and the PMI is done by means ··· 232 232 static void __bts_event_start(struct perf_event *event) 233 233 { 234 234 struct bts_ctx *bts = this_cpu_ptr(bts_ctx); 235 - struct bts_buffer *buf = perf_get_aux(&bts->handle); 235 + struct bts_buffer *bb = perf_get_aux(&bts->handle); 236 236 u64 config = 0; 237 237 238 - if (!buf->snapshot) 238 + if (!bb->snapshot) 239 239 config |= ARCH_PERFMON_EVENTSEL_INT; 240 240 if (!event->attr.exclude_kernel) 241 241 config |= ARCH_PERFMON_EVENTSEL_OS; 242 242 if (!event->attr.exclude_user) 243 243 config |= ARCH_PERFMON_EVENTSEL_USR; 244 244 245 - bts_config_buffer(buf); 245 + bts_config_buffer(bb); 246 246 247 247 /* 248 248 * local barrier to make sure that ds configuration made it ··· 261 261 { 262 262 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 263 263 struct bts_ctx *bts = this_cpu_ptr(bts_ctx); 264 - struct bts_buffer *buf; 264 + struct bts_buffer *bb; 265 265 266 - buf = perf_aux_output_begin(&bts->handle, event); 267 - if (!buf) 266 + bb = perf_aux_output_begin(&bts->handle, event); 267 + if (!bb) 268 268 goto fail_stop; 269 269 270 - if (bts_buffer_reset(buf, &bts->handle)) 270 + if (bts_buffer_reset(bb, &bts->handle)) 271 271 goto fail_end_stop; 272 272 273 273 bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; ··· 306 306 { 307 307 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 308 308 struct bts_ctx *bts = this_cpu_ptr(bts_ctx); 309 - struct bts_buffer *buf = NULL; 309 + struct bts_buffer *bb = 
NULL; 310 310 int state = READ_ONCE(bts->state); 311 311 312 312 if (state == BTS_STATE_ACTIVE) 313 313 __bts_event_stop(event, BTS_STATE_STOPPED); 314 314 315 315 if (state != BTS_STATE_STOPPED) 316 - buf = perf_get_aux(&bts->handle); 316 + bb = perf_get_aux(&bts->handle); 317 317 318 318 event->hw.state |= PERF_HES_STOPPED; 319 319 320 320 if (flags & PERF_EF_UPDATE) { 321 321 bts_update(bts); 322 322 323 - if (buf) { 324 - if (buf->snapshot) 323 + if (bb) { 324 + if (bb->snapshot) 325 325 bts->handle.head = 326 - local_xchg(&buf->data_size, 327 - buf->nr_pages << PAGE_SHIFT); 326 + local_xchg(&bb->data_size, 327 + bb->nr_pages << PAGE_SHIFT); 328 328 perf_aux_output_end(&bts->handle, 329 - local_xchg(&buf->data_size, 0)); 329 + local_xchg(&bb->data_size, 0)); 330 330 } 331 331 332 332 cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; ··· 382 382 } 383 383 384 384 static int 385 - bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) 385 + bts_buffer_reset(struct bts_buffer *bb, struct perf_output_handle *handle) 386 386 { 387 387 unsigned long head, space, next_space, pad, gap, skip, wakeup; 388 388 unsigned int next_buf; 389 389 struct bts_phys *phys, *next_phys; 390 390 int ret; 391 391 392 - if (buf->snapshot) 392 + if (bb->snapshot) 393 393 return 0; 394 394 395 - head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); 395 + head = handle->head & ((bb->nr_pages << PAGE_SHIFT) - 1); 396 396 397 - phys = &buf->buf[buf->cur_buf]; 397 + phys = &bb->buf[bb->cur_buf]; 398 398 space = phys->offset + phys->displacement + phys->size - head; 399 399 pad = space; 400 400 if (space > handle->size) { ··· 403 403 } 404 404 if (space <= BTS_SAFETY_MARGIN) { 405 405 /* See if next phys buffer has more space */ 406 - next_buf = buf->cur_buf + 1; 407 - if (next_buf >= buf->nr_bufs) 406 + next_buf = bb->cur_buf + 1; 407 + if (next_buf >= bb->nr_bufs) 408 408 next_buf = 0; 409 - next_phys = &buf->buf[next_buf]; 409 + next_phys = &bb->buf[next_buf]; 
410 410 gap = buf_size(phys->page) - phys->displacement - phys->size + 411 411 next_phys->displacement; 412 412 skip = pad + gap; ··· 431 431 * anymore, so we must not be racing with 432 432 * bts_update(). 433 433 */ 434 - buf->cur_buf = next_buf; 435 - local_set(&buf->head, head); 434 + bb->cur_buf = next_buf; 435 + local_set(&bb->head, head); 436 436 } 437 437 } 438 438 } ··· 445 445 space -= space % BTS_RECORD_SIZE; 446 446 } 447 447 448 - buf->end = head + space; 448 + bb->end = head + space; 449 449 450 450 /* 451 451 * If we have no space, the lost notification would have been sent when ··· 462 462 struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; 463 463 struct bts_ctx *bts; 464 464 struct perf_event *event; 465 - struct bts_buffer *buf; 465 + struct bts_buffer *bb; 466 466 s64 old_head; 467 467 int err = -ENOSPC, handled = 0; 468 468 ··· 485 485 if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) 486 486 return handled; 487 487 488 - buf = perf_get_aux(&bts->handle); 489 - if (!buf) 488 + bb = perf_get_aux(&bts->handle); 489 + if (!bb) 490 490 return handled; 491 491 492 492 /* ··· 494 494 * there's no other way of telling, because the pointer will 495 495 * keep moving 496 496 */ 497 - if (buf->snapshot) 497 + if (bb->snapshot) 498 498 return 0; 499 499 500 - old_head = local_read(&buf->head); 500 + old_head = local_read(&bb->head); 501 501 bts_update(bts); 502 502 503 503 /* no new data */ 504 - if (old_head == local_read(&buf->head)) 504 + if (old_head == local_read(&bb->head)) 505 505 return handled; 506 506 507 - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0)); 507 + perf_aux_output_end(&bts->handle, local_xchg(&bb->data_size, 0)); 508 508 509 - buf = perf_aux_output_begin(&bts->handle, event); 510 - if (buf) 511 - err = bts_buffer_reset(buf, &bts->handle); 509 + bb = perf_aux_output_begin(&bts->handle, event); 510 + if (bb) 511 + err = bts_buffer_reset(bb, &bts->handle); 512 512 513 513 if (err) { 514 514 WRITE_ONCE(bts->state, 
BTS_STATE_STOPPED); 515 515 516 - if (buf) { 516 + if (bb) { 517 517 /* 518 518 * BTS_STATE_STOPPED should be visible before 519 519 * cleared handle::event ··· 599 599 600 600 static __init int bts_init(void) 601 601 { 602 - if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) 602 + if (!boot_cpu_has(X86_FEATURE_DTES64)) 603 + return -ENODEV; 604 + 605 + x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); 606 + if (!x86_pmu.bts) 603 607 return -ENODEV; 604 608 605 609 if (boot_cpu_has(X86_FEATURE_PTI)) {
+319 -26
arch/x86/events/intel/core.c
··· 2224 2224 EVENT_EXTRA_END 2225 2225 }; 2226 2226 2227 + EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_skt, "event=0x9c,umask=0x01"); 2228 + EVENT_ATTR_STR(topdown-retiring, td_retiring_skt, "event=0xc2,umask=0x02"); 2229 + EVENT_ATTR_STR(topdown-be-bound, td_be_bound_skt, "event=0xa4,umask=0x02"); 2230 + 2231 + static struct attribute *skt_events_attrs[] = { 2232 + EVENT_PTR(td_fe_bound_skt), 2233 + EVENT_PTR(td_retiring_skt), 2234 + EVENT_PTR(td_bad_spec_cmt), 2235 + EVENT_PTR(td_be_bound_skt), 2236 + NULL, 2237 + }; 2238 + 2227 2239 #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ 2228 2240 #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ 2229 2241 #define KNL_MCDRAM_LOCAL BIT_ULL(21) ··· 2306 2294 static __always_inline void intel_pmu_disable_all(void) 2307 2295 { 2308 2296 __intel_pmu_disable_all(true); 2309 - intel_pmu_pebs_disable_all(); 2297 + static_call_cond(x86_pmu_pebs_disable_all)(); 2310 2298 intel_pmu_lbr_disable_all(); 2311 2299 } 2312 2300 ··· 2338 2326 2339 2327 static void intel_pmu_enable_all(int added) 2340 2328 { 2341 - intel_pmu_pebs_enable_all(); 2329 + static_call_cond(x86_pmu_pebs_enable_all)(); 2342 2330 __intel_pmu_enable_all(added, false); 2343 2331 } 2344 2332 ··· 2595 2583 * so we don't trigger the event without PEBS bit set. 
2596 2584 */ 2597 2585 if (unlikely(event->attr.precise_ip)) 2598 - intel_pmu_pebs_disable(event); 2586 + static_call(x86_pmu_pebs_disable)(event); 2599 2587 } 2600 2588 2601 2589 static void intel_pmu_assign_event(struct perf_event *event, int idx) ··· 2615 2603 intel_pmu_lbr_del(event); 2616 2604 if (event->attr.precise_ip) 2617 2605 intel_pmu_pebs_del(event); 2606 + if (is_pebs_counter_event_group(event) || 2607 + is_acr_event_group(event)) 2608 + this_cpu_ptr(&cpu_hw_events)->n_late_setup--; 2618 2609 } 2619 2610 2620 2611 static int icl_set_topdown_event_period(struct perf_event *event) ··· 2895 2880 cpuc->fixed_ctrl_val |= bits; 2896 2881 } 2897 2882 2883 + static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) 2884 + { 2885 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2886 + int msr_b, msr_c; 2887 + 2888 + if (!mask && !cpuc->acr_cfg_b[idx]) 2889 + return; 2890 + 2891 + if (idx < INTEL_PMC_IDX_FIXED) { 2892 + msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; 2893 + msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; 2894 + } else { 2895 + msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; 2896 + msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; 2897 + idx -= INTEL_PMC_IDX_FIXED; 2898 + } 2899 + 2900 + if (cpuc->acr_cfg_b[idx] != mask) { 2901 + wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); 2902 + cpuc->acr_cfg_b[idx] = mask; 2903 + } 2904 + /* Only need to update the reload value when there is a valid config value. */ 2905 + if (mask && cpuc->acr_cfg_c[idx] != reload) { 2906 + wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); 2907 + cpuc->acr_cfg_c[idx] = reload; 2908 + } 2909 + } 2910 + 2911 + static void intel_pmu_enable_acr(struct perf_event *event) 2912 + { 2913 + struct hw_perf_event *hwc = &event->hw; 2914 + 2915 + if (!is_acr_event_group(event) || !event->attr.config2) { 2916 + /* 2917 + * The disable doesn't clear the ACR CFG register. 2918 + * Check and clear the ACR CFG register. 
2919 + */ 2920 + intel_pmu_config_acr(hwc->idx, 0, 0); 2921 + return; 2922 + } 2923 + 2924 + intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period); 2925 + } 2926 + 2927 + DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); 2928 + 2898 2929 static void intel_pmu_enable_event(struct perf_event *event) 2899 2930 { 2900 2931 u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; ··· 2948 2887 int idx = hwc->idx; 2949 2888 2950 2889 if (unlikely(event->attr.precise_ip)) 2951 - intel_pmu_pebs_enable(event); 2890 + static_call(x86_pmu_pebs_enable)(event); 2952 2891 2953 2892 switch (idx) { 2954 2893 case 0 ... INTEL_PMC_IDX_FIXED - 1: 2955 2894 if (branch_sample_counters(event)) 2956 2895 enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; 2957 2896 intel_set_masks(event, idx); 2897 + static_call_cond(intel_pmu_enable_acr_event)(event); 2958 2898 __x86_pmu_enable_event(hwc, enable_mask); 2959 2899 break; 2960 2900 case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: 2901 + static_call_cond(intel_pmu_enable_acr_event)(event); 2902 + fallthrough; 2961 2903 case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: 2962 2904 intel_pmu_enable_fixed(event); 2963 2905 break; ··· 2978 2914 } 2979 2915 } 2980 2916 2917 + static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) 2918 + { 2919 + struct perf_event *event, *leader; 2920 + int i, j, idx; 2921 + 2922 + for (i = 0; i < cpuc->n_events; i++) { 2923 + leader = cpuc->event_list[i]; 2924 + if (!is_acr_event_group(leader)) 2925 + continue; 2926 + 2927 + /* The ACR events must be contiguous. 
*/ 2928 + for (j = i; j < cpuc->n_events; j++) { 2929 + event = cpuc->event_list[j]; 2930 + if (event->group_leader != leader->group_leader) 2931 + break; 2932 + for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { 2933 + if (WARN_ON_ONCE(i + idx > cpuc->n_events)) 2934 + return; 2935 + __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); 2936 + } 2937 + } 2938 + i = j - 1; 2939 + } 2940 + } 2941 + 2942 + void intel_pmu_late_setup(void) 2943 + { 2944 + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 2945 + 2946 + if (!cpuc->n_late_setup) 2947 + return; 2948 + 2949 + intel_pmu_pebs_late_setup(cpuc); 2950 + intel_pmu_acr_late_setup(cpuc); 2951 + } 2952 + 2981 2953 static void intel_pmu_add_event(struct perf_event *event) 2982 2954 { 2983 2955 if (event->attr.precise_ip) 2984 2956 intel_pmu_pebs_add(event); 2985 2957 if (intel_pmu_needs_branch_stack(event)) 2986 2958 intel_pmu_lbr_add(event); 2959 + if (is_pebs_counter_event_group(event) || 2960 + is_acr_event_group(event)) 2961 + this_cpu_ptr(&cpu_hw_events)->n_late_setup++; 2987 2962 } 2988 2963 2989 2964 /* ··· 3138 3035 continue; 3139 3036 3140 3037 perf_sample_data_init(data, 0, event->hw.last_period); 3141 - if (perf_event_overflow(event, data, regs)) 3142 - x86_pmu_stop(event, 0); 3038 + perf_event_overflow(event, data, regs); 3143 3039 3144 3040 /* Inject one fake event is enough. 
*/ 3145 3041 break; ··· 3243 3141 3244 3142 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 3245 3143 struct perf_event *event = cpuc->events[bit]; 3144 + u64 last_period; 3246 3145 3247 3146 handled++; 3248 3147 ··· 3271 3168 if (is_pebs_counter_event_group(event)) 3272 3169 x86_pmu.drain_pebs(regs, &data); 3273 3170 3171 + last_period = event->hw.last_period; 3172 + 3274 3173 if (!intel_pmu_save_and_restart(event)) 3275 3174 continue; 3276 3175 3277 - perf_sample_data_init(&data, 0, event->hw.last_period); 3176 + perf_sample_data_init(&data, 0, last_period); 3278 3177 3279 3178 if (has_branch_stack(event)) 3280 3179 intel_pmu_lbr_save_brstack(&data, cpuc, event); 3281 3180 3282 - if (perf_event_overflow(event, &data, regs)) 3283 - x86_pmu_stop(event, 0); 3181 + perf_event_overflow(event, &data, regs); 3284 3182 } 3285 3183 3286 3184 return handled; ··· 3843 3739 if (cpuc->excl_cntrs) 3844 3740 return intel_get_excl_constraints(cpuc, event, idx, c2); 3845 3741 3846 - /* Not all counters support the branch counter feature. 
*/ 3847 - if (branch_sample_counters(event)) { 3742 + if (event->hw.dyn_constraint != ~0ULL) { 3848 3743 c2 = dyn_constraint(cpuc, c2, idx); 3849 - c2->idxmsk64 &= x86_pmu.lbr_counters; 3744 + c2->idxmsk64 &= event->hw.dyn_constraint; 3850 3745 c2->weight = hweight64(c2->idxmsk64); 3851 3746 } 3852 3747 ··· 4186 4083 return start; 4187 4084 } 4188 4085 4086 + static inline bool intel_pmu_has_acr(struct pmu *pmu) 4087 + { 4088 + return !!hybrid(pmu, acr_cause_mask64); 4089 + } 4090 + 4091 + static bool intel_pmu_is_acr_group(struct perf_event *event) 4092 + { 4093 + /* The group leader has the ACR flag set */ 4094 + if (is_acr_event_group(event)) 4095 + return true; 4096 + 4097 + /* The acr_mask is set */ 4098 + if (event->attr.config2) 4099 + return true; 4100 + 4101 + return false; 4102 + } 4103 + 4104 + static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, 4105 + u64 *cause_mask, int *num) 4106 + { 4107 + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64); 4108 + *cause_mask |= event->attr.config2; 4109 + *num += 1; 4110 + } 4111 + 4112 + static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event, 4113 + int idx, u64 cause_mask) 4114 + { 4115 + if (test_bit(idx, (unsigned long *)&cause_mask)) 4116 + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cause_mask64); 4117 + } 4118 + 4189 4119 static int intel_pmu_hw_config(struct perf_event *event) 4190 4120 { 4191 4121 int ret = x86_pmu_hw_config(event); ··· 4280 4144 leader = event->group_leader; 4281 4145 if (branch_sample_call_stack(leader)) 4282 4146 return -EINVAL; 4283 - if (branch_sample_counters(leader)) 4147 + if (branch_sample_counters(leader)) { 4284 4148 num++; 4149 + leader->hw.dyn_constraint &= x86_pmu.lbr_counters; 4150 + } 4285 4151 leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS; 4286 4152 4287 4153 for_each_sibling_event(sibling, leader) { 4288 4154 if (branch_sample_call_stack(sibling)) 4289 4155 return -EINVAL; 4290 - if 
(branch_sample_counters(sibling)) 4156 + if (branch_sample_counters(sibling)) { 4291 4157 num++; 4158 + sibling->hw.dyn_constraint &= x86_pmu.lbr_counters; 4159 + } 4292 4160 } 4293 4161 4294 4162 if (num > fls(x86_pmu.lbr_counters)) ··· 4346 4206 is_sampling_event(event) && 4347 4207 event->attr.precise_ip) 4348 4208 event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; 4209 + 4210 + if (intel_pmu_has_acr(event->pmu) && intel_pmu_is_acr_group(event)) { 4211 + struct perf_event *sibling, *leader = event->group_leader; 4212 + struct pmu *pmu = event->pmu; 4213 + bool has_sw_event = false; 4214 + int num = 0, idx = 0; 4215 + u64 cause_mask = 0; 4216 + 4217 + /* Not support perf metrics */ 4218 + if (is_metric_event(event)) 4219 + return -EINVAL; 4220 + 4221 + /* Not support freq mode */ 4222 + if (event->attr.freq) 4223 + return -EINVAL; 4224 + 4225 + /* PDist is not supported */ 4226 + if (event->attr.config2 && event->attr.precise_ip > 2) 4227 + return -EINVAL; 4228 + 4229 + /* The reload value cannot exceeds the max period */ 4230 + if (event->attr.sample_period > x86_pmu.max_period) 4231 + return -EINVAL; 4232 + /* 4233 + * The counter-constraints of each event cannot be finalized 4234 + * unless the whole group is scanned. However, it's hard 4235 + * to know whether the event is the last one of the group. 4236 + * Recalculate the counter-constraints for each event when 4237 + * adding a new event. 4238 + * 4239 + * The group is traversed twice, which may be optimized later. 4240 + * In the first round, 4241 + * - Find all events which do reload when other events 4242 + * overflow and set the corresponding counter-constraints 4243 + * - Add all events, which can cause other events reload, 4244 + * in the cause_mask 4245 + * - Error out if the number of events exceeds the HW limit 4246 + * - The ACR events must be contiguous. 4247 + * Error out if there are non-X86 events between ACR events. 4248 + * This is not a HW limit, but a SW limit. 
4249 + * With the assumption, the intel_pmu_acr_late_setup() can 4250 + * easily convert the event idx to counter idx without 4251 + * traversing the whole event list. 4252 + */ 4253 + if (!is_x86_event(leader)) 4254 + return -EINVAL; 4255 + 4256 + if (leader->attr.config2) 4257 + intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num); 4258 + 4259 + if (leader->nr_siblings) { 4260 + for_each_sibling_event(sibling, leader) { 4261 + if (!is_x86_event(sibling)) { 4262 + has_sw_event = true; 4263 + continue; 4264 + } 4265 + if (!sibling->attr.config2) 4266 + continue; 4267 + if (has_sw_event) 4268 + return -EINVAL; 4269 + intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num); 4270 + } 4271 + } 4272 + if (leader != event && event->attr.config2) { 4273 + if (has_sw_event) 4274 + return -EINVAL; 4275 + intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num); 4276 + } 4277 + 4278 + if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) || 4279 + num > hweight64(hybrid(event->pmu, acr_cntr_mask64))) 4280 + return -EINVAL; 4281 + /* 4282 + * In the second round, apply the counter-constraints for 4283 + * the events which can cause other events reload. 
4284 + */ 4285 + intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask); 4286 + 4287 + if (leader->nr_siblings) { 4288 + for_each_sibling_event(sibling, leader) 4289 + intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask); 4290 + } 4291 + 4292 + if (leader != event) 4293 + intel_pmu_set_acr_caused_constr(event, idx, cause_mask); 4294 + 4295 + leader->hw.flags |= PERF_X86_EVENT_ACR; 4296 + } 4349 4297 4350 4298 if ((event->attr.type == PERF_TYPE_HARDWARE) || 4351 4299 (event->attr.type == PERF_TYPE_HW_CACHE)) ··· 4582 4354 .guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask, 4583 4355 }; 4584 4356 4585 - if (!x86_pmu.pebs) 4357 + if (!x86_pmu.ds_pebs) 4586 4358 return arr; 4587 4359 4588 4360 /* ··· 5180 4952 goto err; 5181 4953 } 5182 4954 5183 - if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) { 4955 + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_DYN_CONSTRAINT)) { 5184 4956 size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); 5185 4957 5186 4958 cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); ··· 5269 5041 return false; 5270 5042 } 5271 5043 5272 - static void update_pmu_cap(struct x86_hybrid_pmu *pmu) 5044 + static void update_pmu_cap(struct pmu *pmu) 5273 5045 { 5274 5046 unsigned int cntr, fixed_cntr, ecx, edx; 5275 5047 union cpuid35_eax eax; ··· 5278 5050 cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); 5279 5051 5280 5052 if (ebx.split.umask2) 5281 - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; 5053 + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; 5282 5054 if (ebx.split.eq) 5283 - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; 5055 + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; 5284 5056 5285 5057 if (eax.split.cntr_subleaf) { 5286 5058 cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, 5287 5059 &cntr, &fixed_cntr, &ecx, &edx); 5288 - pmu->cntr_mask64 = cntr; 5289 - pmu->fixed_cntr_mask64 = fixed_cntr; 5060 + 
hybrid(pmu, cntr_mask64) = cntr; 5061 + hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; 5062 + } 5063 + 5064 + if (eax.split.acr_subleaf) { 5065 + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, 5066 + &cntr, &fixed_cntr, &ecx, &edx); 5067 + /* The mask of the counters which can be reloaded */ 5068 + hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); 5069 + 5070 + /* The mask of the counters which can cause a reload of reloadable counters */ 5071 + hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); 5290 5072 } 5291 5073 5292 5074 if (!intel_pmu_broken_perf_cap()) { 5293 5075 /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ 5294 - rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); 5076 + rdmsrl(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities); 5295 5077 } 5296 5078 } 5297 5079 ··· 5388 5150 goto end; 5389 5151 5390 5152 if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) 5391 - update_pmu_cap(pmu); 5153 + update_pmu_cap(&pmu->pmu); 5392 5154 5393 5155 intel_pmu_check_hybrid_pmus(pmu); 5394 5156 ··· 5762 5524 * these chips. 5763 5525 */ 5764 5526 pr_warn("PEBS disabled due to CPU errata\n"); 5765 - x86_pmu.pebs = 0; 5527 + x86_pmu.ds_pebs = 0; 5766 5528 x86_pmu.pebs_constraints = NULL; 5767 5529 } 5768 5530 ··· 6250 6012 static umode_t 6251 6013 pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) 6252 6014 { 6253 - return x86_pmu.pebs ? attr->mode : 0; 6015 + return x86_pmu.ds_pebs ? attr->mode : 0; 6254 6016 } 6255 6017 6256 6018 static umode_t ··· 6279 6041 return x86_pmu.intel_cap.perf_metrics ? 
attr->mode : 0; 6280 6042 6281 6043 return attr->mode; 6044 + } 6045 + 6046 + PMU_FORMAT_ATTR(acr_mask, "config2:0-63"); 6047 + 6048 + static struct attribute *format_acr_attrs[] = { 6049 + &format_attr_acr_mask.attr, 6050 + NULL 6051 + }; 6052 + 6053 + static umode_t 6054 + acr_is_visible(struct kobject *kobj, struct attribute *attr, int i) 6055 + { 6056 + struct device *dev = kobj_to_dev(kobj); 6057 + 6058 + return intel_pmu_has_acr(dev_get_drvdata(dev)) ? attr->mode : 0; 6282 6059 } 6283 6060 6284 6061 static struct attribute_group group_events_td = { ··· 6338 6085 .is_visible = evtsel_ext_is_visible, 6339 6086 }; 6340 6087 6088 + static struct attribute_group group_format_acr = { 6089 + .name = "format", 6090 + .attrs = format_acr_attrs, 6091 + .is_visible = acr_is_visible, 6092 + }; 6093 + 6341 6094 static struct attribute_group group_default = { 6342 6095 .attrs = intel_pmu_attrs, 6343 6096 .is_visible = default_is_visible, ··· 6358 6099 &group_format_extra, 6359 6100 &group_format_extra_skl, 6360 6101 &group_format_evtsel_ext, 6102 + &group_format_acr, 6361 6103 &group_default, 6362 6104 NULL, 6363 6105 }; ··· 6643 6383 &group_caps_lbr, 6644 6384 &hybrid_group_format_extra, 6645 6385 &group_format_evtsel_ext, 6386 + &group_format_acr, 6646 6387 &group_default, 6647 6388 &hybrid_group_cpus, 6648 6389 NULL, ··· 6836 6575 intel_pmu_init_grt(pmu); 6837 6576 hybrid(pmu, event_constraints) = intel_skt_event_constraints; 6838 6577 hybrid(pmu, extra_regs) = intel_cmt_extra_regs; 6578 + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); 6839 6579 } 6840 6580 6841 6581 __init int intel_pmu_init(void) ··· 6897 6635 6898 6636 x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(x86_pmu.cntr_mask64); 6899 6637 x86_pmu.pebs_capable = PEBS_COUNTER_MASK; 6638 + x86_pmu.config_mask = X86_RAW_EVENT_MASK; 6900 6639 6901 6640 /* 6902 6641 * Quirk: v2 perfmon does not report fixed-purpose events, so ··· 6926 6663 if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) 6927 6664 
intel_pmu_arch_lbr_init(); 6928 6665 6929 - intel_ds_init(); 6666 + intel_pebs_init(); 6930 6667 6931 6668 x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ 6932 6669 ··· 6936 6673 pr_cont(" AnyThread deprecated, "); 6937 6674 } 6938 6675 6676 + /* 6677 + * Many features on and after V6 require dynamic constraint, 6678 + * e.g., Arch PEBS, ACR. 6679 + */ 6680 + if (version >= 6) 6681 + x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; 6939 6682 /* 6940 6683 * Install the hw-cache-events table: 6941 6684 */ ··· 7151 6882 extra_attr = cmt_format_attr; 7152 6883 pr_cont("Crestmont events, "); 7153 6884 name = "crestmont"; 6885 + break; 6886 + 6887 + case INTEL_ATOM_DARKMONT_X: 6888 + intel_pmu_init_skt(NULL); 6889 + intel_pmu_pebs_data_source_cmt(); 6890 + x86_pmu.pebs_latency_data = cmt_latency_data; 6891 + x86_pmu.get_event_constraints = cmt_get_event_constraints; 6892 + td_attr = skt_events_attrs; 6893 + mem_attr = grt_mem_attrs; 6894 + extra_attr = cmt_format_attr; 6895 + pr_cont("Darkmont events, "); 6896 + name = "darkmont"; 7154 6897 break; 7155 6898 7156 6899 case INTEL_WESTMERE: ··· 7713 7432 7714 7433 x86_pmu.attr_update = hybrid_attr_update; 7715 7434 } 7435 + 7436 + /* 7437 + * The archPerfmonExt (0x23) includes an enhanced enumeration of 7438 + * PMU architectural features with a per-core view. For non-hybrid, 7439 + * each core has the same PMU capabilities. It's good enough to 7440 + * update the x86_pmu from the booting CPU. For hybrid, the x86_pmu 7441 + * is used to keep the common capabilities. Still keep the values 7442 + * from the leaf 0xa. The core specific update will be done later 7443 + * when a new type is online. 7444 + */ 7445 + if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) 7446 + update_pmu_cap(NULL); 7716 7447 7717 7448 intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, 7718 7449 &x86_pmu.fixed_cntr_mask64,
+29 -26
arch/x86/events/intel/ds.c
··· 624 624 int max, node = cpu_to_node(cpu); 625 625 void *buffer, *insn_buff, *cea; 626 626 627 - if (!x86_pmu.pebs) 627 + if (!x86_pmu.ds_pebs) 628 628 return 0; 629 629 630 630 buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); ··· 659 659 struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); 660 660 void *cea; 661 661 662 - if (!x86_pmu.pebs) 662 + if (!x86_pmu.ds_pebs) 663 663 return; 664 664 665 665 kfree(per_cpu(insn_buffer, cpu)); ··· 734 734 { 735 735 int cpu; 736 736 737 - if (!x86_pmu.bts && !x86_pmu.pebs) 737 + if (!x86_pmu.bts && !x86_pmu.ds_pebs) 738 738 return; 739 739 740 740 for_each_possible_cpu(cpu) ··· 750 750 } 751 751 752 752 for_each_possible_cpu(cpu) { 753 - release_pebs_buffer(cpu); 753 + if (x86_pmu.ds_pebs) 754 + release_pebs_buffer(cpu); 754 755 release_bts_buffer(cpu); 755 756 } 756 757 } ··· 762 761 int cpu; 763 762 764 763 x86_pmu.bts_active = 0; 765 - x86_pmu.pebs_active = 0; 766 764 767 - if (!x86_pmu.bts && !x86_pmu.pebs) 765 + if (x86_pmu.ds_pebs) 766 + x86_pmu.pebs_active = 0; 767 + 768 + if (!x86_pmu.bts && !x86_pmu.ds_pebs) 768 769 return; 769 770 770 771 if (!x86_pmu.bts) 771 772 bts_err = 1; 772 773 773 - if (!x86_pmu.pebs) 774 + if (!x86_pmu.ds_pebs) 774 775 pebs_err = 1; 775 776 776 777 for_each_possible_cpu(cpu) { ··· 784 781 if (!bts_err && alloc_bts_buffer(cpu)) 785 782 bts_err = 1; 786 783 787 - if (!pebs_err && alloc_pebs_buffer(cpu)) 784 + if (x86_pmu.ds_pebs && !pebs_err && 785 + alloc_pebs_buffer(cpu)) 788 786 pebs_err = 1; 789 787 790 788 if (bts_err && pebs_err) ··· 797 793 release_bts_buffer(cpu); 798 794 } 799 795 800 - if (pebs_err) { 796 + if (x86_pmu.ds_pebs && pebs_err) { 801 797 for_each_possible_cpu(cpu) 802 798 release_pebs_buffer(cpu); 803 799 } ··· 809 805 if (x86_pmu.bts && !bts_err) 810 806 x86_pmu.bts_active = 1; 811 807 812 - if (x86_pmu.pebs && !pebs_err) 808 + if (x86_pmu.ds_pebs && !pebs_err) 813 809 x86_pmu.pebs_active = 1; 814 810 815 811 for_each_possible_cpu(cpu) { ··· 1359 1355 } 1360 
1356 1361 1357 1362 - static void intel_pmu_late_setup(void) 1358 + void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc) 1363 1359 { 1364 - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 1365 1360 struct perf_event *event; 1366 1361 u64 pebs_data_cfg = 0; 1367 1362 int i; ··· 1831 1828 1832 1829 perf_sample_data_init(data, 0, event->hw.last_period); 1833 1830 1834 - data->period = event->hw.last_period; 1835 - 1836 1831 /* 1837 1832 * Use latency for weight (only avail with PEBS-LL) 1838 1833 */ ··· 2083 2082 sample_type = event->attr.sample_type; 2084 2083 format_group = basic->format_group; 2085 2084 perf_sample_data_init(data, 0, event->hw.last_period); 2086 - data->period = event->hw.last_period; 2087 2085 2088 2086 setup_pebs_time(event, data, basic->tsc); 2089 2087 ··· 2359 2359 * All but the last records are processed. 2360 2360 * The last one is left to be able to call the overflow handler. 2361 2361 */ 2362 - if (perf_event_overflow(event, data, regs)) 2363 - x86_pmu_stop(event, 0); 2362 + perf_event_overflow(event, data, regs); 2364 2363 } 2365 2364 2366 2365 if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { ··· 2588 2589 if (error[bit]) { 2589 2590 perf_log_lost_samples(event, error[bit]); 2590 2591 2591 - if (iregs && perf_event_account_interrupt(event)) 2592 - x86_pmu_stop(event, 0); 2592 + if (iregs) 2593 + perf_event_account_interrupt(event); 2593 2594 } 2594 2595 2595 2596 if (counts[bit]) { ··· 2669 2670 } 2670 2671 2671 2672 /* 2672 - * BTS, PEBS probe and setup 2673 + * PEBS probe and setup 2673 2674 */ 2674 2675 2675 - void __init intel_ds_init(void) 2676 + void __init intel_pebs_init(void) 2676 2677 { 2677 2678 /* 2678 2679 * No support for 32bit formats ··· 2680 2681 if (!boot_cpu_has(X86_FEATURE_DTES64)) 2681 2682 return; 2682 2683 2683 - x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); 2684 - x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); 2684 + x86_pmu.ds_pebs = boot_cpu_has(X86_FEATURE_PEBS); 2685 2685 x86_pmu.pebs_buffer_size = 
PEBS_BUFFER_SIZE; 2686 2686 if (x86_pmu.version <= 4) 2687 2687 x86_pmu.pebs_no_isolation = 1; 2688 2688 2689 - if (x86_pmu.pebs) { 2689 + if (x86_pmu.ds_pebs) { 2690 2690 char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; 2691 2691 char *pebs_qual = ""; 2692 2692 int format = x86_pmu.intel_cap.pebs_format; 2693 2693 2694 2694 if (format < 4) 2695 2695 x86_pmu.intel_cap.pebs_baseline = 0; 2696 + 2697 + x86_pmu.pebs_enable = intel_pmu_pebs_enable; 2698 + x86_pmu.pebs_disable = intel_pmu_pebs_disable; 2699 + x86_pmu.pebs_enable_all = intel_pmu_pebs_enable_all; 2700 + x86_pmu.pebs_disable_all = intel_pmu_pebs_disable_all; 2696 2701 2697 2702 switch (format) { 2698 2703 case 0: ··· 2782 2779 2783 2780 default: 2784 2781 pr_cont("no PEBS fmt%d%c, ", format, pebs_type); 2785 - x86_pmu.pebs = 0; 2782 + x86_pmu.ds_pebs = 0; 2786 2783 } 2787 2784 } 2788 2785 } ··· 2791 2788 { 2792 2789 struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); 2793 2790 2794 - if (!x86_pmu.bts && !x86_pmu.pebs) 2791 + if (!x86_pmu.bts && !x86_pmu.ds_pebs) 2795 2792 return; 2796 2793 2797 2794 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
+4 -3
arch/x86/events/intel/knc.c
··· 241 241 242 242 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 243 243 struct perf_event *event = cpuc->events[bit]; 244 + u64 last_period; 244 245 245 246 handled++; 246 247 247 248 if (!test_bit(bit, cpuc->active_mask)) 248 249 continue; 249 250 251 + last_period = event->hw.last_period; 250 252 if (!intel_pmu_save_and_restart(event)) 251 253 continue; 252 254 253 - perf_sample_data_init(&data, 0, event->hw.last_period); 255 + perf_sample_data_init(&data, 0, last_period); 254 256 255 - if (perf_event_overflow(event, &data, regs)) 256 - x86_pmu_stop(event, 0); 257 + perf_event_overflow(event, &data, regs); 257 258 } 258 259 259 260 /*
+1 -1
arch/x86/events/intel/lbr.c
··· 1618 1618 x86_pmu.lbr_nr = lbr_nr; 1619 1619 1620 1620 if (!!x86_pmu.lbr_counters) 1621 - x86_pmu.flags |= PMU_FL_BR_CNTR; 1621 + x86_pmu.flags |= PMU_FL_BR_CNTR | PMU_FL_DYN_CONSTRAINT; 1622 1622 1623 1623 if (x86_pmu.lbr_mispred) 1624 1624 static_branch_enable(&x86_lbr_mispred);
+1 -2
arch/x86/events/intel/p4.c
··· 1072 1072 continue; 1073 1073 1074 1074 1075 - if (perf_event_overflow(event, &data, regs)) 1076 - x86_pmu_stop(event, 0); 1075 + perf_event_overflow(event, &data, regs); 1077 1076 } 1078 1077 1079 1078 if (handled)
+2
arch/x86/events/intel/pt.c
··· 1863 1863 1864 1864 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) 1865 1865 pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG; 1866 + else 1867 + pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_PREFER_LARGE; 1866 1868 1867 1869 pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | 1868 1870 PERF_PMU_CAP_ITRACE |
+2 -10
arch/x86/events/intel/uncore.c
··· 305 305 { 306 306 struct intel_uncore_box *box; 307 307 struct perf_event *event; 308 - unsigned long flags; 309 308 int bit; 310 309 311 310 box = container_of(hrtimer, struct intel_uncore_box, hrtimer); 312 311 if (!box->n_active || box->cpu != smp_processor_id()) 313 312 return HRTIMER_NORESTART; 314 - /* 315 - * disable local interrupt to prevent uncore_pmu_event_start/stop 316 - * to interrupt the update process 317 - */ 318 - local_irq_save(flags); 319 313 320 314 /* 321 315 * handle boxes with an active event list as opposed to active ··· 322 328 for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) 323 329 uncore_perf_event_update(box, box->events[bit]); 324 330 325 - local_irq_restore(flags); 326 - 327 331 hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); 328 332 return HRTIMER_RESTART; 329 333 } ··· 329 337 void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) 330 338 { 331 339 hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), 332 - HRTIMER_MODE_REL_PINNED); 340 + HRTIMER_MODE_REL_PINNED_HARD); 333 341 } 334 342 335 343 void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) ··· 339 347 340 348 static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) 341 349 { 342 - hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 350 + hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); 343 351 } 344 352 345 353 static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
+43 -2
arch/x86/events/perf_event.h
··· 127 127 return check_leader_group(event->group_leader, PERF_X86_EVENT_PEBS_CNTR); 128 128 } 129 129 130 + static inline bool is_acr_event_group(struct perf_event *event) 131 + { 132 + return check_leader_group(event->group_leader, PERF_X86_EVENT_ACR); 133 + } 134 + 130 135 struct amd_nb { 131 136 int nb_id; /* NorthBridge id */ 132 137 int refcnt; /* reference count */ ··· 273 268 struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; 274 269 275 270 int n_excl; /* the number of exclusive events */ 271 + int n_late_setup; /* the num of events needs late setup */ 276 272 277 273 unsigned int txn_flags; 278 274 int is_fake; ··· 298 292 /* Intel Fixed counter configuration */ 299 293 u64 fixed_ctrl_val; 300 294 u64 active_fixed_ctrl_val; 295 + 296 + /* Intel ACR configuration */ 297 + u64 acr_cfg_b[X86_PMC_IDX_MAX]; 298 + u64 acr_cfg_c[X86_PMC_IDX_MAX]; 301 299 302 300 /* 303 301 * Intel LBR bits ··· 724 714 u64 fixed_cntr_mask64; 725 715 unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 726 716 }; 717 + 718 + union { 719 + u64 acr_cntr_mask64; 720 + unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 721 + }; 722 + union { 723 + u64 acr_cause_mask64; 724 + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 725 + }; 727 726 struct event_constraint unconstrained; 728 727 729 728 u64 hw_cache_event_ids ··· 815 796 int (*hw_config)(struct perf_event *event); 816 797 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); 817 798 void (*late_setup)(void); 799 + void (*pebs_enable)(struct perf_event *event); 800 + void (*pebs_disable)(struct perf_event *event); 801 + void (*pebs_enable_all)(void); 802 + void (*pebs_disable_all)(void); 818 803 unsigned eventsel; 819 804 unsigned perfctr; 820 805 unsigned fixedctr; ··· 834 811 union { 835 812 u64 fixed_cntr_mask64; 836 813 unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 814 + }; 815 + union { 816 + u64 acr_cntr_mask64; 817 + unsigned long 
acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 818 + }; 819 + union { 820 + u64 acr_cause_mask64; 821 + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 837 822 }; 838 823 int cntval_bits; 839 824 u64 cntval_mask; ··· 909 878 */ 910 879 unsigned int bts :1, 911 880 bts_active :1, 912 - pebs :1, 881 + ds_pebs :1, 913 882 pebs_active :1, 914 883 pebs_broken :1, 915 884 pebs_prec_dist :1, ··· 1080 1049 #define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ 1081 1050 #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ 1082 1051 #define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ 1052 + #define PMU_FL_DYN_CONSTRAINT 0x800 /* Needs dynamic constraint */ 1083 1053 1084 1054 #define EVENT_VAR(_id) event_attr_##_id 1085 1055 #define EVENT_PTR(_id) &event_attr_##_id.attr.attr ··· 1123 1091 .pmu_type = _pmu, \ 1124 1092 } 1125 1093 1094 + int is_x86_event(struct perf_event *event); 1126 1095 struct pmu *x86_get_pmu(unsigned int cpu); 1127 1096 extern struct x86_pmu x86_pmu __read_mostly; 1128 1097 ··· 1131 1098 DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); 1132 1099 DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); 1133 1100 DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup); 1101 + DECLARE_STATIC_CALL(x86_pmu_pebs_enable, *x86_pmu.pebs_enable); 1102 + DECLARE_STATIC_CALL(x86_pmu_pebs_disable, *x86_pmu.pebs_disable); 1103 + DECLARE_STATIC_CALL(x86_pmu_pebs_enable_all, *x86_pmu.pebs_enable_all); 1104 + DECLARE_STATIC_CALL(x86_pmu_pebs_disable_all, *x86_pmu.pebs_disable_all); 1134 1105 1135 1106 static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) 1136 1107 { ··· 1624 1587 1625 1588 int intel_pmu_drain_bts_buffer(void); 1626 1589 1590 + void intel_pmu_late_setup(void); 1591 + 1627 1592 u64 grt_latency_data(struct perf_event *event, u64 status); 1628 1593 1629 1594 u64 cmt_latency_data(struct perf_event *event, u64 status); ··· 
1682 1643 1683 1644 void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); 1684 1645 1646 + void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc); 1647 + 1685 1648 void intel_pmu_drain_pebs_buffer(void); 1686 1649 1687 1650 void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); 1688 1651 1689 - void intel_ds_init(void); 1652 + void intel_pebs_init(void); 1690 1653 1691 1654 void intel_pmu_lbr_save_brstack(struct perf_sample_data *data, 1692 1655 struct cpu_hw_events *cpuc,
+21 -20
arch/x86/events/perf_event_flags.h
··· 2 2 /* 3 3 * struct hw_perf_event.flags flags 4 4 */ 5 - PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ 6 - PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ 7 - PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ 8 - PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ 9 - PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ 10 - PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ 11 - PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ 12 - PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ 13 - PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ 14 - PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ 15 - PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ 16 - PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ 17 - PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ 18 - PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ 19 - PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ 20 - PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ 21 - PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ 22 - PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ 23 - PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ 24 - PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */ 5 + PERF_ARCH(PEBS_LDLAT, 0x0000001) /* ld+ldlat data address sampling */ 6 + PERF_ARCH(PEBS_ST, 0x0000002) /* st data address sampling */ 7 + PERF_ARCH(PEBS_ST_HSW, 0x0000004) /* haswell style datala, store */ 8 + PERF_ARCH(PEBS_LD_HSW, 0x0000008) /* haswell style datala, load */ 9 + PERF_ARCH(PEBS_NA_HSW, 0x0000010) /* haswell style datala, unknown */ 10 + PERF_ARCH(EXCL, 0x0000020) /* HT exclusivity on counter */ 11 + PERF_ARCH(DYNAMIC, 0x0000040) /* dynamic alloc'd constraint */ 12 + PERF_ARCH(PEBS_CNTR, 0x0000080) /* PEBS counters snapshot */ 13 + 
PERF_ARCH(EXCL_ACCT, 0x0000100) /* accounted EXCL event */ 14 + PERF_ARCH(AUTO_RELOAD, 0x0000200) /* use PEBS auto-reload */ 15 + PERF_ARCH(LARGE_PEBS, 0x0000400) /* use large PEBS */ 16 + PERF_ARCH(PEBS_VIA_PT, 0x0000800) /* use PT buffer for PEBS */ 17 + PERF_ARCH(PAIR, 0x0001000) /* Large Increment per Cycle */ 18 + PERF_ARCH(LBR_SELECT, 0x0002000) /* Save/Restore MSR_LBR_SELECT */ 19 + PERF_ARCH(TOPDOWN, 0x0004000) /* Count Topdown slots/metrics events */ 20 + PERF_ARCH(PEBS_STLAT, 0x0008000) /* st+stlat data address sampling */ 21 + PERF_ARCH(AMD_BRS, 0x0010000) /* AMD Branch Sampling */ 22 + PERF_ARCH(PEBS_LAT_HYBRID, 0x0020000) /* ld and st lat for hybrid */ 23 + PERF_ARCH(NEEDS_BRANCH_STACK, 0x0040000) /* require branch stack setup */ 24 + PERF_ARCH(BRANCH_COUNTERS, 0x0080000) /* logs the counters in the extra space of each branch */ 25 + PERF_ARCH(ACR, 0x0100000) /* Auto counter reload */
+1 -2
arch/x86/events/zhaoxin/core.c
··· 397 397 if (!x86_perf_event_set_period(event)) 398 398 continue; 399 399 400 - if (perf_event_overflow(event, &data, regs)) 401 - x86_pmu_stop(event, 0); 400 + perf_event_overflow(event, &data, regs); 402 401 } 403 402 404 403 /*
+4
arch/x86/include/asm/msr-index.h
··· 602 602 /* V6 PMON MSR range */ 603 603 #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 604 604 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 605 + #define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 606 + #define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 605 607 #define MSR_IA32_PMC_V6_FX0_CTR 0x1980 608 + #define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 609 + #define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 606 610 #define MSR_IA32_PMC_V6_STEP 4 607 611 608 612 /* KeyID partitioning between MKTME and TDX */
+1
arch/x86/include/asm/perf_event.h
··· 195 195 */ 196 196 #define ARCH_PERFMON_EXT_LEAF 0x00000023 197 197 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 198 + #define ARCH_PERFMON_ACR_LEAF 0x2 198 199 199 200 union cpuid35_eax { 200 201 struct {
+5
arch/x86/kernel/uprobes.c
··· 840 840 insn_byte_t p; 841 841 int i; 842 842 843 + /* x86_nops[insn->length]; same as jmp with .offs = 0 */ 844 + if (insn->length <= ASM_NOP_MAX && 845 + !memcmp(insn->kaddr, x86_nops[insn->length], insn->length)) 846 + goto setup; 847 + 843 848 switch (opc1) { 844 849 case 0xeb: /* jmp 8 */ 845 850 case 0xe9: /* jmp 32 */
+1 -2
arch/xtensa/kernel/perf_event.c
··· 388 388 struct pt_regs *regs = get_irq_regs(); 389 389 390 390 perf_sample_data_init(&data, 0, last_period); 391 - if (perf_event_overflow(event, &data, regs)) 392 - xtensa_pmu_stop(event, 0); 391 + perf_event_overflow(event, &data, regs); 393 392 } 394 393 395 394 rc = IRQ_HANDLED;
+1 -2
drivers/perf/apple_m1_cpu_pmu.c
··· 474 474 if (!armpmu_event_set_period(event)) 475 475 continue; 476 476 477 - if (perf_event_overflow(event, &data, regs)) 478 - m1_pmu_disable_event(event); 477 + perf_event_overflow(event, &data, regs); 479 478 } 480 479 481 480 cpu_pmu->start(cpu_pmu);
+1 -2
drivers/perf/arm_pmuv3.c
··· 887 887 * an irq_work which will be taken care of in the handling of 888 888 * IPI_IRQ_WORK. 889 889 */ 890 - if (perf_event_overflow(event, &data, regs)) 891 - cpu_pmu->disable(event); 890 + perf_event_overflow(event, &data, regs); 892 891 } 893 892 armv8pmu_start(cpu_pmu); 894 893
+1 -2
drivers/perf/arm_v6_pmu.c
··· 276 276 if (!armpmu_event_set_period(event)) 277 277 continue; 278 278 279 - if (perf_event_overflow(event, &data, regs)) 280 - cpu_pmu->disable(event); 279 + perf_event_overflow(event, &data, regs); 281 280 } 282 281 283 282 /*
+1 -2
drivers/perf/arm_v7_pmu.c
··· 930 930 if (!armpmu_event_set_period(event)) 931 931 continue; 932 932 933 - if (perf_event_overflow(event, &data, regs)) 934 - cpu_pmu->disable(event); 933 + perf_event_overflow(event, &data, regs); 935 934 } 936 935 937 936 /*
+2 -4
drivers/perf/arm_xscale_pmu.c
··· 186 186 if (!armpmu_event_set_period(event)) 187 187 continue; 188 188 189 - if (perf_event_overflow(event, &data, regs)) 190 - cpu_pmu->disable(event); 189 + perf_event_overflow(event, &data, regs); 191 190 } 192 191 193 192 irq_work_run(); ··· 518 519 if (!armpmu_event_set_period(event)) 519 520 continue; 520 521 521 - if (perf_event_overflow(event, &data, regs)) 522 - cpu_pmu->disable(event); 522 + perf_event_overflow(event, &data, regs); 523 523 } 524 524 525 525 irq_work_run();
-1
include/linux/cpuhotplug.h
··· 60 60 /* PREPARE section invoked on a control CPU */ 61 61 CPUHP_OFFLINE = 0, 62 62 CPUHP_CREATE_THREADS, 63 - CPUHP_PERF_PREPARE, 64 63 CPUHP_PERF_X86_PREPARE, 65 64 CPUHP_PERF_X86_AMD_UNCORE_PREP, 66 65 CPUHP_PERF_POWER,
+167 -131
include/linux/perf_event.h
··· 26 26 # include <asm/local64.h> 27 27 #endif 28 28 29 - #define PERF_GUEST_ACTIVE 0x01 30 - #define PERF_GUEST_USER 0x02 31 - 32 - struct perf_guest_info_callbacks { 33 - unsigned int (*state)(void); 34 - unsigned long (*get_ip)(void); 35 - unsigned int (*handle_intel_pt_intr)(void); 36 - }; 37 - 38 29 #ifdef CONFIG_HAVE_HW_BREAKPOINT 39 - #include <linux/rhashtable-types.h> 40 - #include <asm/hw_breakpoint.h> 30 + # include <linux/rhashtable-types.h> 31 + # include <asm/hw_breakpoint.h> 41 32 #endif 42 33 43 34 #include <linux/list.h> ··· 53 62 #include <linux/security.h> 54 63 #include <linux/static_call.h> 55 64 #include <linux/lockdep.h> 65 + 56 66 #include <asm/local.h> 57 67 58 68 struct perf_callchain_entry { 59 - __u64 nr; 60 - __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ 69 + u64 nr; 70 + u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ 61 71 }; 62 72 63 73 struct perf_callchain_entry_ctx { 64 - struct perf_callchain_entry *entry; 65 - u32 max_stack; 66 - u32 nr; 67 - short contexts; 68 - bool contexts_maxed; 74 + struct perf_callchain_entry *entry; 75 + u32 max_stack; 76 + u32 nr; 77 + short contexts; 78 + bool contexts_maxed; 69 79 }; 70 80 71 81 typedef unsigned long (*perf_copy_f)(void *dst, const void *src, ··· 113 121 * already stored in age order, the hw_idx should be 0. 
114 122 */ 115 123 struct perf_branch_stack { 116 - __u64 nr; 117 - __u64 hw_idx; 124 + u64 nr; 125 + u64 hw_idx; 118 126 struct perf_branch_entry entries[]; 119 127 }; 120 128 ··· 124 132 * extra PMU register associated with an event 125 133 */ 126 134 struct hw_perf_event_extra { 127 - u64 config; /* register value */ 128 - unsigned int reg; /* register address or index */ 129 - int alloc; /* extra register already allocated */ 130 - int idx; /* index in shared_regs->regs[] */ 135 + u64 config; /* register value */ 136 + unsigned int reg; /* register address or index */ 137 + int alloc; /* extra register already allocated */ 138 + int idx; /* index in shared_regs->regs[] */ 131 139 }; 132 140 133 141 /** ··· 136 144 * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific 137 145 * usage. 138 146 */ 139 - #define PERF_EVENT_FLAG_ARCH 0x000fffff 140 - #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 147 + #define PERF_EVENT_FLAG_ARCH 0x0fffffff 148 + #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 141 149 142 150 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); 143 151 ··· 149 157 union { 150 158 struct { /* hardware */ 151 159 u64 config; 160 + u64 config1; 152 161 u64 last_tag; 162 + u64 dyn_constraint; 153 163 unsigned long config_base; 154 164 unsigned long event_base; 155 165 int event_base_rdpmc; ··· 219 225 /* 220 226 * hw_perf_event::state flags; used to track the PERF_EF_* state. 
221 227 */ 222 - #define PERF_HES_STOPPED 0x01 /* the counter is stopped */ 223 - #define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ 224 - #define PERF_HES_ARCH 0x04 228 + 229 + /* the counter is stopped */ 230 + #define PERF_HES_STOPPED 0x01 231 + 232 + /* event->count up-to-date */ 233 + #define PERF_HES_UPTODATE 0x02 234 + 235 + #define PERF_HES_ARCH 0x04 225 236 226 237 int state; 227 238 ··· 275 276 */ 276 277 u64 freq_time_stamp; 277 278 u64 freq_count_stamp; 278 - #endif 279 + #endif /* CONFIG_PERF_EVENTS */ 279 280 }; 280 281 281 282 struct perf_event; ··· 284 285 /* 285 286 * Common implementation detail of pmu::{start,commit,cancel}_txn 286 287 */ 287 - #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ 288 - #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ 288 + 289 + /* txn to add/schedule event on PMU */ 290 + #define PERF_PMU_TXN_ADD 0x1 291 + 292 + /* txn to read event group from PMU */ 293 + #define PERF_PMU_TXN_READ 0x2 289 294 290 295 /** 291 296 * pmu::capabilities flags 292 297 */ 293 - #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 294 - #define PERF_PMU_CAP_NO_NMI 0x0002 295 - #define PERF_PMU_CAP_AUX_NO_SG 0x0004 296 - #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 297 - #define PERF_PMU_CAP_EXCLUSIVE 0x0010 298 - #define PERF_PMU_CAP_ITRACE 0x0020 299 - #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 300 - #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 301 - #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 302 - #define PERF_PMU_CAP_AUX_PAUSE 0x0200 298 + #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 299 + #define PERF_PMU_CAP_NO_NMI 0x0002 300 + #define PERF_PMU_CAP_AUX_NO_SG 0x0004 301 + #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 302 + #define PERF_PMU_CAP_EXCLUSIVE 0x0010 303 + #define PERF_PMU_CAP_ITRACE 0x0020 304 + #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 305 + #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 306 + #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 307 + #define PERF_PMU_CAP_AUX_PAUSE 0x0200 308 + #define 
PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 303 309 304 310 /** 305 311 * pmu::scope 306 312 */ 307 313 enum perf_pmu_scope { 308 - PERF_PMU_SCOPE_NONE = 0, 314 + PERF_PMU_SCOPE_NONE = 0, 309 315 PERF_PMU_SCOPE_CORE, 310 316 PERF_PMU_SCOPE_DIE, 311 317 PERF_PMU_SCOPE_CLUSTER, ··· 328 324 */ 329 325 struct pmu { 330 326 struct list_head entry; 327 + 328 + spinlock_t events_lock; 329 + struct list_head events; 331 330 332 331 struct module *module; 333 332 struct device *dev; ··· 394 387 * Flags for ->add()/->del()/ ->start()/->stop(). There are 395 388 * matching hw_perf_event::state flags. 396 389 */ 397 - #define PERF_EF_START 0x01 /* start the counter when adding */ 398 - #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ 399 - #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ 400 - #define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ 401 - #define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ 390 + 391 + /* start the counter when adding */ 392 + #define PERF_EF_START 0x01 393 + 394 + /* reload the counter when starting */ 395 + #define PERF_EF_RELOAD 0x02 396 + 397 + /* update the counter when stopping */ 398 + #define PERF_EF_UPDATE 0x04 399 + 400 + /* AUX area event, pause tracing */ 401 + #define PERF_EF_PAUSE 0x08 402 + 403 + /* AUX area event, resume tracing */ 404 + #define PERF_EF_RESUME 0x10 402 405 403 406 /* 404 407 * Adds/Removes a counter to/from the PMU, can be done inside a ··· 607 590 * This is a hardware-agnostic filter configuration as specified by the user. 608 591 */ 609 592 struct perf_addr_filter { 610 - struct list_head entry; 611 - struct path path; 612 - unsigned long offset; 613 - unsigned long size; 593 + struct list_head entry; 594 + struct path path; 595 + unsigned long offset; 596 + unsigned long size; 614 597 enum perf_addr_filter_action_t action; 615 598 }; 616 599 ··· 625 608 * bundled together; see perf_event_addr_filters(). 
626 609 */ 627 610 struct perf_addr_filters_head { 628 - struct list_head list; 629 - raw_spinlock_t lock; 630 - unsigned int nr_file_filters; 611 + struct list_head list; 612 + raw_spinlock_t lock; 613 + unsigned int nr_file_filters; 631 614 }; 632 615 633 616 struct perf_addr_filter_range { 634 - unsigned long start; 635 - unsigned long size; 617 + unsigned long start; 618 + unsigned long size; 636 619 }; 637 620 638 621 /** 639 622 * enum perf_event_state - the states of an event: 640 623 */ 641 624 enum perf_event_state { 642 - PERF_EVENT_STATE_DEAD = -4, 643 - PERF_EVENT_STATE_EXIT = -3, 644 - PERF_EVENT_STATE_ERROR = -2, 625 + PERF_EVENT_STATE_DEAD = -5, 626 + PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */ 627 + PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */ 628 + PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */ 645 629 PERF_EVENT_STATE_OFF = -1, 646 630 PERF_EVENT_STATE_INACTIVE = 0, 647 631 PERF_EVENT_STATE_ACTIVE = 1, ··· 680 662 struct rcu_head rcu_head; 681 663 }; 682 664 683 - #define PERF_ATTACH_CONTEXT 0x0001 684 - #define PERF_ATTACH_GROUP 0x0002 685 - #define PERF_ATTACH_TASK 0x0004 686 - #define PERF_ATTACH_TASK_DATA 0x0008 687 - #define PERF_ATTACH_GLOBAL_DATA 0x0010 688 - #define PERF_ATTACH_SCHED_CB 0x0020 689 - #define PERF_ATTACH_CHILD 0x0040 690 - #define PERF_ATTACH_EXCLUSIVE 0x0080 691 - #define PERF_ATTACH_CALLCHAIN 0x0100 692 - #define PERF_ATTACH_ITRACE 0x0200 665 + #define PERF_ATTACH_CONTEXT 0x0001 666 + #define PERF_ATTACH_GROUP 0x0002 667 + #define PERF_ATTACH_TASK 0x0004 668 + #define PERF_ATTACH_TASK_DATA 0x0008 669 + #define PERF_ATTACH_GLOBAL_DATA 0x0010 670 + #define PERF_ATTACH_SCHED_CB 0x0020 671 + #define PERF_ATTACH_CHILD 0x0040 672 + #define PERF_ATTACH_EXCLUSIVE 0x0080 673 + #define PERF_ATTACH_CALLCHAIN 0x0100 674 + #define PERF_ATTACH_ITRACE 0x0200 693 675 694 676 struct bpf_prog; 695 677 struct perf_cgroup; 696 678 struct perf_buffer; 697 679 698 680 struct pmu_event_list { 699 
- raw_spinlock_t lock; 700 - struct list_head list; 681 + raw_spinlock_t lock; 682 + struct list_head list; 701 683 }; 702 684 703 685 /* ··· 707 689 * disabled is sufficient since it will hold-off the IPIs. 708 690 */ 709 691 #ifdef CONFIG_PROVE_LOCKING 710 - #define lockdep_assert_event_ctx(event) \ 692 + # define lockdep_assert_event_ctx(event) \ 711 693 WARN_ON_ONCE(__lockdep_enabled && \ 712 694 (this_cpu_read(hardirqs_enabled) && \ 713 695 lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) 714 696 #else 715 - #define lockdep_assert_event_ctx(event) 697 + # define lockdep_assert_event_ctx(event) 716 698 #endif 717 699 718 700 #define for_each_sibling_event(sibling, event) \ ··· 870 852 #ifdef CONFIG_EVENT_TRACING 871 853 struct trace_event_call *tp_event; 872 854 struct event_filter *filter; 873 - #ifdef CONFIG_FUNCTION_TRACER 855 + # ifdef CONFIG_FUNCTION_TRACER 874 856 struct ftrace_ops ftrace_ops; 875 - #endif 857 + # endif 876 858 #endif 877 859 878 860 #ifdef CONFIG_CGROUP_PERF ··· 883 865 void *security; 884 866 #endif 885 867 struct list_head sb_list; 868 + struct list_head pmu_list; 886 869 887 870 /* 888 871 * Certain events gets forwarded to another pmu internally by over- ··· 891 872 * of it. event->orig_type contains original 'type' requested by 892 873 * user. 
893 874 */ 894 - __u32 orig_type; 875 + u32 orig_type; 895 876 #endif /* CONFIG_PERF_EVENTS */ 896 877 }; 897 878 ··· 956 937 } 957 938 958 939 struct perf_event_groups { 959 - struct rb_root tree; 960 - u64 index; 940 + struct rb_root tree; 941 + u64 index; 961 942 }; 962 943 963 944 ··· 1174 1155 extern void perf_event_itrace_started(struct perf_event *event); 1175 1156 1176 1157 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); 1177 - extern void perf_pmu_unregister(struct pmu *pmu); 1158 + extern int perf_pmu_unregister(struct pmu *pmu); 1178 1159 1179 1160 extern void __perf_event_task_sched_in(struct task_struct *prev, 1180 1161 struct task_struct *task); ··· 1200 1181 extern int perf_event_refresh(struct perf_event *event, int refresh); 1201 1182 extern void perf_event_update_userpage(struct perf_event *event); 1202 1183 extern int perf_event_release_kernel(struct perf_event *event); 1184 + 1203 1185 extern struct perf_event * 1204 1186 perf_event_create_kernel_counter(struct perf_event_attr *attr, 1205 - int cpu, 1206 - struct task_struct *task, 1207 - perf_overflow_handler_t callback, 1208 - void *context); 1187 + int cpu, 1188 + struct task_struct *task, 1189 + perf_overflow_handler_t callback, 1190 + void *context); 1191 + 1209 1192 extern void perf_pmu_migrate_context(struct pmu *pmu, 1210 - int src_cpu, int dst_cpu); 1211 - int perf_event_read_local(struct perf_event *event, u64 *value, 1212 - u64 *enabled, u64 *running); 1193 + int src_cpu, int dst_cpu); 1194 + extern int perf_event_read_local(struct perf_event *event, u64 *value, 1195 + u64 *enabled, u64 *running); 1213 1196 extern u64 perf_event_read_value(struct perf_event *event, 1214 1197 u64 *enabled, u64 *running); 1215 1198 ··· 1428 1407 */ 1429 1408 static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) 1430 1409 { 1431 - br->mispred = 0; 1432 - br->predicted = 0; 1433 - br->in_tx = 0; 1434 - br->abort = 0; 1435 - br->cycles = 0; 1436 - 
br->type = 0; 1437 - br->spec = PERF_BR_SPEC_NA; 1438 - br->reserved = 0; 1410 + br->mispred = 0; 1411 + br->predicted = 0; 1412 + br->in_tx = 0; 1413 + br->abort = 0; 1414 + br->cycles = 0; 1415 + br->type = 0; 1416 + br->spec = PERF_BR_SPEC_NA; 1417 + br->reserved = 0; 1439 1418 } 1440 1419 1441 1420 extern void perf_output_sample(struct perf_output_handle *handle, ··· 1624 1603 enum perf_bpf_event_type type, 1625 1604 u16 flags); 1626 1605 1606 + #define PERF_GUEST_ACTIVE 0x01 1607 + #define PERF_GUEST_USER 0x02 1608 + 1609 + struct perf_guest_info_callbacks { 1610 + unsigned int (*state)(void); 1611 + unsigned long (*get_ip)(void); 1612 + unsigned int (*handle_intel_pt_intr)(void); 1613 + }; 1614 + 1627 1615 #ifdef CONFIG_GUEST_PERF_EVENTS 1616 + 1628 1617 extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; 1629 1618 1630 1619 DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); ··· 1645 1614 { 1646 1615 return static_call(__perf_guest_state)(); 1647 1616 } 1617 + 1648 1618 static inline unsigned long perf_guest_get_ip(void) 1649 1619 { 1650 1620 return static_call(__perf_guest_get_ip)(); 1651 1621 } 1622 + 1652 1623 static inline unsigned int perf_guest_handle_intel_pt_intr(void) 1653 1624 { 1654 1625 return static_call(__perf_guest_handle_intel_pt_intr)(); 1655 1626 } 1627 + 1656 1628 extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); 1657 1629 extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); 1658 - #else 1630 + 1631 + #else /* !CONFIG_GUEST_PERF_EVENTS: */ 1632 + 1659 1633 static inline unsigned int perf_guest_state(void) { return 0; } 1660 1634 static inline unsigned long perf_guest_get_ip(void) { return 0; } 1661 1635 static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } 1662 - #endif /* CONFIG_GUEST_PERF_EVENTS */ 1636 + 1637 + #endif /* !CONFIG_GUEST_PERF_EVENTS */ 1663 1638 1664 1639 extern void perf_event_exec(void); 1665 1640 
extern void perf_event_comm(struct task_struct *tsk, bool exec); ··· 1695 1658 { 1696 1659 if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { 1697 1660 struct perf_callchain_entry *entry = ctx->entry; 1661 + 1698 1662 entry->ip[entry->nr++] = ip; 1699 1663 ++ctx->contexts; 1700 1664 return 0; ··· 1709 1671 { 1710 1672 if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { 1711 1673 struct perf_callchain_entry *entry = ctx->entry; 1674 + 1712 1675 entry->ip[entry->nr++] = ip; 1713 1676 ++ctx->nr; 1714 1677 return 0; ··· 1736 1697 return sysctl_perf_event_paranoid > -1; 1737 1698 } 1738 1699 1739 - int perf_allow_kernel(void); 1700 + extern int perf_allow_kernel(void); 1740 1701 1741 1702 static inline int perf_allow_cpu(void) 1742 1703 { ··· 1799 1760 1800 1761 static inline bool has_aux(struct perf_event *event) 1801 1762 { 1802 - return event->pmu->setup_aux; 1763 + return event->pmu && event->pmu->setup_aux; 1803 1764 } 1804 1765 1805 1766 static inline bool has_aux_action(struct perf_event *event) ··· 1858 1819 1859 1820 extern void perf_output_end(struct perf_output_handle *handle); 1860 1821 extern unsigned int perf_output_copy(struct perf_output_handle *handle, 1861 - const void *buf, unsigned int len); 1822 + const void *buf, unsigned int len); 1862 1823 extern unsigned int perf_output_skip(struct perf_output_handle *handle, 1863 1824 unsigned int len); 1864 1825 extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, ··· 1875 1836 extern int perf_event_account_interrupt(struct perf_event *event); 1876 1837 extern int perf_event_period(struct perf_event *event, u64 value); 1877 1838 extern u64 perf_event_pause(struct perf_event *event, bool reset); 1839 + 1878 1840 #else /* !CONFIG_PERF_EVENTS: */ 1841 + 1879 1842 static inline void * 1880 1843 perf_aux_output_begin(struct perf_output_handle *handle, 1881 1844 struct perf_event *event) { return NULL; } ··· 1955 1914 static inline int __perf_event_disable(void *info) { return -1; 
} 1956 1915 static inline void perf_event_task_tick(void) { } 1957 1916 static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } 1958 - static inline int perf_event_period(struct perf_event *event, u64 value) 1959 - { 1960 - return -EINVAL; 1961 - } 1962 - static inline u64 perf_event_pause(struct perf_event *event, bool reset) 1963 - { 1964 - return 0; 1965 - } 1966 - static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) 1967 - { 1968 - return 0; 1969 - } 1970 - #endif 1917 + static inline int 1918 + perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } 1919 + static inline u64 1920 + perf_event_pause(struct perf_event *event, bool reset) { return 0; } 1921 + static inline int 1922 + perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { return 0; } 1923 + 1924 + #endif /* !CONFIG_PERF_EVENTS */ 1971 1925 1972 1926 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) 1973 1927 extern void perf_restore_debug_store(void); ··· 1970 1934 static inline void perf_restore_debug_store(void) { } 1971 1935 #endif 1972 1936 1973 - #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) 1937 + #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) 1974 1938 1975 1939 struct perf_pmu_events_attr { 1976 - struct device_attribute attr; 1977 - u64 id; 1978 - const char *event_str; 1940 + struct device_attribute attr; 1941 + u64 id; 1942 + const char *event_str; 1979 1943 }; 1980 1944 1981 1945 struct perf_pmu_events_ht_attr { 1982 - struct device_attribute attr; 1983 - u64 id; 1984 - const char *event_str_ht; 1985 - const char *event_str_noht; 1946 + struct device_attribute attr; 1947 + u64 id; 1948 + const char *event_str_ht; 1949 + const char *event_str_noht; 1986 1950 }; 1987 1951 1988 1952 struct perf_pmu_events_hybrid_attr { 1989 - struct device_attribute attr; 1990 - u64 id; 1991 - const char *event_str; 1992 - u64 pmu_type; 
1953 + struct device_attribute attr; 1954 + u64 id; 1955 + const char *event_str; 1956 + u64 pmu_type; 1993 1957 }; 1994 1958 1995 1959 struct perf_pmu_format_hybrid_attr { 1996 - struct device_attribute attr; 1997 - u64 pmu_type; 1960 + struct device_attribute attr; 1961 + u64 pmu_type; 1998 1962 }; 1999 1963 2000 1964 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, ··· 2036 2000 2037 2001 /* Performance counter hotplug functions */ 2038 2002 #ifdef CONFIG_PERF_EVENTS 2039 - int perf_event_init_cpu(unsigned int cpu); 2040 - int perf_event_exit_cpu(unsigned int cpu); 2003 + extern int perf_event_init_cpu(unsigned int cpu); 2004 + extern int perf_event_exit_cpu(unsigned int cpu); 2041 2005 #else 2042 - #define perf_event_init_cpu NULL 2043 - #define perf_event_exit_cpu NULL 2006 + # define perf_event_init_cpu NULL 2007 + # define perf_event_exit_cpu NULL 2044 2008 #endif 2045 2009 2046 2010 extern void arch_perf_update_userpage(struct perf_event *event,
+325 -314
include/uapi/linux/perf_event.h
··· 39 39 40 40 /* 41 41 * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE 42 + * 42 43 * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA 43 44 * AA: hardware event ID 44 45 * EEEEEEEE: PMU type ID 46 + * 45 47 * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB 46 48 * BB: hardware cache ID 47 49 * CC: hardware cache op ID 48 50 * DD: hardware cache op result ID 49 51 * EEEEEEEE: PMU type ID 50 - * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. 52 + * 53 + * If the PMU type ID is 0, PERF_TYPE_RAW will be applied. 51 54 */ 52 - #define PERF_PMU_TYPE_SHIFT 32 53 - #define PERF_HW_EVENT_MASK 0xffffffff 55 + #define PERF_PMU_TYPE_SHIFT 32 56 + #define PERF_HW_EVENT_MASK 0xffffffff 54 57 55 58 /* 56 59 * Generalized performance event event_id types, used by the ··· 115 112 /* 116 113 * Special "software" events provided by the kernel, even if the hardware 117 114 * does not support performance events. These events measure various 118 - * physical and sw events of the kernel (and allow the profiling of them as 115 + * physical and SW events of the kernel (and allow the profiling of them as 119 116 * well): 120 117 */ 121 118 enum perf_sw_ids { ··· 170 167 }; 171 168 172 169 #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) 170 + 173 171 /* 174 - * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set 172 + * Values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set. 175 173 * 176 174 * If the user does not pass priv level information via branch_sample_type, 177 175 * the kernel uses the event's priv level. Branch and event priv levels do ··· 182 178 * of branches and therefore it supersedes all the other types. 
183 179 */ 184 180 enum perf_branch_sample_type_shift { 185 - PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ 186 - PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ 187 - PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ 181 + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ 182 + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ 183 + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ 188 184 189 - PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ 190 - PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ 191 - PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ 192 - PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ 193 - PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ 194 - PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ 195 - PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ 185 + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ 186 + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ 187 + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ 188 + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ 189 + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ 190 + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ 191 + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ 196 192 PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ 197 193 198 - PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ 194 + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* CALL/RET stack */ 199 195 PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ 200 196 PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ 201 197 ··· 214 210 }; 215 211 216 212 enum perf_branch_sample_type { 217 - PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, 218 - PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, 219 - PERF_SAMPLE_BRANCH_HV = 1U << 
PERF_SAMPLE_BRANCH_HV_SHIFT, 213 + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, 214 + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, 215 + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, 220 216 221 - PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, 222 - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, 223 - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, 224 - PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, 225 - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, 226 - PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, 227 - PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, 228 - PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, 217 + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, 218 + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, 219 + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, 220 + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, 221 + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, 222 + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, 223 + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, 224 + PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, 229 225 230 - PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, 231 - PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, 232 - PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, 226 + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, 227 + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, 228 + PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, 233 229 234 - PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, 235 - 
PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, 230 + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, 231 + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, 236 232 237 - PERF_SAMPLE_BRANCH_TYPE_SAVE = 238 - 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, 233 + PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, 239 234 240 - PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, 235 + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, 241 236 242 - PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, 237 + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, 243 238 244 - PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, 239 + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, 245 240 246 - PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, 241 + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, 247 242 }; 248 243 249 244 /* 250 - * Common flow change classification 245 + * Common control flow change classifications: 251 246 */ 252 247 enum { 253 - PERF_BR_UNKNOWN = 0, /* unknown */ 254 - PERF_BR_COND = 1, /* conditional */ 255 - PERF_BR_UNCOND = 2, /* unconditional */ 256 - PERF_BR_IND = 3, /* indirect */ 257 - PERF_BR_CALL = 4, /* function call */ 258 - PERF_BR_IND_CALL = 5, /* indirect function call */ 259 - PERF_BR_RET = 6, /* function return */ 260 - PERF_BR_SYSCALL = 7, /* syscall */ 261 - PERF_BR_SYSRET = 8, /* syscall return */ 262 - PERF_BR_COND_CALL = 9, /* conditional function call */ 263 - PERF_BR_COND_RET = 10, /* conditional function return */ 264 - PERF_BR_ERET = 11, /* exception return */ 265 - PERF_BR_IRQ = 12, /* irq */ 266 - PERF_BR_SERROR = 13, /* system error */ 267 - PERF_BR_NO_TX = 14, /* not in transaction */ 268 - PERF_BR_EXTEND_ABI = 15, /* extend ABI */ 248 + PERF_BR_UNKNOWN = 0, /* Unknown */ 249 + PERF_BR_COND = 1, 
/* Conditional */ 250 + PERF_BR_UNCOND = 2, /* Unconditional */ 251 + PERF_BR_IND = 3, /* Indirect */ 252 + PERF_BR_CALL = 4, /* Function call */ 253 + PERF_BR_IND_CALL = 5, /* Indirect function call */ 254 + PERF_BR_RET = 6, /* Function return */ 255 + PERF_BR_SYSCALL = 7, /* Syscall */ 256 + PERF_BR_SYSRET = 8, /* Syscall return */ 257 + PERF_BR_COND_CALL = 9, /* Conditional function call */ 258 + PERF_BR_COND_RET = 10, /* Conditional function return */ 259 + PERF_BR_ERET = 11, /* Exception return */ 260 + PERF_BR_IRQ = 12, /* IRQ */ 261 + PERF_BR_SERROR = 13, /* System error */ 262 + PERF_BR_NO_TX = 14, /* Not in transaction */ 263 + PERF_BR_EXTEND_ABI = 15, /* Extend ABI */ 269 264 PERF_BR_MAX, 270 265 }; 271 266 272 267 /* 273 - * Common branch speculation outcome classification 268 + * Common branch speculation outcome classifications: 274 269 */ 275 270 enum { 276 - PERF_BR_SPEC_NA = 0, /* Not available */ 277 - PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ 278 - PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ 279 - PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ 271 + PERF_BR_SPEC_NA = 0, /* Not available */ 272 + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ 273 + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ 274 + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ 280 275 PERF_BR_SPEC_MAX, 281 276 }; 282 277 283 278 enum { 284 - PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ 285 - PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ 286 - PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ 287 - PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ 288 - PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ 289 - PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ 290 - PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ 291 - PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ 279 + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ 280 
+ PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ 281 + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ 282 + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ 283 + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ 284 + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ 285 + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ 286 + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ 292 287 PERF_BR_NEW_MAX, 293 288 }; 294 289 295 290 enum { 296 - PERF_BR_PRIV_UNKNOWN = 0, 297 - PERF_BR_PRIV_USER = 1, 298 - PERF_BR_PRIV_KERNEL = 2, 299 - PERF_BR_PRIV_HV = 3, 291 + PERF_BR_PRIV_UNKNOWN = 0, 292 + PERF_BR_PRIV_USER = 1, 293 + PERF_BR_PRIV_KERNEL = 2, 294 + PERF_BR_PRIV_HV = 3, 300 295 }; 301 296 302 - #define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 303 - #define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 304 - #define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 305 - #define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 306 - #define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 297 + #define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 298 + #define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 299 + #define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 300 + #define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 301 + #define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 307 302 308 303 #define PERF_SAMPLE_BRANCH_PLM_ALL \ 309 304 (PERF_SAMPLE_BRANCH_USER|\ ··· 313 310 * Values to determine ABI of the registers dump. 314 311 */ 315 312 enum perf_sample_regs_abi { 316 - PERF_SAMPLE_REGS_ABI_NONE = 0, 317 - PERF_SAMPLE_REGS_ABI_32 = 1, 318 - PERF_SAMPLE_REGS_ABI_64 = 2, 313 + PERF_SAMPLE_REGS_ABI_NONE = 0, 314 + PERF_SAMPLE_REGS_ABI_32 = 1, 315 + PERF_SAMPLE_REGS_ABI_64 = 2, 319 316 }; 320 317 321 318 /* ··· 323 320 * abort events. Multiple bits can be set. 
324 321 */ 325 322 enum { 326 - PERF_TXN_ELISION = (1 << 0), /* From elision */ 327 - PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ 328 - PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ 329 - PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ 330 - PERF_TXN_RETRY = (1 << 4), /* Retry possible */ 331 - PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ 332 - PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ 333 - PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ 323 + PERF_TXN_ELISION = (1 << 0), /* From elision */ 324 + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ 325 + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ 326 + PERF_TXN_ASYNC = (1 << 3), /* Instruction is not related */ 327 + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ 328 + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ 329 + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ 330 + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ 334 331 335 - PERF_TXN_MAX = (1 << 8), /* non-ABI */ 332 + PERF_TXN_MAX = (1 << 8), /* non-ABI */ 336 333 337 - /* bits 32..63 are reserved for the abort code */ 334 + /* Bits 32..63 are reserved for the abort code */ 338 335 339 - PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), 340 - PERF_TXN_ABORT_SHIFT = 32, 336 + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), 337 + PERF_TXN_ABORT_SHIFT = 32, 341 338 }; 342 339 343 340 /* ··· 372 369 PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ 373 370 }; 374 371 375 - #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ 376 - #define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ 377 - #define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ 378 - #define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ 379 - /* add: sample_stack_user */ 380 - #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ 381 - #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ 382 - #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ 383 - 
#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ 384 - #define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ 372 + #define PERF_ATTR_SIZE_VER0 64 /* Size of first published 'struct perf_event_attr' */ 373 + #define PERF_ATTR_SIZE_VER1 72 /* Add: config2 */ 374 + #define PERF_ATTR_SIZE_VER2 80 /* Add: branch_sample_type */ 375 + #define PERF_ATTR_SIZE_VER3 96 /* Add: sample_regs_user */ 376 + /* Add: sample_stack_user */ 377 + #define PERF_ATTR_SIZE_VER4 104 /* Add: sample_regs_intr */ 378 + #define PERF_ATTR_SIZE_VER5 112 /* Add: aux_watermark */ 379 + #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */ 380 + #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */ 381 + #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */ 385 382 386 383 /* 387 - * Hardware event_id to monitor via a performance monitoring event: 388 - * 389 - * @sample_max_stack: Max number of frame pointers in a callchain, 390 - * should be < /proc/sys/kernel/perf_event_max_stack 391 - * Max number of entries of branch stack 392 - * should be < hardware limit 384 + * 'struct perf_event_attr' contains various attributes that define 385 + * a performance event - most of them hardware related configuration 386 + * details, but also a lot of behavioral switches and values implemented 387 + * by the kernel. 393 388 */ 394 389 struct perf_event_attr { 395 390 ··· 397 396 __u32 type; 398 397 399 398 /* 400 - * Size of the attr structure, for fwd/bwd compat. 399 + * Size of the attr structure, for forward/backwards compatibility. 
401 400 */ 402 401 __u32 size; 403 402 ··· 452 451 comm_exec : 1, /* flag comm events that are due to an exec */ 453 452 use_clockid : 1, /* use @clockid for time fields */ 454 453 context_switch : 1, /* context switch data */ 455 - write_backward : 1, /* Write ring buffer from end to beginning */ 454 + write_backward : 1, /* write ring buffer from end to beginning */ 456 455 namespaces : 1, /* include namespaces data */ 457 456 ksymbol : 1, /* include ksymbol events */ 458 - bpf_event : 1, /* include bpf events */ 457 + bpf_event : 1, /* include BPF events */ 459 458 aux_output : 1, /* generate AUX records instead of events */ 460 459 cgroup : 1, /* include cgroup events */ 461 460 text_poke : 1, /* include text poke events */ 462 - build_id : 1, /* use build id in mmap2 events */ 461 + build_id : 1, /* use build ID in mmap2 events */ 463 462 inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ 464 463 remove_on_exec : 1, /* event is removed from task on exec */ 465 464 sigtrap : 1, /* send synchronous SIGTRAP on event */ 466 465 __reserved_1 : 26; 467 466 468 467 union { 469 - __u32 wakeup_events; /* wakeup every n events */ 468 + __u32 wakeup_events; /* wake up every n events */ 470 469 __u32 wakeup_watermark; /* bytes before wakeup */ 471 470 }; 472 471 ··· 475 474 __u64 bp_addr; 476 475 __u64 kprobe_func; /* for perf_kprobe */ 477 476 __u64 uprobe_path; /* for perf_uprobe */ 478 - __u64 config1; /* extension of config */ 477 + __u64 config1; /* extension of config */ 479 478 }; 480 479 union { 481 480 __u64 bp_len; 482 - __u64 kprobe_addr; /* when kprobe_func == NULL */ 481 + __u64 kprobe_addr; /* when kprobe_func == NULL */ 483 482 __u64 probe_offset; /* for perf_[k,u]probe */ 484 - __u64 config2; /* extension of config1 */ 483 + __u64 config2; /* extension of config1 */ 485 484 }; 486 485 __u64 branch_sample_type; /* enum perf_branch_sample_type */ 487 486 ··· 511 510 * Wakeup watermark for AUX area 512 511 */ 513 512 __u32 
aux_watermark; 513 + 514 + /* 515 + * Max number of frame pointers in a callchain, should be 516 + * lower than /proc/sys/kernel/perf_event_max_stack. 517 + * 518 + * Max number of entries of branch stack should be lower 519 + * than the hardware limit. 520 + */ 514 521 __u16 sample_max_stack; 522 + 515 523 __u16 __reserved_2; 516 524 __u32 aux_sample_size; 517 525 ··· 547 537 548 538 /* 549 539 * Structure used by below PERF_EVENT_IOC_QUERY_BPF command 550 - * to query bpf programs attached to the same perf tracepoint 540 + * to query BPF programs attached to the same perf tracepoint 551 541 * as the given perf event. 552 542 */ 553 543 struct perf_event_query_bpf { ··· 569 559 /* 570 560 * Ioctls that can be done on a perf event fd: 571 561 */ 572 - #define PERF_EVENT_IOC_ENABLE _IO ('$', 0) 573 - #define PERF_EVENT_IOC_DISABLE _IO ('$', 1) 574 - #define PERF_EVENT_IOC_REFRESH _IO ('$', 2) 575 - #define PERF_EVENT_IOC_RESET _IO ('$', 3) 576 - #define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) 577 - #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) 578 - #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) 579 - #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) 580 - #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) 581 - #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) 562 + #define PERF_EVENT_IOC_ENABLE _IO ('$', 0) 563 + #define PERF_EVENT_IOC_DISABLE _IO ('$', 1) 564 + #define PERF_EVENT_IOC_REFRESH _IO ('$', 2) 565 + #define PERF_EVENT_IOC_RESET _IO ('$', 3) 566 + #define PERF_EVENT_IOC_PERIOD _IOW ('$', 4, __u64) 567 + #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) 568 + #define PERF_EVENT_IOC_SET_FILTER _IOW ('$', 6, char *) 569 + #define PERF_EVENT_IOC_ID _IOR ('$', 7, __u64 *) 570 + #define PERF_EVENT_IOC_SET_BPF _IOW ('$', 8, __u32) 571 + #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW ('$', 9, __u32) 582 572 #define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) 583 - #define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct 
perf_event_attr *) 573 + #define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW ('$', 11, struct perf_event_attr *) 584 574 585 575 enum perf_event_ioc_flags { 586 - PERF_IOC_FLAG_GROUP = 1U << 0, 576 + PERF_IOC_FLAG_GROUP = 1U << 0, 587 577 }; 588 578 589 579 /* ··· 594 584 __u32 compat_version; /* lowest version this is compat with */ 595 585 596 586 /* 597 - * Bits needed to read the hw events in user-space. 587 + * Bits needed to read the HW events in user-space. 598 588 * 599 589 * u32 seq, time_mult, time_shift, index, width; 600 590 * u64 count, enabled, running; ··· 632 622 __u32 index; /* hardware event identifier */ 633 623 __s64 offset; /* add to hardware event value */ 634 624 __u64 time_enabled; /* time event active */ 635 - __u64 time_running; /* time event on cpu */ 625 + __u64 time_running; /* time event on CPU */ 636 626 union { 637 627 __u64 capabilities; 638 628 struct { ··· 660 650 661 651 /* 662 652 * If cap_usr_time the below fields can be used to compute the time 663 - * delta since time_enabled (in ns) using rdtsc or similar. 653 + * delta since time_enabled (in ns) using RDTSC or similar. 664 654 * 665 655 * u64 quot, rem; 666 656 * u64 delta; ··· 733 723 * after reading this value. 734 724 * 735 725 * When the mapping is PROT_WRITE the @data_tail value should be 736 - * written by userspace to reflect the last read data, after issueing 726 + * written by user-space to reflect the last read data, after issuing 737 727 * an smp_mb() to separate the data read from the ->data_tail store. 738 728 * In this case the kernel will not over-write unread data. 739 729 * ··· 749 739 750 740 /* 751 741 * AUX area is defined by aux_{offset,size} fields that should be set 752 - * by the userspace, so that 742 + * by the user-space, so that 753 743 * 754 744 * aux_offset >= data_offset + data_size 755 745 * ··· 823 813 * Indicates that thread was preempted in TASK_RUNNING state. 
824 814 * 825 815 * PERF_RECORD_MISC_MMAP_BUILD_ID: 826 - * Indicates that mmap2 event carries build id data. 816 + * Indicates that mmap2 event carries build ID data. 827 817 */ 828 818 #define PERF_RECORD_MISC_EXACT_IP (1 << 14) 829 819 #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) ··· 834 824 #define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) 835 825 836 826 struct perf_event_header { 837 - __u32 type; 838 - __u16 misc; 839 - __u16 size; 827 + __u32 type; 828 + __u16 misc; 829 + __u16 size; 840 830 }; 841 831 842 832 struct perf_ns_link_info { 843 - __u64 dev; 844 - __u64 ino; 833 + __u64 dev; 834 + __u64 ino; 845 835 }; 846 836 847 837 enum { 848 - NET_NS_INDEX = 0, 849 - UTS_NS_INDEX = 1, 850 - IPC_NS_INDEX = 2, 851 - PID_NS_INDEX = 3, 852 - USER_NS_INDEX = 4, 853 - MNT_NS_INDEX = 5, 854 - CGROUP_NS_INDEX = 6, 838 + NET_NS_INDEX = 0, 839 + UTS_NS_INDEX = 1, 840 + IPC_NS_INDEX = 2, 841 + PID_NS_INDEX = 3, 842 + USER_NS_INDEX = 4, 843 + MNT_NS_INDEX = 5, 844 + CGROUP_NS_INDEX = 6, 855 845 856 - NR_NAMESPACES, /* number of available namespaces */ 846 + NR_NAMESPACES, /* number of available namespaces */ 857 847 }; 858 848 859 849 enum perf_event_type { ··· 869 859 * optional fields being ignored. 870 860 * 871 861 * struct sample_id { 872 - * { u32 pid, tid; } && PERF_SAMPLE_TID 873 - * { u64 time; } && PERF_SAMPLE_TIME 874 - * { u64 id; } && PERF_SAMPLE_ID 875 - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID 876 - * { u32 cpu, res; } && PERF_SAMPLE_CPU 862 + * { u32 pid, tid; } && PERF_SAMPLE_TID 863 + * { u64 time; } && PERF_SAMPLE_TIME 864 + * { u64 id; } && PERF_SAMPLE_ID 865 + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID 866 + * { u32 cpu, res; } && PERF_SAMPLE_CPU 877 867 * { u64 id; } && PERF_SAMPLE_IDENTIFIER 878 868 * } && perf_event_attr::sample_id_all 879 869 * ··· 884 874 885 875 /* 886 876 * The MMAP events record the PROT_EXEC mappings so that we can 887 - * correlate userspace IPs to code. 
They have the following structure: 877 + * correlate user-space IPs to code. They have the following structure: 888 878 * 889 879 * struct { 890 880 * struct perf_event_header header; ··· 894 884 * u64 len; 895 885 * u64 pgoff; 896 886 * char filename[]; 897 - * struct sample_id sample_id; 887 + * struct sample_id sample_id; 898 888 * }; 899 889 */ 900 890 PERF_RECORD_MMAP = 1, ··· 904 894 * struct perf_event_header header; 905 895 * u64 id; 906 896 * u64 lost; 907 - * struct sample_id sample_id; 897 + * struct sample_id sample_id; 908 898 * }; 909 899 */ 910 900 PERF_RECORD_LOST = 2, ··· 915 905 * 916 906 * u32 pid, tid; 917 907 * char comm[]; 918 - * struct sample_id sample_id; 908 + * struct sample_id sample_id; 919 909 * }; 920 910 */ 921 911 PERF_RECORD_COMM = 3, ··· 926 916 * u32 pid, ppid; 927 917 * u32 tid, ptid; 928 918 * u64 time; 929 - * struct sample_id sample_id; 919 + * struct sample_id sample_id; 930 920 * }; 931 921 */ 932 922 PERF_RECORD_EXIT = 4, ··· 937 927 * u64 time; 938 928 * u64 id; 939 929 * u64 stream_id; 940 - * struct sample_id sample_id; 930 + * struct sample_id sample_id; 941 931 * }; 942 932 */ 943 933 PERF_RECORD_THROTTLE = 5, ··· 949 939 * u32 pid, ppid; 950 940 * u32 tid, ptid; 951 941 * u64 time; 952 - * struct sample_id sample_id; 942 + * struct sample_id sample_id; 953 943 * }; 954 944 */ 955 945 PERF_RECORD_FORK = 7, ··· 960 950 * u32 pid, tid; 961 951 * 962 952 * struct read_format values; 963 - * struct sample_id sample_id; 953 + * struct sample_id sample_id; 964 954 * }; 965 955 */ 966 956 PERF_RECORD_READ = 8, ··· 1015 1005 * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS 1016 1006 * } && PERF_SAMPLE_BRANCH_STACK 1017 1007 * 1018 - * { u64 abi; # enum perf_sample_regs_abi 1019 - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER 1008 + * { u64 abi; # enum perf_sample_regs_abi 1009 + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER 1020 1010 * 1021 - * { u64 size; 1022 - * char data[size]; 1023 - * u64 
dyn_size; } && PERF_SAMPLE_STACK_USER 1011 + * { u64 size; 1012 + * char data[size]; 1013 + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER 1024 1014 * 1025 1015 * { union perf_sample_weight 1026 1016 * { ··· 1045 1035 * { u64 abi; # enum perf_sample_regs_abi 1046 1036 * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR 1047 1037 * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR 1048 - * { u64 size; 1049 - * char data[size]; } && PERF_SAMPLE_AUX 1038 + * { u64 cgroup;} && PERF_SAMPLE_CGROUP 1050 1039 * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE 1051 1040 * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE 1041 + * { u64 size; 1042 + * char data[size]; } && PERF_SAMPLE_AUX 1052 1043 * }; 1053 1044 */ 1054 1045 PERF_RECORD_SAMPLE = 9, ··· 1081 1070 * }; 1082 1071 * u32 prot, flags; 1083 1072 * char filename[]; 1084 - * struct sample_id sample_id; 1073 + * struct sample_id sample_id; 1085 1074 * }; 1086 1075 */ 1087 1076 PERF_RECORD_MMAP2 = 10, ··· 1090 1079 * Records that new data landed in the AUX buffer part. 
1091 1080 * 1092 1081 * struct { 1093 - * struct perf_event_header header; 1082 + * struct perf_event_header header; 1094 1083 * 1095 - * u64 aux_offset; 1096 - * u64 aux_size; 1084 + * u64 aux_offset; 1085 + * u64 aux_size; 1097 1086 * u64 flags; 1098 - * struct sample_id sample_id; 1087 + * struct sample_id sample_id; 1099 1088 * }; 1100 1089 */ 1101 1090 PERF_RECORD_AUX = 11, ··· 1178 1167 PERF_RECORD_KSYMBOL = 17, 1179 1168 1180 1169 /* 1181 - * Record bpf events: 1170 + * Record BPF events: 1182 1171 * enum perf_bpf_event_type { 1183 1172 * PERF_BPF_EVENT_UNKNOWN = 0, 1184 1173 * PERF_BPF_EVENT_PROG_LOAD = 1, ··· 1256 1245 #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) 1257 1246 1258 1247 enum perf_bpf_event_type { 1259 - PERF_BPF_EVENT_UNKNOWN = 0, 1260 - PERF_BPF_EVENT_PROG_LOAD = 1, 1261 - PERF_BPF_EVENT_PROG_UNLOAD = 2, 1262 - PERF_BPF_EVENT_MAX, /* non-ABI */ 1248 + PERF_BPF_EVENT_UNKNOWN = 0, 1249 + PERF_BPF_EVENT_PROG_LOAD = 1, 1250 + PERF_BPF_EVENT_PROG_UNLOAD = 2, 1251 + PERF_BPF_EVENT_MAX, /* non-ABI */ 1263 1252 }; 1264 1253 1265 - #define PERF_MAX_STACK_DEPTH 127 1266 - #define PERF_MAX_CONTEXTS_PER_STACK 8 1254 + #define PERF_MAX_STACK_DEPTH 127 1255 + #define PERF_MAX_CONTEXTS_PER_STACK 8 1267 1256 1268 1257 enum perf_callchain_context { 1269 - PERF_CONTEXT_HV = (__u64)-32, 1270 - PERF_CONTEXT_KERNEL = (__u64)-128, 1271 - PERF_CONTEXT_USER = (__u64)-512, 1258 + PERF_CONTEXT_HV = (__u64)-32, 1259 + PERF_CONTEXT_KERNEL = (__u64)-128, 1260 + PERF_CONTEXT_USER = (__u64)-512, 1272 1261 1273 - PERF_CONTEXT_GUEST = (__u64)-2048, 1274 - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, 1275 - PERF_CONTEXT_GUEST_USER = (__u64)-2560, 1262 + PERF_CONTEXT_GUEST = (__u64)-2048, 1263 + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, 1264 + PERF_CONTEXT_GUEST_USER = (__u64)-2560, 1276 1265 1277 - PERF_CONTEXT_MAX = (__u64)-4095, 1266 + PERF_CONTEXT_MAX = (__u64)-4095, 1278 1267 }; 1279 1268 1280 1269 /** 1281 1270 * PERF_RECORD_AUX::flags bits 1282 1271 */ 1283 - 
#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ 1284 - #define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ 1285 - #define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ 1286 - #define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ 1272 + #define PERF_AUX_FLAG_TRUNCATED 0x0001 /* Record was truncated to fit */ 1273 + #define PERF_AUX_FLAG_OVERWRITE 0x0002 /* Snapshot from overwrite mode */ 1274 + #define PERF_AUX_FLAG_PARTIAL 0x0004 /* Record contains gaps */ 1275 + #define PERF_AUX_FLAG_COLLISION 0x0008 /* Sample collided with another */ 1287 1276 #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ 1288 1277 1289 1278 /* CoreSight PMU AUX buffer formats */ 1290 - #define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ 1291 - #define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ 1279 + #define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ 1280 + #define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ 1292 1281 1293 - #define PERF_FLAG_FD_NO_GROUP (1UL << 0) 1294 - #define PERF_FLAG_FD_OUTPUT (1UL << 1) 1295 - #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ 1296 - #define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ 1282 + #define PERF_FLAG_FD_NO_GROUP (1UL << 0) 1283 + #define PERF_FLAG_FD_OUTPUT (1UL << 1) 1284 + #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup ID, per-CPU mode only */ 1285 + #define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ 1297 1286 1298 1287 #if defined(__LITTLE_ENDIAN_BITFIELD) 1299 1288 union perf_mem_data_src { 1300 1289 __u64 val; 1301 1290 struct { 1302 - __u64 mem_op:5, /* type of opcode */ 1303 - mem_lvl:14, /* memory hierarchy level */ 1304 - mem_snoop:5, /* snoop mode */ 1305 - mem_lock:2, /* lock instr */ 1306 - mem_dtlb:7, /* tlb access */ 1307 - 
mem_lvl_num:4, /* memory hierarchy level number */ 1308 - mem_remote:1, /* remote */ 1309 - mem_snoopx:2, /* snoop mode, ext */ 1310 - mem_blk:3, /* access blocked */ 1311 - mem_hops:3, /* hop level */ 1312 - mem_rsvd:18; 1291 + __u64 mem_op : 5, /* Type of opcode */ 1292 + mem_lvl : 14, /* Memory hierarchy level */ 1293 + mem_snoop : 5, /* Snoop mode */ 1294 + mem_lock : 2, /* Lock instr */ 1295 + mem_dtlb : 7, /* TLB access */ 1296 + mem_lvl_num : 4, /* Memory hierarchy level number */ 1297 + mem_remote : 1, /* Remote */ 1298 + mem_snoopx : 2, /* Snoop mode, ext */ 1299 + mem_blk : 3, /* Access blocked */ 1300 + mem_hops : 3, /* Hop level */ 1301 + mem_rsvd : 18; 1313 1302 }; 1314 1303 }; 1315 1304 #elif defined(__BIG_ENDIAN_BITFIELD) 1316 1305 union perf_mem_data_src { 1317 1306 __u64 val; 1318 1307 struct { 1319 - __u64 mem_rsvd:18, 1320 - mem_hops:3, /* hop level */ 1321 - mem_blk:3, /* access blocked */ 1322 - mem_snoopx:2, /* snoop mode, ext */ 1323 - mem_remote:1, /* remote */ 1324 - mem_lvl_num:4, /* memory hierarchy level number */ 1325 - mem_dtlb:7, /* tlb access */ 1326 - mem_lock:2, /* lock instr */ 1327 - mem_snoop:5, /* snoop mode */ 1328 - mem_lvl:14, /* memory hierarchy level */ 1329 - mem_op:5; /* type of opcode */ 1308 + __u64 mem_rsvd : 18, 1309 + mem_hops : 3, /* Hop level */ 1310 + mem_blk : 3, /* Access blocked */ 1311 + mem_snoopx : 2, /* Snoop mode, ext */ 1312 + mem_remote : 1, /* Remote */ 1313 + mem_lvl_num : 4, /* Memory hierarchy level number */ 1314 + mem_dtlb : 7, /* TLB access */ 1315 + mem_lock : 2, /* Lock instr */ 1316 + mem_snoop : 5, /* Snoop mode */ 1317 + mem_lvl : 14, /* Memory hierarchy level */ 1318 + mem_op : 5; /* Type of opcode */ 1330 1319 }; 1331 1320 }; 1332 1321 #else 1333 - #error "Unknown endianness" 1322 + # error "Unknown endianness" 1334 1323 #endif 1335 1324 1336 - /* type of opcode (load/store/prefetch,code) */ 1337 - #define PERF_MEM_OP_NA 0x01 /* not available */ 1338 - #define PERF_MEM_OP_LOAD 0x02 /* load 
instruction */ 1339 - #define PERF_MEM_OP_STORE 0x04 /* store instruction */ 1340 - #define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ 1341 - #define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ 1342 - #define PERF_MEM_OP_SHIFT 0 1325 + /* Type of memory opcode: */ 1326 + #define PERF_MEM_OP_NA 0x0001 /* Not available */ 1327 + #define PERF_MEM_OP_LOAD 0x0002 /* Load instruction */ 1328 + #define PERF_MEM_OP_STORE 0x0004 /* Store instruction */ 1329 + #define PERF_MEM_OP_PFETCH 0x0008 /* Prefetch */ 1330 + #define PERF_MEM_OP_EXEC 0x0010 /* Code (execution) */ 1331 + #define PERF_MEM_OP_SHIFT 0 1343 1332 1344 1333 /* 1345 - * PERF_MEM_LVL_* namespace being depricated to some extent in the 1334 + * The PERF_MEM_LVL_* namespace is being deprecated to some extent in 1346 1335 * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. 1347 - * Supporting this namespace inorder to not break defined ABIs. 1336 + * We support this namespace in order to not break defined ABIs. 1348 1337 * 1349 - * memory hierarchy (memory level, hit or miss) 1338 + * Memory hierarchy (memory level, hit or miss) 1350 1339 */ 1351 - #define PERF_MEM_LVL_NA 0x01 /* not available */ 1352 - #define PERF_MEM_LVL_HIT 0x02 /* hit level */ 1353 - #define PERF_MEM_LVL_MISS 0x04 /* miss level */ 1354 - #define PERF_MEM_LVL_L1 0x08 /* L1 */ 1355 - #define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ 1356 - #define PERF_MEM_LVL_L2 0x20 /* L2 */ 1357 - #define PERF_MEM_LVL_L3 0x40 /* L3 */ 1358 - #define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ 1359 - #define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ 1360 - #define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ 1361 - #define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ 1362 - #define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ 1363 - #define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ 1364 - #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ 1365 - #define PERF_MEM_LVL_SHIFT 5 1340 + #define 
PERF_MEM_LVL_NA 0x0001 /* Not available */ 1341 + #define PERF_MEM_LVL_HIT 0x0002 /* Hit level */ 1342 + #define PERF_MEM_LVL_MISS 0x0004 /* Miss level */ 1343 + #define PERF_MEM_LVL_L1 0x0008 /* L1 */ 1344 + #define PERF_MEM_LVL_LFB 0x0010 /* Line Fill Buffer */ 1345 + #define PERF_MEM_LVL_L2 0x0020 /* L2 */ 1346 + #define PERF_MEM_LVL_L3 0x0040 /* L3 */ 1347 + #define PERF_MEM_LVL_LOC_RAM 0x0080 /* Local DRAM */ 1348 + #define PERF_MEM_LVL_REM_RAM1 0x0100 /* Remote DRAM (1 hop) */ 1349 + #define PERF_MEM_LVL_REM_RAM2 0x0200 /* Remote DRAM (2 hops) */ 1350 + #define PERF_MEM_LVL_REM_CCE1 0x0400 /* Remote Cache (1 hop) */ 1351 + #define PERF_MEM_LVL_REM_CCE2 0x0800 /* Remote Cache (2 hops) */ 1352 + #define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ 1353 + #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ 1354 + #define PERF_MEM_LVL_SHIFT 5 1366 1355 1367 - #define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ 1368 - #define PERF_MEM_REMOTE_SHIFT 37 1356 + #define PERF_MEM_REMOTE_REMOTE 0x0001 /* Remote */ 1357 + #define PERF_MEM_REMOTE_SHIFT 37 1369 1358 1370 - #define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ 1371 - #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ 1372 - #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ 1373 - #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ 1374 - #define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ 1375 - #define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ 1376 - /* 0x7 available */ 1377 - #define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ 1378 - #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ 1379 - #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ 1380 - #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ 1381 - #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ 1382 - #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ 1383 - #define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ 1384 - #define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ 1359 + #define PERF_MEM_LVLNUM_L1 0x0001 /* L1 */ 1360 + #define PERF_MEM_LVLNUM_L2 0x0002 /* L2 */ 1361 + #define 
PERF_MEM_LVLNUM_L3 0x0003 /* L3 */ 1362 + #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ 1363 + #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ 1364 + #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ 1365 + /* 0x007 available */ 1366 + #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ 1367 + #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ 1368 + #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ 1369 + #define PERF_MEM_LVLNUM_ANY_CACHE 0x000b /* Any cache */ 1370 + #define PERF_MEM_LVLNUM_LFB 0x000c /* LFB / L1 Miss Handling Buffer */ 1371 + #define PERF_MEM_LVLNUM_RAM 0x000d /* RAM */ 1372 + #define PERF_MEM_LVLNUM_PMEM 0x000e /* PMEM */ 1373 + #define PERF_MEM_LVLNUM_NA 0x000f /* N/A */ 1385 1374 1386 - #define PERF_MEM_LVLNUM_SHIFT 33 1375 + #define PERF_MEM_LVLNUM_SHIFT 33 1387 1376 1388 - /* snoop mode */ 1389 - #define PERF_MEM_SNOOP_NA 0x01 /* not available */ 1390 - #define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ 1391 - #define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ 1392 - #define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ 1393 - #define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ 1394 - #define PERF_MEM_SNOOP_SHIFT 19 1377 + /* Snoop mode */ 1378 + #define PERF_MEM_SNOOP_NA 0x0001 /* Not available */ 1379 + #define PERF_MEM_SNOOP_NONE 0x0002 /* No snoop */ 1380 + #define PERF_MEM_SNOOP_HIT 0x0004 /* Snoop hit */ 1381 + #define PERF_MEM_SNOOP_MISS 0x0008 /* Snoop miss */ 1382 + #define PERF_MEM_SNOOP_HITM 0x0010 /* Snoop hit modified */ 1383 + #define PERF_MEM_SNOOP_SHIFT 19 1395 1384 1396 - #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ 1397 - #define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ 1398 - #define PERF_MEM_SNOOPX_SHIFT 38 1385 + #define PERF_MEM_SNOOPX_FWD 0x0001 /* Forward */ 1386 + #define PERF_MEM_SNOOPX_PEER 0x0002 /* Transfer from peer */ 1387 + #define PERF_MEM_SNOOPX_SHIFT 38 1399 1388 1400 - /* locked instruction */ 1401 - #define PERF_MEM_LOCK_NA 0x01 /* not available */ 1402 - #define PERF_MEM_LOCK_LOCKED 0x02 
/* locked transaction */ 1403 - #define PERF_MEM_LOCK_SHIFT 24 1389 + /* Locked instruction */ 1390 + #define PERF_MEM_LOCK_NA 0x0001 /* Not available */ 1391 + #define PERF_MEM_LOCK_LOCKED 0x0002 /* Locked transaction */ 1392 + #define PERF_MEM_LOCK_SHIFT 24 1404 1393 1405 1394 /* TLB access */ 1406 - #define PERF_MEM_TLB_NA 0x01 /* not available */ 1407 - #define PERF_MEM_TLB_HIT 0x02 /* hit level */ 1408 - #define PERF_MEM_TLB_MISS 0x04 /* miss level */ 1409 - #define PERF_MEM_TLB_L1 0x08 /* L1 */ 1410 - #define PERF_MEM_TLB_L2 0x10 /* L2 */ 1411 - #define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ 1412 - #define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ 1413 - #define PERF_MEM_TLB_SHIFT 26 1395 + #define PERF_MEM_TLB_NA 0x0001 /* Not available */ 1396 + #define PERF_MEM_TLB_HIT 0x0002 /* Hit level */ 1397 + #define PERF_MEM_TLB_MISS 0x0004 /* Miss level */ 1398 + #define PERF_MEM_TLB_L1 0x0008 /* L1 */ 1399 + #define PERF_MEM_TLB_L2 0x0010 /* L2 */ 1400 + #define PERF_MEM_TLB_WK 0x0020 /* Hardware Walker*/ 1401 + #define PERF_MEM_TLB_OS 0x0040 /* OS fault handler */ 1402 + #define PERF_MEM_TLB_SHIFT 26 1414 1403 1415 1404 /* Access blocked */ 1416 - #define PERF_MEM_BLK_NA 0x01 /* not available */ 1417 - #define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ 1418 - #define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ 1419 - #define PERF_MEM_BLK_SHIFT 40 1405 + #define PERF_MEM_BLK_NA 0x0001 /* Not available */ 1406 + #define PERF_MEM_BLK_DATA 0x0002 /* Data could not be forwarded */ 1407 + #define PERF_MEM_BLK_ADDR 0x0004 /* Address conflict */ 1408 + #define PERF_MEM_BLK_SHIFT 40 1420 1409 1421 - /* hop level */ 1422 - #define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ 1423 - #define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ 1424 - #define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ 1425 - #define PERF_MEM_HOPS_3 0x04 /* remote board */ 1410 + /* Hop level */ 1411 + #define PERF_MEM_HOPS_0 0x0001 /* Remote core, same node 
*/ 1412 + #define PERF_MEM_HOPS_1 0x0002 /* Remote node, same socket */ 1413 + #define PERF_MEM_HOPS_2 0x0003 /* Remote socket, same board */ 1414 + #define PERF_MEM_HOPS_3 0x0004 /* Remote board */ 1426 1415 /* 5-7 available */ 1427 - #define PERF_MEM_HOPS_SHIFT 43 1416 + #define PERF_MEM_HOPS_SHIFT 43 1428 1417 1429 1418 #define PERF_MEM_S(a, s) \ 1430 1419 (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) 1431 1420 1432 1421 /* 1433 - * single taken branch record layout: 1422 + * Layout of single taken branch records: 1434 1423 * 1435 1424 * from: source instruction (may not always be a branch insn) 1436 1425 * to: branch target ··· 1449 1438 struct perf_branch_entry { 1450 1439 __u64 from; 1451 1440 __u64 to; 1452 - __u64 mispred:1, /* target mispredicted */ 1453 - predicted:1,/* target predicted */ 1454 - in_tx:1, /* in transaction */ 1455 - abort:1, /* transaction abort */ 1456 - cycles:16, /* cycle count to last branch */ 1457 - type:4, /* branch type */ 1458 - spec:2, /* branch speculation info */ 1459 - new_type:4, /* additional branch type */ 1460 - priv:3, /* privilege level */ 1461 - reserved:31; 1441 + __u64 mispred : 1, /* target mispredicted */ 1442 + predicted : 1, /* target predicted */ 1443 + in_tx : 1, /* in transaction */ 1444 + abort : 1, /* transaction abort */ 1445 + cycles : 16, /* cycle count to last branch */ 1446 + type : 4, /* branch type */ 1447 + spec : 2, /* branch speculation info */ 1448 + new_type : 4, /* additional branch type */ 1449 + priv : 3, /* privilege level */ 1450 + reserved : 31; 1462 1451 }; 1463 1452 1464 1453 /* Size of used info bits in struct perf_branch_entry */ 1465 1454 #define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 1466 1455 1467 1456 union perf_sample_weight { 1468 - __u64 full; 1457 + __u64 full; 1469 1458 #if defined(__LITTLE_ENDIAN_BITFIELD) 1470 1459 struct { 1471 - __u32 var1_dw; 1472 - __u16 var2_w; 1473 - __u16 var3_w; 1460 + __u32 var1_dw; 1461 + __u16 var2_w; 1462 + __u16 var3_w; 1474 1463 }; 1475 1464 
#elif defined(__BIG_ENDIAN_BITFIELD) 1476 1465 struct { 1477 - __u16 var3_w; 1478 - __u16 var2_w; 1479 - __u32 var1_dw; 1466 + __u16 var3_w; 1467 + __u16 var2_w; 1468 + __u32 var1_dw; 1480 1469 }; 1481 1470 #else 1482 - #error "Unknown endianness" 1471 + # error "Unknown endianness" 1483 1472 #endif 1484 1473 }; 1485 1474
-5
kernel/cpu.c
··· 2069 2069 .teardown.single = NULL, 2070 2070 .cant_stop = true, 2071 2071 }, 2072 - [CPUHP_PERF_PREPARE] = { 2073 - .name = "perf:prepare", 2074 - .startup.single = perf_event_init_cpu, 2075 - .teardown.single = perf_event_exit_cpu, 2076 - }, 2077 2072 [CPUHP_RANDOM_PREPARE] = { 2078 2073 .name = "random:prepare", 2079 2074 .startup.single = random_prepare_cpu,
+424 -185
kernel/events/core.c
··· 1270 1270 if (ctx->task && ctx->task != TASK_TOMBSTONE) 1271 1271 put_task_struct(ctx->task); 1272 1272 call_rcu(&ctx->rcu_head, free_ctx); 1273 + } else { 1274 + smp_mb__after_atomic(); /* pairs with wait_var_event() */ 1275 + if (ctx->task == TASK_TOMBSTONE) 1276 + wake_up_var(&ctx->refcount); 1273 1277 } 1274 1278 } 1275 1279 ··· 2171 2167 * If the event is an aux_event, tear down all links to 2172 2168 * it from other events. 2173 2169 */ 2174 - for_each_sibling_event(iter, event->group_leader) { 2170 + for_each_sibling_event(iter, event) { 2175 2171 if (iter->aux_event != event) 2176 2172 continue; 2177 2173 ··· 2329 2325 if (WARN_ON_ONCE(!parent_event)) 2330 2326 return; 2331 2327 2328 + /* 2329 + * Can't check this from an IPI, the holder is likey another CPU. 2330 + * 2332 2331 lockdep_assert_held(&parent_event->child_mutex); 2332 + */ 2333 2333 2334 2334 sync_child_event(event); 2335 2335 list_del_init(&event->child_list); ··· 2349 2341 { 2350 2342 return (event->cpu == -1 || event->cpu == smp_processor_id()) && 2351 2343 perf_cgroup_match(event); 2344 + } 2345 + 2346 + static inline bool is_event_in_freq_mode(struct perf_event *event) 2347 + { 2348 + return event->attr.freq && event->attr.sample_freq; 2352 2349 } 2353 2350 2354 2351 static void ··· 2393 2380 2394 2381 if (!is_software_event(event)) 2395 2382 cpc->active_oncpu--; 2396 - if (event->attr.freq && event->attr.sample_freq) { 2383 + if (is_event_in_freq_mode(event)) { 2397 2384 ctx->nr_freq--; 2398 2385 epc->nr_freq--; 2399 2386 } ··· 2463 2450 2464 2451 #define DETACH_GROUP 0x01UL 2465 2452 #define DETACH_CHILD 0x02UL 2466 - #define DETACH_DEAD 0x04UL 2467 - #define DETACH_EXIT 0x08UL 2453 + #define DETACH_EXIT 0x04UL 2454 + #define DETACH_REVOKE 0x08UL 2455 + #define DETACH_DEAD 0x10UL 2468 2456 2469 2457 /* 2470 2458 * Cross CPU call to remove a performance event ··· 2491 2477 */ 2492 2478 if (flags & DETACH_EXIT) 2493 2479 state = PERF_EVENT_STATE_EXIT; 2480 + if (flags & DETACH_REVOKE) 
2481 + state = PERF_EVENT_STATE_REVOKED; 2494 2482 if (flags & DETACH_DEAD) { 2495 2483 event->pending_disable = 1; 2496 2484 state = PERF_EVENT_STATE_DEAD; 2497 2485 } 2498 2486 event_sched_out(event, ctx); 2499 2487 perf_event_set_state(event, min(event->state, state)); 2488 + 2500 2489 if (flags & DETACH_GROUP) 2501 2490 perf_group_detach(event); 2502 2491 if (flags & DETACH_CHILD) ··· 2645 2628 static void perf_log_throttle(struct perf_event *event, int enable); 2646 2629 static void perf_log_itrace_start(struct perf_event *event); 2647 2630 2631 + static void perf_event_unthrottle(struct perf_event *event, bool start) 2632 + { 2633 + event->hw.interrupts = 0; 2634 + if (start) 2635 + event->pmu->start(event, 0); 2636 + if (event == event->group_leader) 2637 + perf_log_throttle(event, 1); 2638 + } 2639 + 2640 + static void perf_event_throttle(struct perf_event *event) 2641 + { 2642 + event->pmu->stop(event, 0); 2643 + event->hw.interrupts = MAX_INTERRUPTS; 2644 + if (event == event->group_leader) 2645 + perf_log_throttle(event, 0); 2646 + } 2647 + 2648 + static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event) 2649 + { 2650 + struct perf_event *sibling, *leader = event->group_leader; 2651 + 2652 + perf_event_unthrottle(leader, skip_start_event ? leader != event : true); 2653 + for_each_sibling_event(sibling, leader) 2654 + perf_event_unthrottle(sibling, skip_start_event ? 
sibling != event : true); 2655 + } 2656 + 2657 + static void perf_event_throttle_group(struct perf_event *event) 2658 + { 2659 + struct perf_event *sibling, *leader = event->group_leader; 2660 + 2661 + perf_event_throttle(leader); 2662 + for_each_sibling_event(sibling, leader) 2663 + perf_event_throttle(sibling); 2664 + } 2665 + 2648 2666 static int 2649 2667 event_sched_in(struct perf_event *event, struct perf_event_context *ctx) 2650 2668 { ··· 2708 2656 * ticks already, also for a heavily scheduling task there is little 2709 2657 * guarantee it'll get a tick in a timely manner. 2710 2658 */ 2711 - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { 2712 - perf_log_throttle(event, 1); 2713 - event->hw.interrupts = 0; 2714 - } 2659 + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) 2660 + perf_event_unthrottle(event, false); 2715 2661 2716 2662 perf_pmu_disable(event->pmu); 2717 2663 ··· 2724 2674 2725 2675 if (!is_software_event(event)) 2726 2676 cpc->active_oncpu++; 2727 - if (event->attr.freq && event->attr.sample_freq) { 2677 + if (is_event_in_freq_mode(event)) { 2728 2678 ctx->nr_freq++; 2729 2679 epc->nr_freq++; 2730 2680 } ··· 4287 4237 4288 4238 hwc = &event->hw; 4289 4239 4290 - if (hwc->interrupts == MAX_INTERRUPTS) { 4291 - hwc->interrupts = 0; 4292 - perf_log_throttle(event, 1); 4293 - if (!event->attr.freq || !event->attr.sample_freq) 4294 - event->pmu->start(event, 0); 4295 - } 4240 + if (hwc->interrupts == MAX_INTERRUPTS) 4241 + perf_event_unthrottle_group(event, is_event_in_freq_mode(event)); 4296 4242 4297 - if (!event->attr.freq || !event->attr.sample_freq) 4243 + if (!is_event_in_freq_mode(event)) 4298 4244 continue; 4299 4245 4300 4246 /* ··· 4562 4516 4563 4517 static void perf_remove_from_owner(struct perf_event *event); 4564 4518 static void perf_event_exit_event(struct perf_event *event, 4565 - struct perf_event_context *ctx); 4519 + struct perf_event_context *ctx, 4520 + bool revoke); 4566 4521 4567 4522 /* 4568 4523 * Removes all 
events from the current task that have been marked ··· 4590 4543 4591 4544 modified = true; 4592 4545 4593 - perf_event_exit_event(event, ctx); 4546 + perf_event_exit_event(event, ctx, false); 4594 4547 } 4595 4548 4596 4549 raw_spin_lock_irqsave(&ctx->lock, flags); ··· 5172 5125 attr->context_switch || attr->text_poke || 5173 5126 attr->bpf_event) 5174 5127 return true; 5128 + 5175 5129 return false; 5176 5130 } 5177 5131 ··· 5569 5521 /* vs perf_event_alloc() error */ 5570 5522 static void __free_event(struct perf_event *event) 5571 5523 { 5524 + struct pmu *pmu = event->pmu; 5525 + 5572 5526 if (event->attach_state & PERF_ATTACH_CALLCHAIN) 5573 5527 put_callchain_buffers(); 5574 5528 ··· 5600 5550 * put_pmu_ctx() needs an event->ctx reference, because of 5601 5551 * epc->ctx. 5602 5552 */ 5553 + WARN_ON_ONCE(!pmu); 5603 5554 WARN_ON_ONCE(!event->ctx); 5604 5555 WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); 5605 5556 put_pmu_ctx(event->pmu_ctx); ··· 5613 5562 if (event->ctx) 5614 5563 put_ctx(event->ctx); 5615 5564 5616 - if (event->pmu) 5617 - module_put(event->pmu->module); 5565 + if (pmu) { 5566 + module_put(pmu->module); 5567 + scoped_guard (spinlock, &pmu->events_lock) { 5568 + list_del(&event->pmu_list); 5569 + wake_up_var(pmu); 5570 + } 5571 + } 5618 5572 5619 5573 call_rcu(&event->rcu_head, free_event_rcu); 5620 5574 } ··· 5656 5600 5657 5601 /* 5658 5602 * Used to free events which have a known refcount of 1, such as in error paths 5659 - * where the event isn't exposed yet and inherited events. 5603 + * of inherited events. 
5660 5604 */ 5661 5605 static void free_event(struct perf_event *event) 5662 5606 { 5663 5607 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, 5664 - "unexpected event refcount: %ld; ptr=%p\n", 5665 - atomic_long_read(&event->refcount), event)) { 5608 + "unexpected event refcount: %ld; ptr=%p\n", 5609 + atomic_long_read(&event->refcount), event)) { 5666 5610 /* leak to avoid use-after-free */ 5667 5611 return; 5668 5612 } ··· 5745 5689 { 5746 5690 struct perf_event_context *ctx = event->ctx; 5747 5691 struct perf_event *child, *tmp; 5748 - LIST_HEAD(free_list); 5749 5692 5750 5693 /* 5751 5694 * If we got here through err_alloc: free_event(event); we will not ··· 5773 5718 * Thus this guarantees that we will in fact observe and kill _ALL_ 5774 5719 * child events. 5775 5720 */ 5776 - perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); 5721 + if (event->state > PERF_EVENT_STATE_REVOKED) { 5722 + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); 5723 + } else { 5724 + event->state = PERF_EVENT_STATE_DEAD; 5725 + } 5777 5726 5778 5727 perf_event_ctx_unlock(event, ctx); 5779 5728 5780 5729 again: 5781 5730 mutex_lock(&event->child_mutex); 5782 5731 list_for_each_entry(child, &event->child_list, child_list) { 5783 - void *var = NULL; 5784 - 5785 5732 /* 5786 5733 * Cannot change, child events are not migrated, see the 5787 5734 * comment with perf_event_ctx_lock_nested(). 
··· 5816 5759 tmp = list_first_entry_or_null(&event->child_list, 5817 5760 struct perf_event, child_list); 5818 5761 if (tmp == child) { 5819 - perf_remove_from_context(child, DETACH_GROUP); 5820 - list_move(&child->child_list, &free_list); 5762 + perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD); 5821 5763 } else { 5822 - var = &ctx->refcount; 5764 + child = NULL; 5823 5765 } 5824 5766 5825 5767 mutex_unlock(&event->child_mutex); 5826 5768 mutex_unlock(&ctx->mutex); 5769 + 5770 + if (child) { 5771 + /* Last reference unless ->pending_task work is pending */ 5772 + put_event(child); 5773 + } 5827 5774 put_ctx(ctx); 5828 5775 5829 - if (var) { 5830 - /* 5831 - * If perf_event_free_task() has deleted all events from the 5832 - * ctx while the child_mutex got released above, make sure to 5833 - * notify about the preceding put_ctx(). 5834 - */ 5835 - smp_mb(); /* pairs with wait_var_event() */ 5836 - wake_up_var(var); 5837 - } 5838 5776 goto again; 5839 5777 } 5840 5778 mutex_unlock(&event->child_mutex); 5841 - 5842 - list_for_each_entry_safe(child, tmp, &free_list, child_list) { 5843 - void *var = &child->ctx->refcount; 5844 - 5845 - list_del(&child->child_list); 5846 - /* Last reference unless ->pending_task work is pending */ 5847 - put_event(child); 5848 - 5849 - /* 5850 - * Wake any perf_event_free_task() waiting for this event to be 5851 - * freed. 
5852 - */ 5853 - smp_mb(); /* pairs with wait_var_event() */ 5854 - wake_up_var(var); 5855 - } 5856 5779 5857 5780 no_ctx: 5858 5781 /* ··· 6105 6068 struct perf_buffer *rb; 6106 6069 __poll_t events = EPOLLHUP; 6107 6070 6071 + if (event->state <= PERF_EVENT_STATE_REVOKED) 6072 + return EPOLLERR; 6073 + 6108 6074 poll_wait(file, &event->waitq, wait); 6075 + 6076 + if (event->state <= PERF_EVENT_STATE_REVOKED) 6077 + return EPOLLERR; 6109 6078 6110 6079 if (is_event_hup(event)) 6111 6080 return events; ··· 6210 6167 active = (event->state == PERF_EVENT_STATE_ACTIVE); 6211 6168 if (active) { 6212 6169 perf_pmu_disable(event->pmu); 6213 - /* 6214 - * We could be throttled; unthrottle now to avoid the tick 6215 - * trying to unthrottle while we already re-started the event. 6216 - */ 6217 - if (event->hw.interrupts == MAX_INTERRUPTS) { 6218 - event->hw.interrupts = 0; 6219 - perf_log_throttle(event, 1); 6220 - } 6221 6170 event->pmu->stop(event, PERF_EF_UPDATE); 6222 6171 } 6223 6172 ··· 6217 6182 6218 6183 if (active) { 6219 6184 event->pmu->start(event, PERF_EF_RELOAD); 6185 + /* 6186 + * Once the period is force-reset, the event starts immediately. 6187 + * But the event/group could be throttled. Unthrottle the 6188 + * event/group now to avoid the next tick trying to unthrottle 6189 + * while we already re-started the event/group. 
6190 + */ 6191 + if (event->hw.interrupts == MAX_INTERRUPTS) 6192 + perf_event_unthrottle_group(event, true); 6220 6193 perf_pmu_enable(event->pmu); 6221 6194 } 6222 6195 } ··· 6282 6239 static int perf_event_set_filter(struct perf_event *event, void __user *arg); 6283 6240 static int perf_copy_attr(struct perf_event_attr __user *uattr, 6284 6241 struct perf_event_attr *attr); 6242 + static int __perf_event_set_bpf_prog(struct perf_event *event, 6243 + struct bpf_prog *prog, 6244 + u64 bpf_cookie); 6285 6245 6286 6246 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) 6287 6247 { 6288 6248 void (*func)(struct perf_event *); 6289 6249 u32 flags = arg; 6250 + 6251 + if (event->state <= PERF_EVENT_STATE_REVOKED) 6252 + return -ENODEV; 6290 6253 6291 6254 switch (cmd) { 6292 6255 case PERF_EVENT_IOC_ENABLE: ··· 6350 6301 if (IS_ERR(prog)) 6351 6302 return PTR_ERR(prog); 6352 6303 6353 - err = perf_event_set_bpf_prog(event, prog, 0); 6304 + err = __perf_event_set_bpf_prog(event, prog, 0); 6354 6305 if (err) { 6355 6306 bpf_prog_put(prog); 6356 6307 return err; ··· 6669 6620 call_rcu(&rb->rcu_head, rb_free_rcu); 6670 6621 } 6671 6622 6623 + typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm); 6624 + 6625 + #define get_mapped(event, func) \ 6626 + ({ struct pmu *pmu; \ 6627 + mapped_f f = NULL; \ 6628 + guard(rcu)(); \ 6629 + pmu = READ_ONCE(event->pmu); \ 6630 + if (pmu) \ 6631 + f = pmu->func; \ 6632 + f; \ 6633 + }) 6634 + 6672 6635 static void perf_mmap_open(struct vm_area_struct *vma) 6673 6636 { 6674 6637 struct perf_event *event = vma->vm_file->private_data; 6638 + mapped_f mapped = get_mapped(event, event_mapped); 6675 6639 6676 6640 atomic_inc(&event->mmap_count); 6677 6641 atomic_inc(&event->rb->mmap_count); ··· 6692 6630 if (vma->vm_pgoff) 6693 6631 atomic_inc(&event->rb->aux_mmap_count); 6694 6632 6695 - if (event->pmu->event_mapped) 6696 - event->pmu->event_mapped(event, vma->vm_mm); 6633 + if (mapped) 
6634 + mapped(event, vma->vm_mm); 6697 6635 } 6698 6636 6699 6637 static void perf_pmu_output_stop(struct perf_event *event); ··· 6709 6647 static void perf_mmap_close(struct vm_area_struct *vma) 6710 6648 { 6711 6649 struct perf_event *event = vma->vm_file->private_data; 6650 + mapped_f unmapped = get_mapped(event, event_unmapped); 6712 6651 struct perf_buffer *rb = ring_buffer_get(event); 6713 6652 struct user_struct *mmap_user = rb->mmap_user; 6714 6653 int mmap_locked = rb->mmap_locked; 6715 6654 unsigned long size = perf_data_size(rb); 6716 6655 bool detach_rest = false; 6717 6656 6718 - if (event->pmu->event_unmapped) 6719 - event->pmu->event_unmapped(event, vma->vm_mm); 6657 + /* FIXIES vs perf_pmu_unregister() */ 6658 + if (unmapped) 6659 + unmapped(event, vma->vm_mm); 6720 6660 6721 6661 /* 6722 6662 * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex ··· 6911 6847 unsigned long nr_pages; 6912 6848 long user_extra = 0, extra = 0; 6913 6849 int ret, flags = 0; 6850 + mapped_f mapped; 6914 6851 6915 6852 /* 6916 6853 * Don't allow mmap() of inherited per-task counters. This would ··· 6941 6876 6942 6877 mutex_lock(&event->mmap_mutex); 6943 6878 ret = -EINVAL; 6879 + 6880 + /* 6881 + * This relies on __pmu_detach_event() taking mmap_mutex after marking 6882 + * the event REVOKED. Either we observe the state, or __pmu_detach_event() 6883 + * will detach the rb created here. 
6884 + */ 6885 + if (event->state <= PERF_EVENT_STATE_REVOKED) { 6886 + ret = -ENODEV; 6887 + goto unlock; 6888 + } 6944 6889 6945 6890 if (vma->vm_pgoff == 0) { 6946 6891 nr_pages -= 1; ··· 7130 7055 if (!ret) 7131 7056 ret = map_range(rb, vma); 7132 7057 7133 - if (!ret && event->pmu->event_mapped) 7134 - event->pmu->event_mapped(event, vma->vm_mm); 7058 + mapped = get_mapped(event, event_mapped); 7059 + if (mapped) 7060 + mapped(event, vma->vm_mm); 7135 7061 7136 7062 return ret; 7137 7063 } ··· 7142 7066 struct inode *inode = file_inode(filp); 7143 7067 struct perf_event *event = filp->private_data; 7144 7068 int retval; 7069 + 7070 + if (event->state <= PERF_EVENT_STATE_REVOKED) 7071 + return -ENODEV; 7145 7072 7146 7073 inode_lock(inode); 7147 7074 retval = fasync_helper(fd, filp, on, &event->fasync); ··· 10025 9946 10026 9947 void perf_event_itrace_started(struct perf_event *event) 10027 9948 { 10028 - event->attach_state |= PERF_ATTACH_ITRACE; 9949 + WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE); 10029 9950 } 10030 9951 10031 9952 static void perf_log_itrace_start(struct perf_event *event) ··· 10108 10029 hwc->interrupts = 1; 10109 10030 } else { 10110 10031 hwc->interrupts++; 10111 - if (unlikely(throttle && 10112 - hwc->interrupts > max_samples_per_tick)) { 10113 - __this_cpu_inc(perf_throttled_count); 10114 - tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); 10115 - hwc->interrupts = MAX_INTERRUPTS; 10116 - perf_log_throttle(event, 0); 10117 - ret = 1; 10118 - } 10032 + } 10033 + 10034 + if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { 10035 + __this_cpu_inc(perf_throttled_count); 10036 + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); 10037 + perf_event_throttle_group(event); 10038 + ret = 1; 10119 10039 } 10120 10040 10121 10041 if (event->attr.freq) { ··· 11147 11069 return false; 11148 11070 } 11149 11071 11150 - int perf_event_set_bpf_prog(struct perf_event *event, struct 
bpf_prog *prog, 11151 - u64 bpf_cookie) 11072 + static int __perf_event_set_bpf_prog(struct perf_event *event, 11073 + struct bpf_prog *prog, 11074 + u64 bpf_cookie) 11152 11075 { 11153 11076 bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; 11077 + 11078 + if (event->state <= PERF_EVENT_STATE_REVOKED) 11079 + return -ENODEV; 11154 11080 11155 11081 if (!perf_event_is_tracing(event)) 11156 11082 return perf_event_set_bpf_handler(event, prog, bpf_cookie); ··· 11190 11108 return perf_event_attach_bpf_prog(event, prog, bpf_cookie); 11191 11109 } 11192 11110 11111 + int perf_event_set_bpf_prog(struct perf_event *event, 11112 + struct bpf_prog *prog, 11113 + u64 bpf_cookie) 11114 + { 11115 + struct perf_event_context *ctx; 11116 + int ret; 11117 + 11118 + ctx = perf_event_ctx_lock(event); 11119 + ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie); 11120 + perf_event_ctx_unlock(event, ctx); 11121 + 11122 + return ret; 11123 + } 11124 + 11193 11125 void perf_event_free_bpf_prog(struct perf_event *event) 11194 11126 { 11195 11127 if (!event->prog) ··· 11226 11130 { 11227 11131 } 11228 11132 11229 - int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, 11133 + static int __perf_event_set_bpf_prog(struct perf_event *event, 11134 + struct bpf_prog *prog, 11135 + u64 bpf_cookie) 11136 + { 11137 + return -ENOENT; 11138 + } 11139 + 11140 + int perf_event_set_bpf_prog(struct perf_event *event, 11141 + struct bpf_prog *prog, 11230 11142 u64 bpf_cookie) 11231 11143 { 11232 11144 return -ENOENT; ··· 12339 12235 if (!pmu->event_idx) 12340 12236 pmu->event_idx = perf_event_idx_default; 12341 12237 12238 + INIT_LIST_HEAD(&pmu->events); 12239 + spin_lock_init(&pmu->events_lock); 12240 + 12342 12241 /* 12343 12242 * Now that the PMU is complete, make it visible to perf_try_init_event(). 
12344 12243 */ ··· 12355 12248 } 12356 12249 EXPORT_SYMBOL_GPL(perf_pmu_register); 12357 12250 12358 - void perf_pmu_unregister(struct pmu *pmu) 12251 + static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, 12252 + struct perf_event_context *ctx) 12253 + { 12254 + /* 12255 + * De-schedule the event and mark it REVOKED. 12256 + */ 12257 + perf_event_exit_event(event, ctx, true); 12258 + 12259 + /* 12260 + * All _free_event() bits that rely on event->pmu: 12261 + * 12262 + * Notably, perf_mmap() relies on the ordering here. 12263 + */ 12264 + scoped_guard (mutex, &event->mmap_mutex) { 12265 + WARN_ON_ONCE(pmu->event_unmapped); 12266 + /* 12267 + * Mostly an empty lock sequence, such that perf_mmap(), which 12268 + * relies on mmap_mutex, is sure to observe the state change. 12269 + */ 12270 + } 12271 + 12272 + perf_event_free_bpf_prog(event); 12273 + perf_free_addr_filters(event); 12274 + 12275 + if (event->destroy) { 12276 + event->destroy(event); 12277 + event->destroy = NULL; 12278 + } 12279 + 12280 + if (event->pmu_ctx) { 12281 + put_pmu_ctx(event->pmu_ctx); 12282 + event->pmu_ctx = NULL; 12283 + } 12284 + 12285 + exclusive_event_destroy(event); 12286 + module_put(pmu->module); 12287 + 12288 + event->pmu = NULL; /* force fault instead of UAF */ 12289 + } 12290 + 12291 + static void pmu_detach_event(struct pmu *pmu, struct perf_event *event) 12292 + { 12293 + struct perf_event_context *ctx; 12294 + 12295 + ctx = perf_event_ctx_lock(event); 12296 + __pmu_detach_event(pmu, event, ctx); 12297 + perf_event_ctx_unlock(event, ctx); 12298 + 12299 + scoped_guard (spinlock, &pmu->events_lock) 12300 + list_del(&event->pmu_list); 12301 + } 12302 + 12303 + static struct perf_event *pmu_get_event(struct pmu *pmu) 12304 + { 12305 + struct perf_event *event; 12306 + 12307 + guard(spinlock)(&pmu->events_lock); 12308 + list_for_each_entry(event, &pmu->events, pmu_list) { 12309 + if (atomic_long_inc_not_zero(&event->refcount)) 12310 + return event; 12311 + } 
12312 + 12313 + return NULL; 12314 + } 12315 + 12316 + static bool pmu_empty(struct pmu *pmu) 12317 + { 12318 + guard(spinlock)(&pmu->events_lock); 12319 + return list_empty(&pmu->events); 12320 + } 12321 + 12322 + static void pmu_detach_events(struct pmu *pmu) 12323 + { 12324 + struct perf_event *event; 12325 + 12326 + for (;;) { 12327 + event = pmu_get_event(pmu); 12328 + if (!event) 12329 + break; 12330 + 12331 + pmu_detach_event(pmu, event); 12332 + put_event(event); 12333 + } 12334 + 12335 + /* 12336 + * wait for pending _free_event()s 12337 + */ 12338 + wait_var_event(pmu, pmu_empty(pmu)); 12339 + } 12340 + 12341 + int perf_pmu_unregister(struct pmu *pmu) 12359 12342 { 12360 12343 scoped_guard (mutex, &pmus_lock) { 12344 + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) 12345 + return -EINVAL; 12346 + 12361 12347 list_del_rcu(&pmu->entry); 12362 - idr_remove(&pmu_idr, pmu->type); 12363 12348 } 12364 12349 12365 12350 /* 12366 12351 * We dereference the pmu list under both SRCU and regular RCU, so 12367 12352 * synchronize against both of those. 12353 + * 12354 + * Notably, the entirety of event creation, from perf_init_event() 12355 + * (which will now fail, because of the above) until 12356 + * perf_install_in_context() should be under SRCU such that 12357 + * this synchronizes against event creation. This avoids trying to 12358 + * detach events that are not fully formed. 12368 12359 */ 12369 12360 synchronize_srcu(&pmus_srcu); 12370 12361 synchronize_rcu(); 12371 12362 12363 + if (pmu->event_unmapped && !pmu_empty(pmu)) { 12364 + /* 12365 + * Can't force remove events when pmu::event_unmapped() 12366 + * is used in perf_mmap_close(). 
12367 + */ 12368 + guard(mutex)(&pmus_lock); 12369 + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu); 12370 + list_add_rcu(&pmu->entry, &pmus); 12371 + return -EBUSY; 12372 + } 12373 + 12374 + scoped_guard (mutex, &pmus_lock) 12375 + idr_remove(&pmu_idr, pmu->type); 12376 + 12377 + /* 12378 + * PMU is removed from the pmus list, so no new events will 12379 + * be created, now take care of the existing ones. 12380 + */ 12381 + pmu_detach_events(pmu); 12382 + 12383 + /* 12384 + * PMU is unused, make it go away. 12385 + */ 12372 12386 perf_pmu_free(pmu); 12387 + return 0; 12373 12388 } 12374 12389 EXPORT_SYMBOL_GPL(perf_pmu_unregister); 12375 12390 ··· 12585 12356 struct pmu *pmu; 12586 12357 int type, ret; 12587 12358 12588 - guard(srcu)(&pmus_srcu); 12359 + guard(srcu)(&pmus_srcu); /* pmu idr/list access */ 12589 12360 12590 12361 /* 12591 12362 * Save original type before calling pmu->event_init() since certain ··· 12809 12580 INIT_LIST_HEAD(&event->active_entry); 12810 12581 INIT_LIST_HEAD(&event->addr_filters.list); 12811 12582 INIT_HLIST_NODE(&event->hlist_entry); 12583 + INIT_LIST_HEAD(&event->pmu_list); 12812 12584 12813 12585 12814 12586 init_waitqueue_head(&event->waitq); ··· 12881 12651 12882 12652 hwc = &event->hw; 12883 12653 hwc->sample_period = attr->sample_period; 12884 - if (attr->freq && attr->sample_freq) 12654 + if (is_event_in_freq_mode(event)) 12885 12655 hwc->sample_period = 1; 12886 12656 hwc->last_period = hwc->sample_period; 12887 12657 ··· 12987 12757 12988 12758 /* symmetric to unaccount_event() in _free_event() */ 12989 12759 account_event(event); 12760 + 12761 + /* 12762 + * Event creation should be under SRCU, see perf_pmu_unregister(). 
12763 + */ 12764 + lockdep_assert_held(&pmus_srcu); 12765 + scoped_guard (spinlock, &pmu->events_lock) 12766 + list_add(&event->pmu_list, &pmu->events); 12990 12767 12991 12768 return_ptr(event); 12992 12769 } ··· 13194 12957 goto unlock; 13195 12958 13196 12959 if (output_event) { 12960 + if (output_event->state <= PERF_EVENT_STATE_REVOKED) 12961 + goto unlock; 12962 + 13197 12963 /* get the rb we want to redirect to */ 13198 12964 rb = ring_buffer_get(output_event); 13199 12965 if (!rb) ··· 13378 13138 if (event_fd < 0) 13379 13139 return event_fd; 13380 13140 13141 + /* 13142 + * Event creation should be under SRCU, see perf_pmu_unregister(). 13143 + */ 13144 + guard(srcu)(&pmus_srcu); 13145 + 13381 13146 CLASS(fd, group)(group_fd); // group_fd == -1 => empty 13382 13147 if (group_fd != -1) { 13383 13148 if (!is_perf_file(group)) { ··· 13390 13145 goto err_fd; 13391 13146 } 13392 13147 group_leader = fd_file(group)->private_data; 13148 + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) { 13149 + err = -ENODEV; 13150 + goto err_fd; 13151 + } 13393 13152 if (flags & PERF_FLAG_FD_OUTPUT) 13394 13153 output_event = group_leader; 13395 13154 if (flags & PERF_FLAG_FD_NO_GROUP) ··· 13690 13441 if (task) 13691 13442 up_read(&task->signal->exec_update_lock); 13692 13443 err_alloc: 13693 - free_event(event); 13444 + put_event(event); 13694 13445 err_task: 13695 13446 if (task) 13696 13447 put_task_struct(task); ··· 13726 13477 */ 13727 13478 if (attr->aux_output || attr->aux_action) 13728 13479 return ERR_PTR(-EINVAL); 13480 + 13481 + /* 13482 + * Event creation should be under SRCU, see perf_pmu_unregister(). 
13483 + */ 13484 + guard(srcu)(&pmus_srcu); 13729 13485 13730 13486 event = perf_event_alloc(attr, cpu, task, NULL, NULL, 13731 13487 overflow_handler, context, -1); ··· 13803 13549 perf_unpin_context(ctx); 13804 13550 put_ctx(ctx); 13805 13551 err_alloc: 13806 - free_event(event); 13552 + put_event(event); 13807 13553 err: 13808 13554 return ERR_PTR(err); 13809 13555 } ··· 13943 13689 } 13944 13690 13945 13691 static void 13946 - perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) 13692 + perf_event_exit_event(struct perf_event *event, 13693 + struct perf_event_context *ctx, bool revoke) 13947 13694 { 13948 13695 struct perf_event *parent_event = event->parent; 13949 - unsigned long detach_flags = 0; 13696 + unsigned long detach_flags = DETACH_EXIT; 13697 + unsigned int attach_state; 13950 13698 13951 13699 if (parent_event) { 13952 13700 /* ··· 13963 13707 * Do destroy all inherited groups, we don't care about those 13964 13708 * and being thorough is better. 13965 13709 */ 13966 - detach_flags = DETACH_GROUP | DETACH_CHILD; 13710 + detach_flags |= DETACH_GROUP | DETACH_CHILD; 13967 13711 mutex_lock(&parent_event->child_mutex); 13712 + /* PERF_ATTACH_ITRACE might be set concurrently */ 13713 + attach_state = READ_ONCE(event->attach_state); 13968 13714 } 13969 13715 13970 - perf_remove_from_context(event, detach_flags | DETACH_EXIT); 13716 + if (revoke) 13717 + detach_flags |= DETACH_GROUP | DETACH_REVOKE; 13971 13718 13719 + perf_remove_from_context(event, detach_flags); 13972 13720 /* 13973 13721 * Child events can be freed. 13974 13722 */ 13975 13723 if (parent_event) { 13976 13724 mutex_unlock(&parent_event->child_mutex); 13725 + 13977 13726 /* 13978 - * Kick perf_poll() for is_event_hup(); 13727 + * Match the refcount initialization. Make sure it doesn't happen 13728 + * twice if pmu_detach_event() calls it on an already exited task. 
13979 13729 */ 13980 - perf_event_wakeup(parent_event); 13981 - put_event(event); 13730 + if (attach_state & PERF_ATTACH_CHILD) { 13731 + /* 13732 + * Kick perf_poll() for is_event_hup(); 13733 + */ 13734 + perf_event_wakeup(parent_event); 13735 + /* 13736 + * pmu_detach_event() will have an extra refcount. 13737 + * perf_pending_task() might have one too. 13738 + */ 13739 + put_event(event); 13740 + } 13741 + 13982 13742 return; 13983 13743 } 13984 13744 ··· 14004 13732 perf_event_wakeup(event); 14005 13733 } 14006 13734 14007 - static void perf_event_exit_task_context(struct task_struct *child) 13735 + static void perf_event_exit_task_context(struct task_struct *task, bool exit) 14008 13736 { 14009 - struct perf_event_context *child_ctx, *clone_ctx = NULL; 13737 + struct perf_event_context *ctx, *clone_ctx = NULL; 14010 13738 struct perf_event *child_event, *next; 14011 13739 14012 - WARN_ON_ONCE(child != current); 14013 - 14014 - child_ctx = perf_pin_task_context(child); 14015 - if (!child_ctx) 13740 + ctx = perf_pin_task_context(task); 13741 + if (!ctx) 14016 13742 return; 14017 13743 14018 13744 /* ··· 14023 13753 * without ctx::mutex (it cannot because of the move_group double mutex 14024 13754 * lock thing). See the comments in perf_install_in_context(). 14025 13755 */ 14026 - mutex_lock(&child_ctx->mutex); 13756 + mutex_lock(&ctx->mutex); 14027 13757 14028 13758 /* 14029 13759 * In a single ctx::lock section, de-schedule the events and detach the 14030 13760 * context from the task such that we cannot ever get it scheduled back 14031 13761 * in. 14032 13762 */ 14033 - raw_spin_lock_irq(&child_ctx->lock); 14034 - task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); 13763 + raw_spin_lock_irq(&ctx->lock); 13764 + if (exit) 13765 + task_ctx_sched_out(ctx, NULL, EVENT_ALL); 14035 13766 14036 13767 /* 14037 13768 * Now that the context is inactive, destroy the task <-> ctx relation 14038 13769 * and mark the context dead. 
14039 13770 */ 14040 - RCU_INIT_POINTER(child->perf_event_ctxp, NULL); 14041 - put_ctx(child_ctx); /* cannot be last */ 14042 - WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); 14043 - put_task_struct(current); /* cannot be last */ 13771 + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); 13772 + put_ctx(ctx); /* cannot be last */ 13773 + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); 13774 + put_task_struct(task); /* cannot be last */ 14044 13775 14045 - clone_ctx = unclone_ctx(child_ctx); 14046 - raw_spin_unlock_irq(&child_ctx->lock); 13776 + clone_ctx = unclone_ctx(ctx); 13777 + raw_spin_unlock_irq(&ctx->lock); 14047 13778 14048 13779 if (clone_ctx) 14049 13780 put_ctx(clone_ctx); ··· 14054 13783 * won't get any samples after PERF_RECORD_EXIT. We can however still 14055 13784 * get a few PERF_RECORD_READ events. 14056 13785 */ 14057 - perf_event_task(child, child_ctx, 0); 13786 + if (exit) 13787 + perf_event_task(task, ctx, 0); 14058 13788 14059 - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) 14060 - perf_event_exit_event(child_event, child_ctx); 13789 + list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) 13790 + perf_event_exit_event(child_event, ctx, false); 14061 13791 14062 - mutex_unlock(&child_ctx->mutex); 13792 + mutex_unlock(&ctx->mutex); 14063 13793 14064 - put_ctx(child_ctx); 13794 + if (!exit) { 13795 + /* 13796 + * perf_event_release_kernel() could still have a reference on 13797 + * this context. In that case we must wait for these events to 13798 + * have been freed (in particular all their references to this 13799 + * task must've been dropped). 13800 + * 13801 + * Without this copy_process() will unconditionally free this 13802 + * task (irrespective of its reference count) and 13803 + * _free_event()'s put_task_struct(event->hw.target) will be a 13804 + * use-after-free. 13805 + * 13806 + * Wait for all events to drop their context reference. 
13807 + */ 13808 + wait_var_event(&ctx->refcount, 13809 + refcount_read(&ctx->refcount) == 1); 13810 + } 13811 + put_ctx(ctx); 14065 13812 } 14066 13813 14067 13814 /* 14068 - * When a child task exits, feed back event values to parent events. 13815 + * When a task exits, feed back event values to parent events. 14069 13816 * 14070 13817 * Can be called with exec_update_lock held when called from 14071 13818 * setup_new_exec(). 14072 13819 */ 14073 - void perf_event_exit_task(struct task_struct *child) 13820 + void perf_event_exit_task(struct task_struct *task) 14074 13821 { 14075 13822 struct perf_event *event, *tmp; 14076 13823 14077 - mutex_lock(&child->perf_event_mutex); 14078 - list_for_each_entry_safe(event, tmp, &child->perf_event_list, 13824 + WARN_ON_ONCE(task != current); 13825 + 13826 + mutex_lock(&task->perf_event_mutex); 13827 + list_for_each_entry_safe(event, tmp, &task->perf_event_list, 14079 13828 owner_entry) { 14080 13829 list_del_init(&event->owner_entry); 14081 13830 ··· 14106 13815 */ 14107 13816 smp_store_release(&event->owner, NULL); 14108 13817 } 14109 - mutex_unlock(&child->perf_event_mutex); 13818 + mutex_unlock(&task->perf_event_mutex); 14110 13819 14111 - perf_event_exit_task_context(child); 13820 + perf_event_exit_task_context(task, true); 14112 13821 14113 13822 /* 14114 13823 * The perf_event_exit_task_context calls perf_event_task 14115 - * with child's task_ctx, which generates EXIT events for 14116 - * child contexts and sets child->perf_event_ctxp[] to NULL. 13824 + * with task's task_ctx, which generates EXIT events for 13825 + * task contexts and sets task->perf_event_ctxp[] to NULL. 14117 13826 * At this point we need to send EXIT events to cpu contexts. 14118 13827 */ 14119 - perf_event_task(child, NULL, 0); 13828 + perf_event_task(task, NULL, 0); 14120 13829 14121 13830 /* 14122 13831 * Detach the perf_ctx_data for the system-wide event. 
14123 13832 */ 14124 13833 guard(percpu_read)(&global_ctx_data_rwsem); 14125 - detach_task_ctx_data(child); 14126 - } 14127 - 14128 - static void perf_free_event(struct perf_event *event, 14129 - struct perf_event_context *ctx) 14130 - { 14131 - struct perf_event *parent = event->parent; 14132 - 14133 - if (WARN_ON_ONCE(!parent)) 14134 - return; 14135 - 14136 - mutex_lock(&parent->child_mutex); 14137 - list_del_init(&event->child_list); 14138 - mutex_unlock(&parent->child_mutex); 14139 - 14140 - raw_spin_lock_irq(&ctx->lock); 14141 - perf_group_detach(event); 14142 - list_del_event(event, ctx); 14143 - raw_spin_unlock_irq(&ctx->lock); 14144 - put_event(event); 13834 + detach_task_ctx_data(task); 14145 13835 } 14146 13836 14147 13837 /* ··· 14134 13862 */ 14135 13863 void perf_event_free_task(struct task_struct *task) 14136 13864 { 14137 - struct perf_event_context *ctx; 14138 - struct perf_event *event, *tmp; 14139 - 14140 - ctx = rcu_access_pointer(task->perf_event_ctxp); 14141 - if (!ctx) 14142 - return; 14143 - 14144 - mutex_lock(&ctx->mutex); 14145 - raw_spin_lock_irq(&ctx->lock); 14146 - /* 14147 - * Destroy the task <-> ctx relation and mark the context dead. 14148 - * 14149 - * This is important because even though the task hasn't been 14150 - * exposed yet the context has been (through child_list). 14151 - */ 14152 - RCU_INIT_POINTER(task->perf_event_ctxp, NULL); 14153 - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); 14154 - put_task_struct(task); /* cannot be last */ 14155 - raw_spin_unlock_irq(&ctx->lock); 14156 - 14157 - 14158 - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) 14159 - perf_free_event(event, ctx); 14160 - 14161 - mutex_unlock(&ctx->mutex); 14162 - 14163 - /* 14164 - * perf_event_release_kernel() could've stolen some of our 14165 - * child events and still have them on its free_list. 
In that 14166 - * case we must wait for these events to have been freed (in 14167 - * particular all their references to this task must've been 14168 - * dropped). 14169 - * 14170 - * Without this copy_process() will unconditionally free this 14171 - * task (irrespective of its reference count) and 14172 - * _free_event()'s put_task_struct(event->hw.target) will be a 14173 - * use-after-free. 14174 - * 14175 - * Wait for all events to drop their context reference. 14176 - */ 14177 - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); 14178 - put_ctx(ctx); /* must be last */ 13865 + perf_event_exit_task_context(task, false); 14179 13866 } 14180 13867 14181 13868 void perf_event_delayed_put(struct task_struct *task) ··· 14210 13979 */ 14211 13980 if (parent_event->parent) 14212 13981 parent_event = parent_event->parent; 13982 + 13983 + if (parent_event->state <= PERF_EVENT_STATE_REVOKED) 13984 + return NULL; 13985 + 13986 + /* 13987 + * Event creation should be under SRCU, see perf_pmu_unregister(). 13988 + */ 13989 + guard(srcu)(&pmus_srcu); 14213 13990 14214 13991 child_event = perf_event_alloc(&parent_event->attr, 14215 13992 parent_event->cpu,
+20 -9
kernel/events/ring_buffer.c
··· 679 679 { 680 680 bool overwrite = !(flags & RING_BUFFER_WRITABLE); 681 681 int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); 682 - int ret = -ENOMEM, max_order; 682 + bool use_contiguous_pages = event->pmu->capabilities & ( 683 + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE); 684 + /* 685 + * Initialize max_order to 0 for page allocation. This allocates single 686 + * pages to minimize memory fragmentation. This is overridden if the 687 + * PMU needs or prefers contiguous pages (use_contiguous_pages = true). 688 + */ 689 + int max_order = 0; 690 + int ret = -ENOMEM; 683 691 684 692 if (!has_aux(event)) 685 693 return -EOPNOTSUPP; ··· 697 689 698 690 if (!overwrite) { 699 691 /* 700 - * Watermark defaults to half the buffer, and so does the 701 - * max_order, to aid PMU drivers in double buffering. 692 + * Watermark defaults to half the buffer, to aid PMU drivers 693 + * in double buffering. 702 694 */ 703 695 if (!watermark) 704 696 watermark = min_t(unsigned long, ··· 706 698 (unsigned long)nr_pages << (PAGE_SHIFT - 1)); 707 699 708 700 /* 709 - * Use aux_watermark as the basis for chunking to 710 - * help PMU drivers honor the watermark. 701 + * If using contiguous pages, use aux_watermark as the basis 702 + * for chunking to help PMU drivers honor the watermark. 711 703 */ 712 - max_order = get_order(watermark); 704 + if (use_contiguous_pages) 705 + max_order = get_order(watermark); 713 706 } else { 714 707 /* 715 - * We need to start with the max_order that fits in nr_pages, 716 - * not the other way around, hence ilog2() and not get_order. 708 + * If using contiguous pages, we need to start with the 709 + * max_order that fits in nr_pages, not the other way around, 710 + * hence ilog2() and not get_order. 717 711 */ 718 - max_order = ilog2(nr_pages); 712 + if (use_contiguous_pages) 713 + max_order = ilog2(nr_pages); 719 714 watermark = 0; 720 715 } 721 716
+325 -314
tools/include/uapi/linux/perf_event.h
··· 39 39 40 40 /* 41 41 * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE 42 + * 42 43 * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA 43 44 * AA: hardware event ID 44 45 * EEEEEEEE: PMU type ID 46 + * 45 47 * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB 46 48 * BB: hardware cache ID 47 49 * CC: hardware cache op ID 48 50 * DD: hardware cache op result ID 49 51 * EEEEEEEE: PMU type ID 50 - * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. 52 + * 53 + * If the PMU type ID is 0, PERF_TYPE_RAW will be applied. 51 54 */ 52 - #define PERF_PMU_TYPE_SHIFT 32 53 - #define PERF_HW_EVENT_MASK 0xffffffff 55 + #define PERF_PMU_TYPE_SHIFT 32 56 + #define PERF_HW_EVENT_MASK 0xffffffff 54 57 55 58 /* 56 59 * Generalized performance event event_id types, used by the ··· 115 112 /* 116 113 * Special "software" events provided by the kernel, even if the hardware 117 114 * does not support performance events. These events measure various 118 - * physical and sw events of the kernel (and allow the profiling of them as 115 + * physical and SW events of the kernel (and allow the profiling of them as 119 116 * well): 120 117 */ 121 118 enum perf_sw_ids { ··· 170 167 }; 171 168 172 169 #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) 170 + 173 171 /* 174 - * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set 172 + * Values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set. 175 173 * 176 174 * If the user does not pass priv level information via branch_sample_type, 177 175 * the kernel uses the event's priv level. Branch and event priv levels do ··· 182 178 * of branches and therefore it supersedes all the other types. 
183 179 */ 184 180 enum perf_branch_sample_type_shift { 185 - PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ 186 - PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ 187 - PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ 181 + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ 182 + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ 183 + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ 188 184 189 - PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ 190 - PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ 191 - PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ 192 - PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ 193 - PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ 194 - PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ 195 - PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ 185 + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ 186 + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ 187 + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ 188 + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ 189 + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ 190 + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ 191 + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ 196 192 PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ 197 193 198 - PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ 194 + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* CALL/RET stack */ 199 195 PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ 200 196 PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ 201 197 ··· 214 210 }; 215 211 216 212 enum perf_branch_sample_type { 217 - PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, 218 - PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, 219 - PERF_SAMPLE_BRANCH_HV = 1U << 
PERF_SAMPLE_BRANCH_HV_SHIFT, 213 + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, 214 + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, 215 + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, 220 216 221 - PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, 222 - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, 223 - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, 224 - PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, 225 - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, 226 - PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, 227 - PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, 228 - PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, 217 + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, 218 + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, 219 + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, 220 + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, 221 + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, 222 + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, 223 + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, 224 + PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, 229 225 230 - PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, 231 - PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, 232 - PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, 226 + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, 227 + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, 228 + PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, 233 229 234 - PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, 235 - 
PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, 230 + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, 231 + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, 236 232 237 - PERF_SAMPLE_BRANCH_TYPE_SAVE = 238 - 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, 233 + PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, 239 234 240 - PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, 235 + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, 241 236 242 - PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, 237 + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, 243 238 244 - PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, 239 + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, 245 240 246 - PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, 241 + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, 247 242 }; 248 243 249 244 /* 250 - * Common flow change classification 245 + * Common control flow change classifications: 251 246 */ 252 247 enum { 253 - PERF_BR_UNKNOWN = 0, /* unknown */ 254 - PERF_BR_COND = 1, /* conditional */ 255 - PERF_BR_UNCOND = 2, /* unconditional */ 256 - PERF_BR_IND = 3, /* indirect */ 257 - PERF_BR_CALL = 4, /* function call */ 258 - PERF_BR_IND_CALL = 5, /* indirect function call */ 259 - PERF_BR_RET = 6, /* function return */ 260 - PERF_BR_SYSCALL = 7, /* syscall */ 261 - PERF_BR_SYSRET = 8, /* syscall return */ 262 - PERF_BR_COND_CALL = 9, /* conditional function call */ 263 - PERF_BR_COND_RET = 10, /* conditional function return */ 264 - PERF_BR_ERET = 11, /* exception return */ 265 - PERF_BR_IRQ = 12, /* irq */ 266 - PERF_BR_SERROR = 13, /* system error */ 267 - PERF_BR_NO_TX = 14, /* not in transaction */ 268 - PERF_BR_EXTEND_ABI = 15, /* extend ABI */ 248 + PERF_BR_UNKNOWN = 0, /* Unknown */ 249 + PERF_BR_COND = 1, 
/* Conditional */ 250 + PERF_BR_UNCOND = 2, /* Unconditional */ 251 + PERF_BR_IND = 3, /* Indirect */ 252 + PERF_BR_CALL = 4, /* Function call */ 253 + PERF_BR_IND_CALL = 5, /* Indirect function call */ 254 + PERF_BR_RET = 6, /* Function return */ 255 + PERF_BR_SYSCALL = 7, /* Syscall */ 256 + PERF_BR_SYSRET = 8, /* Syscall return */ 257 + PERF_BR_COND_CALL = 9, /* Conditional function call */ 258 + PERF_BR_COND_RET = 10, /* Conditional function return */ 259 + PERF_BR_ERET = 11, /* Exception return */ 260 + PERF_BR_IRQ = 12, /* IRQ */ 261 + PERF_BR_SERROR = 13, /* System error */ 262 + PERF_BR_NO_TX = 14, /* Not in transaction */ 263 + PERF_BR_EXTEND_ABI = 15, /* Extend ABI */ 269 264 PERF_BR_MAX, 270 265 }; 271 266 272 267 /* 273 - * Common branch speculation outcome classification 268 + * Common branch speculation outcome classifications: 274 269 */ 275 270 enum { 276 - PERF_BR_SPEC_NA = 0, /* Not available */ 277 - PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ 278 - PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ 279 - PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ 271 + PERF_BR_SPEC_NA = 0, /* Not available */ 272 + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ 273 + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ 274 + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ 280 275 PERF_BR_SPEC_MAX, 281 276 }; 282 277 283 278 enum { 284 - PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ 285 - PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ 286 - PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ 287 - PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ 288 - PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ 289 - PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ 290 - PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ 291 - PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ 279 + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ 280 
+ PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ 281 + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ 282 + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ 283 + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ 284 + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ 285 + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ 286 + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ 292 287 PERF_BR_NEW_MAX, 293 288 }; 294 289 295 290 enum { 296 - PERF_BR_PRIV_UNKNOWN = 0, 297 - PERF_BR_PRIV_USER = 1, 298 - PERF_BR_PRIV_KERNEL = 2, 299 - PERF_BR_PRIV_HV = 3, 291 + PERF_BR_PRIV_UNKNOWN = 0, 292 + PERF_BR_PRIV_USER = 1, 293 + PERF_BR_PRIV_KERNEL = 2, 294 + PERF_BR_PRIV_HV = 3, 300 295 }; 301 296 302 - #define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 303 - #define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 304 - #define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 305 - #define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 306 - #define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 297 + #define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 298 + #define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 299 + #define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 300 + #define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 301 + #define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 307 302 308 303 #define PERF_SAMPLE_BRANCH_PLM_ALL \ 309 304 (PERF_SAMPLE_BRANCH_USER|\ ··· 313 310 * Values to determine ABI of the registers dump. 314 311 */ 315 312 enum perf_sample_regs_abi { 316 - PERF_SAMPLE_REGS_ABI_NONE = 0, 317 - PERF_SAMPLE_REGS_ABI_32 = 1, 318 - PERF_SAMPLE_REGS_ABI_64 = 2, 313 + PERF_SAMPLE_REGS_ABI_NONE = 0, 314 + PERF_SAMPLE_REGS_ABI_32 = 1, 315 + PERF_SAMPLE_REGS_ABI_64 = 2, 319 316 }; 320 317 321 318 /* ··· 323 320 * abort events. Multiple bits can be set. 
324 321 */ 325 322 enum { 326 - PERF_TXN_ELISION = (1 << 0), /* From elision */ 327 - PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ 328 - PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ 329 - PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ 330 - PERF_TXN_RETRY = (1 << 4), /* Retry possible */ 331 - PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ 332 - PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ 333 - PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ 323 + PERF_TXN_ELISION = (1 << 0), /* From elision */ 324 + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ 325 + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ 326 + PERF_TXN_ASYNC = (1 << 3), /* Instruction is not related */ 327 + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ 328 + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ 329 + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ 330 + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ 334 331 335 - PERF_TXN_MAX = (1 << 8), /* non-ABI */ 332 + PERF_TXN_MAX = (1 << 8), /* non-ABI */ 336 333 337 - /* bits 32..63 are reserved for the abort code */ 334 + /* Bits 32..63 are reserved for the abort code */ 338 335 339 - PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), 340 - PERF_TXN_ABORT_SHIFT = 32, 336 + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), 337 + PERF_TXN_ABORT_SHIFT = 32, 341 338 }; 342 339 343 340 /* ··· 372 369 PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ 373 370 }; 374 371 375 - #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ 376 - #define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ 377 - #define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ 378 - #define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ 379 - /* add: sample_stack_user */ 380 - #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ 381 - #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ 382 - #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ 383 - 
#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ 384 - #define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ 372 + #define PERF_ATTR_SIZE_VER0 64 /* Size of first published 'struct perf_event_attr' */ 373 + #define PERF_ATTR_SIZE_VER1 72 /* Add: config2 */ 374 + #define PERF_ATTR_SIZE_VER2 80 /* Add: branch_sample_type */ 375 + #define PERF_ATTR_SIZE_VER3 96 /* Add: sample_regs_user */ 376 + /* Add: sample_stack_user */ 377 + #define PERF_ATTR_SIZE_VER4 104 /* Add: sample_regs_intr */ 378 + #define PERF_ATTR_SIZE_VER5 112 /* Add: aux_watermark */ 379 + #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */ 380 + #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */ 381 + #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */ 385 382 386 383 /* 387 - * Hardware event_id to monitor via a performance monitoring event: 388 - * 389 - * @sample_max_stack: Max number of frame pointers in a callchain, 390 - * should be < /proc/sys/kernel/perf_event_max_stack 391 - * Max number of entries of branch stack 392 - * should be < hardware limit 384 + * 'struct perf_event_attr' contains various attributes that define 385 + * a performance event - most of them hardware related configuration 386 + * details, but also a lot of behavioral switches and values implemented 387 + * by the kernel. 393 388 */ 394 389 struct perf_event_attr { 395 390 ··· 397 396 __u32 type; 398 397 399 398 /* 400 - * Size of the attr structure, for fwd/bwd compat. 399 + * Size of the attr structure, for forward/backwards compatibility. 
401 400 */ 402 401 __u32 size; 403 402 ··· 452 451 comm_exec : 1, /* flag comm events that are due to an exec */ 453 452 use_clockid : 1, /* use @clockid for time fields */ 454 453 context_switch : 1, /* context switch data */ 455 - write_backward : 1, /* Write ring buffer from end to beginning */ 454 + write_backward : 1, /* write ring buffer from end to beginning */ 456 455 namespaces : 1, /* include namespaces data */ 457 456 ksymbol : 1, /* include ksymbol events */ 458 - bpf_event : 1, /* include bpf events */ 457 + bpf_event : 1, /* include BPF events */ 459 458 aux_output : 1, /* generate AUX records instead of events */ 460 459 cgroup : 1, /* include cgroup events */ 461 460 text_poke : 1, /* include text poke events */ 462 - build_id : 1, /* use build id in mmap2 events */ 461 + build_id : 1, /* use build ID in mmap2 events */ 463 462 inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ 464 463 remove_on_exec : 1, /* event is removed from task on exec */ 465 464 sigtrap : 1, /* send synchronous SIGTRAP on event */ 466 465 __reserved_1 : 26; 467 466 468 467 union { 469 - __u32 wakeup_events; /* wakeup every n events */ 468 + __u32 wakeup_events; /* wake up every n events */ 470 469 __u32 wakeup_watermark; /* bytes before wakeup */ 471 470 }; 472 471 ··· 475 474 __u64 bp_addr; 476 475 __u64 kprobe_func; /* for perf_kprobe */ 477 476 __u64 uprobe_path; /* for perf_uprobe */ 478 - __u64 config1; /* extension of config */ 477 + __u64 config1; /* extension of config */ 479 478 }; 480 479 union { 481 480 __u64 bp_len; 482 - __u64 kprobe_addr; /* when kprobe_func == NULL */ 481 + __u64 kprobe_addr; /* when kprobe_func == NULL */ 483 482 __u64 probe_offset; /* for perf_[k,u]probe */ 484 - __u64 config2; /* extension of config1 */ 483 + __u64 config2; /* extension of config1 */ 485 484 }; 486 485 __u64 branch_sample_type; /* enum perf_branch_sample_type */ 487 486 ··· 511 510 * Wakeup watermark for AUX area 512 511 */ 513 512 __u32 
aux_watermark; 513 + 514 + /* 515 + * Max number of frame pointers in a callchain, should be 516 + * lower than /proc/sys/kernel/perf_event_max_stack. 517 + * 518 + * Max number of entries of branch stack should be lower 519 + * than the hardware limit. 520 + */ 514 521 __u16 sample_max_stack; 522 + 515 523 __u16 __reserved_2; 516 524 __u32 aux_sample_size; 517 525 ··· 547 537 548 538 /* 549 539 * Structure used by below PERF_EVENT_IOC_QUERY_BPF command 550 - * to query bpf programs attached to the same perf tracepoint 540 + * to query BPF programs attached to the same perf tracepoint 551 541 * as the given perf event. 552 542 */ 553 543 struct perf_event_query_bpf { ··· 569 559 /* 570 560 * Ioctls that can be done on a perf event fd: 571 561 */ 572 - #define PERF_EVENT_IOC_ENABLE _IO ('$', 0) 573 - #define PERF_EVENT_IOC_DISABLE _IO ('$', 1) 574 - #define PERF_EVENT_IOC_REFRESH _IO ('$', 2) 575 - #define PERF_EVENT_IOC_RESET _IO ('$', 3) 576 - #define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) 577 - #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) 578 - #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) 579 - #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) 580 - #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) 581 - #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) 562 + #define PERF_EVENT_IOC_ENABLE _IO ('$', 0) 563 + #define PERF_EVENT_IOC_DISABLE _IO ('$', 1) 564 + #define PERF_EVENT_IOC_REFRESH _IO ('$', 2) 565 + #define PERF_EVENT_IOC_RESET _IO ('$', 3) 566 + #define PERF_EVENT_IOC_PERIOD _IOW ('$', 4, __u64) 567 + #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) 568 + #define PERF_EVENT_IOC_SET_FILTER _IOW ('$', 6, char *) 569 + #define PERF_EVENT_IOC_ID _IOR ('$', 7, __u64 *) 570 + #define PERF_EVENT_IOC_SET_BPF _IOW ('$', 8, __u32) 571 + #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW ('$', 9, __u32) 582 572 #define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) 583 - #define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct 
perf_event_attr *) 573 + #define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW ('$', 11, struct perf_event_attr *) 584 574 585 575 enum perf_event_ioc_flags { 586 - PERF_IOC_FLAG_GROUP = 1U << 0, 576 + PERF_IOC_FLAG_GROUP = 1U << 0, 587 577 }; 588 578 589 579 /* ··· 594 584 __u32 compat_version; /* lowest version this is compat with */ 595 585 596 586 /* 597 - * Bits needed to read the hw events in user-space. 587 + * Bits needed to read the HW events in user-space. 598 588 * 599 589 * u32 seq, time_mult, time_shift, index, width; 600 590 * u64 count, enabled, running; ··· 632 622 __u32 index; /* hardware event identifier */ 633 623 __s64 offset; /* add to hardware event value */ 634 624 __u64 time_enabled; /* time event active */ 635 - __u64 time_running; /* time event on cpu */ 625 + __u64 time_running; /* time event on CPU */ 636 626 union { 637 627 __u64 capabilities; 638 628 struct { ··· 660 650 661 651 /* 662 652 * If cap_usr_time the below fields can be used to compute the time 663 - * delta since time_enabled (in ns) using rdtsc or similar. 653 + * delta since time_enabled (in ns) using RDTSC or similar. 664 654 * 665 655 * u64 quot, rem; 666 656 * u64 delta; ··· 733 723 * after reading this value. 734 724 * 735 725 * When the mapping is PROT_WRITE the @data_tail value should be 736 - * written by userspace to reflect the last read data, after issueing 726 + * written by user-space to reflect the last read data, after issuing 737 727 * an smp_mb() to separate the data read from the ->data_tail store. 738 728 * In this case the kernel will not over-write unread data. 739 729 * ··· 749 739 750 740 /* 751 741 * AUX area is defined by aux_{offset,size} fields that should be set 752 - * by the userspace, so that 742 + * by the user-space, so that 753 743 * 754 744 * aux_offset >= data_offset + data_size 755 745 * ··· 823 813 * Indicates that thread was preempted in TASK_RUNNING state. 
824 814 * 825 815 * PERF_RECORD_MISC_MMAP_BUILD_ID: 826 - * Indicates that mmap2 event carries build id data. 816 + * Indicates that mmap2 event carries build ID data. 827 817 */ 828 818 #define PERF_RECORD_MISC_EXACT_IP (1 << 14) 829 819 #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) ··· 834 824 #define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) 835 825 836 826 struct perf_event_header { 837 - __u32 type; 838 - __u16 misc; 839 - __u16 size; 827 + __u32 type; 828 + __u16 misc; 829 + __u16 size; 840 830 }; 841 831 842 832 struct perf_ns_link_info { 843 - __u64 dev; 844 - __u64 ino; 833 + __u64 dev; 834 + __u64 ino; 845 835 }; 846 836 847 837 enum { 848 - NET_NS_INDEX = 0, 849 - UTS_NS_INDEX = 1, 850 - IPC_NS_INDEX = 2, 851 - PID_NS_INDEX = 3, 852 - USER_NS_INDEX = 4, 853 - MNT_NS_INDEX = 5, 854 - CGROUP_NS_INDEX = 6, 838 + NET_NS_INDEX = 0, 839 + UTS_NS_INDEX = 1, 840 + IPC_NS_INDEX = 2, 841 + PID_NS_INDEX = 3, 842 + USER_NS_INDEX = 4, 843 + MNT_NS_INDEX = 5, 844 + CGROUP_NS_INDEX = 6, 855 845 856 - NR_NAMESPACES, /* number of available namespaces */ 846 + NR_NAMESPACES, /* number of available namespaces */ 857 847 }; 858 848 859 849 enum perf_event_type { ··· 869 859 * optional fields being ignored. 870 860 * 871 861 * struct sample_id { 872 - * { u32 pid, tid; } && PERF_SAMPLE_TID 873 - * { u64 time; } && PERF_SAMPLE_TIME 874 - * { u64 id; } && PERF_SAMPLE_ID 875 - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID 876 - * { u32 cpu, res; } && PERF_SAMPLE_CPU 862 + * { u32 pid, tid; } && PERF_SAMPLE_TID 863 + * { u64 time; } && PERF_SAMPLE_TIME 864 + * { u64 id; } && PERF_SAMPLE_ID 865 + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID 866 + * { u32 cpu, res; } && PERF_SAMPLE_CPU 877 867 * { u64 id; } && PERF_SAMPLE_IDENTIFIER 878 868 * } && perf_event_attr::sample_id_all 879 869 * ··· 884 874 885 875 /* 886 876 * The MMAP events record the PROT_EXEC mappings so that we can 887 - * correlate userspace IPs to code. 
They have the following structure: 877 + * correlate user-space IPs to code. They have the following structure: 888 878 * 889 879 * struct { 890 880 * struct perf_event_header header; ··· 894 884 * u64 len; 895 885 * u64 pgoff; 896 886 * char filename[]; 897 - * struct sample_id sample_id; 887 + * struct sample_id sample_id; 898 888 * }; 899 889 */ 900 890 PERF_RECORD_MMAP = 1, ··· 904 894 * struct perf_event_header header; 905 895 * u64 id; 906 896 * u64 lost; 907 - * struct sample_id sample_id; 897 + * struct sample_id sample_id; 908 898 * }; 909 899 */ 910 900 PERF_RECORD_LOST = 2, ··· 915 905 * 916 906 * u32 pid, tid; 917 907 * char comm[]; 918 - * struct sample_id sample_id; 908 + * struct sample_id sample_id; 919 909 * }; 920 910 */ 921 911 PERF_RECORD_COMM = 3, ··· 926 916 * u32 pid, ppid; 927 917 * u32 tid, ptid; 928 918 * u64 time; 929 - * struct sample_id sample_id; 919 + * struct sample_id sample_id; 930 920 * }; 931 921 */ 932 922 PERF_RECORD_EXIT = 4, ··· 937 927 * u64 time; 938 928 * u64 id; 939 929 * u64 stream_id; 940 - * struct sample_id sample_id; 930 + * struct sample_id sample_id; 941 931 * }; 942 932 */ 943 933 PERF_RECORD_THROTTLE = 5, ··· 949 939 * u32 pid, ppid; 950 940 * u32 tid, ptid; 951 941 * u64 time; 952 - * struct sample_id sample_id; 942 + * struct sample_id sample_id; 953 943 * }; 954 944 */ 955 945 PERF_RECORD_FORK = 7, ··· 960 950 * u32 pid, tid; 961 951 * 962 952 * struct read_format values; 963 - * struct sample_id sample_id; 953 + * struct sample_id sample_id; 964 954 * }; 965 955 */ 966 956 PERF_RECORD_READ = 8, ··· 1015 1005 * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS 1016 1006 * } && PERF_SAMPLE_BRANCH_STACK 1017 1007 * 1018 - * { u64 abi; # enum perf_sample_regs_abi 1019 - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER 1008 + * { u64 abi; # enum perf_sample_regs_abi 1009 + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER 1020 1010 * 1021 - * { u64 size; 1022 - * char data[size]; 1023 - * u64 
dyn_size; } && PERF_SAMPLE_STACK_USER 1011 + * { u64 size; 1012 + * char data[size]; 1013 + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER 1024 1014 * 1025 1015 * { union perf_sample_weight 1026 1016 * { ··· 1045 1035 * { u64 abi; # enum perf_sample_regs_abi 1046 1036 * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR 1047 1037 * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR 1048 - * { u64 size; 1049 - * char data[size]; } && PERF_SAMPLE_AUX 1038 + * { u64 cgroup;} && PERF_SAMPLE_CGROUP 1050 1039 * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE 1051 1040 * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE 1041 + * { u64 size; 1042 + * char data[size]; } && PERF_SAMPLE_AUX 1052 1043 * }; 1053 1044 */ 1054 1045 PERF_RECORD_SAMPLE = 9, ··· 1081 1070 * }; 1082 1071 * u32 prot, flags; 1083 1072 * char filename[]; 1084 - * struct sample_id sample_id; 1073 + * struct sample_id sample_id; 1085 1074 * }; 1086 1075 */ 1087 1076 PERF_RECORD_MMAP2 = 10, ··· 1090 1079 * Records that new data landed in the AUX buffer part. 
1091 1080 * 1092 1081 * struct { 1093 - * struct perf_event_header header; 1082 + * struct perf_event_header header; 1094 1083 * 1095 - * u64 aux_offset; 1096 - * u64 aux_size; 1084 + * u64 aux_offset; 1085 + * u64 aux_size; 1097 1086 * u64 flags; 1098 - * struct sample_id sample_id; 1087 + * struct sample_id sample_id; 1099 1088 * }; 1100 1089 */ 1101 1090 PERF_RECORD_AUX = 11, ··· 1178 1167 PERF_RECORD_KSYMBOL = 17, 1179 1168 1180 1169 /* 1181 - * Record bpf events: 1170 + * Record BPF events: 1182 1171 * enum perf_bpf_event_type { 1183 1172 * PERF_BPF_EVENT_UNKNOWN = 0, 1184 1173 * PERF_BPF_EVENT_PROG_LOAD = 1, ··· 1256 1245 #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) 1257 1246 1258 1247 enum perf_bpf_event_type { 1259 - PERF_BPF_EVENT_UNKNOWN = 0, 1260 - PERF_BPF_EVENT_PROG_LOAD = 1, 1261 - PERF_BPF_EVENT_PROG_UNLOAD = 2, 1262 - PERF_BPF_EVENT_MAX, /* non-ABI */ 1248 + PERF_BPF_EVENT_UNKNOWN = 0, 1249 + PERF_BPF_EVENT_PROG_LOAD = 1, 1250 + PERF_BPF_EVENT_PROG_UNLOAD = 2, 1251 + PERF_BPF_EVENT_MAX, /* non-ABI */ 1263 1252 }; 1264 1253 1265 - #define PERF_MAX_STACK_DEPTH 127 1266 - #define PERF_MAX_CONTEXTS_PER_STACK 8 1254 + #define PERF_MAX_STACK_DEPTH 127 1255 + #define PERF_MAX_CONTEXTS_PER_STACK 8 1267 1256 1268 1257 enum perf_callchain_context { 1269 - PERF_CONTEXT_HV = (__u64)-32, 1270 - PERF_CONTEXT_KERNEL = (__u64)-128, 1271 - PERF_CONTEXT_USER = (__u64)-512, 1258 + PERF_CONTEXT_HV = (__u64)-32, 1259 + PERF_CONTEXT_KERNEL = (__u64)-128, 1260 + PERF_CONTEXT_USER = (__u64)-512, 1272 1261 1273 - PERF_CONTEXT_GUEST = (__u64)-2048, 1274 - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, 1275 - PERF_CONTEXT_GUEST_USER = (__u64)-2560, 1262 + PERF_CONTEXT_GUEST = (__u64)-2048, 1263 + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, 1264 + PERF_CONTEXT_GUEST_USER = (__u64)-2560, 1276 1265 1277 - PERF_CONTEXT_MAX = (__u64)-4095, 1266 + PERF_CONTEXT_MAX = (__u64)-4095, 1278 1267 }; 1279 1268 1280 1269 /** 1281 1270 * PERF_RECORD_AUX::flags bits 1282 1271 */ 1283 - 
#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ 1284 - #define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ 1285 - #define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ 1286 - #define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ 1272 + #define PERF_AUX_FLAG_TRUNCATED 0x0001 /* Record was truncated to fit */ 1273 + #define PERF_AUX_FLAG_OVERWRITE 0x0002 /* Snapshot from overwrite mode */ 1274 + #define PERF_AUX_FLAG_PARTIAL 0x0004 /* Record contains gaps */ 1275 + #define PERF_AUX_FLAG_COLLISION 0x0008 /* Sample collided with another */ 1287 1276 #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ 1288 1277 1289 1278 /* CoreSight PMU AUX buffer formats */ 1290 - #define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ 1291 - #define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ 1279 + #define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ 1280 + #define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ 1292 1281 1293 - #define PERF_FLAG_FD_NO_GROUP (1UL << 0) 1294 - #define PERF_FLAG_FD_OUTPUT (1UL << 1) 1295 - #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ 1296 - #define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ 1282 + #define PERF_FLAG_FD_NO_GROUP (1UL << 0) 1283 + #define PERF_FLAG_FD_OUTPUT (1UL << 1) 1284 + #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup ID, per-CPU mode only */ 1285 + #define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ 1297 1286 1298 1287 #if defined(__LITTLE_ENDIAN_BITFIELD) 1299 1288 union perf_mem_data_src { 1300 1289 __u64 val; 1301 1290 struct { 1302 - __u64 mem_op:5, /* type of opcode */ 1303 - mem_lvl:14, /* memory hierarchy level */ 1304 - mem_snoop:5, /* snoop mode */ 1305 - mem_lock:2, /* lock instr */ 1306 - mem_dtlb:7, /* tlb access */ 1307 - 
mem_lvl_num:4, /* memory hierarchy level number */ 1308 - mem_remote:1, /* remote */ 1309 - mem_snoopx:2, /* snoop mode, ext */ 1310 - mem_blk:3, /* access blocked */ 1311 - mem_hops:3, /* hop level */ 1312 - mem_rsvd:18; 1291 + __u64 mem_op : 5, /* Type of opcode */ 1292 + mem_lvl : 14, /* Memory hierarchy level */ 1293 + mem_snoop : 5, /* Snoop mode */ 1294 + mem_lock : 2, /* Lock instr */ 1295 + mem_dtlb : 7, /* TLB access */ 1296 + mem_lvl_num : 4, /* Memory hierarchy level number */ 1297 + mem_remote : 1, /* Remote */ 1298 + mem_snoopx : 2, /* Snoop mode, ext */ 1299 + mem_blk : 3, /* Access blocked */ 1300 + mem_hops : 3, /* Hop level */ 1301 + mem_rsvd : 18; 1313 1302 }; 1314 1303 }; 1315 1304 #elif defined(__BIG_ENDIAN_BITFIELD) 1316 1305 union perf_mem_data_src { 1317 1306 __u64 val; 1318 1307 struct { 1319 - __u64 mem_rsvd:18, 1320 - mem_hops:3, /* hop level */ 1321 - mem_blk:3, /* access blocked */ 1322 - mem_snoopx:2, /* snoop mode, ext */ 1323 - mem_remote:1, /* remote */ 1324 - mem_lvl_num:4, /* memory hierarchy level number */ 1325 - mem_dtlb:7, /* tlb access */ 1326 - mem_lock:2, /* lock instr */ 1327 - mem_snoop:5, /* snoop mode */ 1328 - mem_lvl:14, /* memory hierarchy level */ 1329 - mem_op:5; /* type of opcode */ 1308 + __u64 mem_rsvd : 18, 1309 + mem_hops : 3, /* Hop level */ 1310 + mem_blk : 3, /* Access blocked */ 1311 + mem_snoopx : 2, /* Snoop mode, ext */ 1312 + mem_remote : 1, /* Remote */ 1313 + mem_lvl_num : 4, /* Memory hierarchy level number */ 1314 + mem_dtlb : 7, /* TLB access */ 1315 + mem_lock : 2, /* Lock instr */ 1316 + mem_snoop : 5, /* Snoop mode */ 1317 + mem_lvl : 14, /* Memory hierarchy level */ 1318 + mem_op : 5; /* Type of opcode */ 1330 1319 }; 1331 1320 }; 1332 1321 #else 1333 - #error "Unknown endianness" 1322 + # error "Unknown endianness" 1334 1323 #endif 1335 1324 1336 - /* type of opcode (load/store/prefetch,code) */ 1337 - #define PERF_MEM_OP_NA 0x01 /* not available */ 1338 - #define PERF_MEM_OP_LOAD 0x02 /* load 
instruction */ 1339 - #define PERF_MEM_OP_STORE 0x04 /* store instruction */ 1340 - #define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ 1341 - #define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ 1342 - #define PERF_MEM_OP_SHIFT 0 1325 + /* Type of memory opcode: */ 1326 + #define PERF_MEM_OP_NA 0x0001 /* Not available */ 1327 + #define PERF_MEM_OP_LOAD 0x0002 /* Load instruction */ 1328 + #define PERF_MEM_OP_STORE 0x0004 /* Store instruction */ 1329 + #define PERF_MEM_OP_PFETCH 0x0008 /* Prefetch */ 1330 + #define PERF_MEM_OP_EXEC 0x0010 /* Code (execution) */ 1331 + #define PERF_MEM_OP_SHIFT 0 1343 1332 1344 1333 /* 1345 - * PERF_MEM_LVL_* namespace being depricated to some extent in the 1334 + * The PERF_MEM_LVL_* namespace is being deprecated to some extent in 1346 1335 * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. 1347 - * Supporting this namespace inorder to not break defined ABIs. 1336 + * We support this namespace in order to not break defined ABIs. 1348 1337 * 1349 - * memory hierarchy (memory level, hit or miss) 1338 + * Memory hierarchy (memory level, hit or miss) 1350 1339 */ 1351 - #define PERF_MEM_LVL_NA 0x01 /* not available */ 1352 - #define PERF_MEM_LVL_HIT 0x02 /* hit level */ 1353 - #define PERF_MEM_LVL_MISS 0x04 /* miss level */ 1354 - #define PERF_MEM_LVL_L1 0x08 /* L1 */ 1355 - #define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ 1356 - #define PERF_MEM_LVL_L2 0x20 /* L2 */ 1357 - #define PERF_MEM_LVL_L3 0x40 /* L3 */ 1358 - #define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ 1359 - #define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ 1360 - #define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ 1361 - #define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ 1362 - #define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ 1363 - #define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ 1364 - #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ 1365 - #define PERF_MEM_LVL_SHIFT 5 1340 + #define 
PERF_MEM_LVL_NA 0x0001 /* Not available */ 1341 + #define PERF_MEM_LVL_HIT 0x0002 /* Hit level */ 1342 + #define PERF_MEM_LVL_MISS 0x0004 /* Miss level */ 1343 + #define PERF_MEM_LVL_L1 0x0008 /* L1 */ 1344 + #define PERF_MEM_LVL_LFB 0x0010 /* Line Fill Buffer */ 1345 + #define PERF_MEM_LVL_L2 0x0020 /* L2 */ 1346 + #define PERF_MEM_LVL_L3 0x0040 /* L3 */ 1347 + #define PERF_MEM_LVL_LOC_RAM 0x0080 /* Local DRAM */ 1348 + #define PERF_MEM_LVL_REM_RAM1 0x0100 /* Remote DRAM (1 hop) */ 1349 + #define PERF_MEM_LVL_REM_RAM2 0x0200 /* Remote DRAM (2 hops) */ 1350 + #define PERF_MEM_LVL_REM_CCE1 0x0400 /* Remote Cache (1 hop) */ 1351 + #define PERF_MEM_LVL_REM_CCE2 0x0800 /* Remote Cache (2 hops) */ 1352 + #define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ 1353 + #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ 1354 + #define PERF_MEM_LVL_SHIFT 5 1366 1355 1367 - #define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ 1368 - #define PERF_MEM_REMOTE_SHIFT 37 1356 + #define PERF_MEM_REMOTE_REMOTE 0x0001 /* Remote */ 1357 + #define PERF_MEM_REMOTE_SHIFT 37 1369 1358 1370 - #define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ 1371 - #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ 1372 - #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ 1373 - #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ 1374 - #define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ 1375 - #define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ 1376 - /* 0x7 available */ 1377 - #define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ 1378 - #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ 1379 - #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ 1380 - #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ 1381 - #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ 1382 - #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ 1383 - #define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ 1384 - #define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ 1359 + #define PERF_MEM_LVLNUM_L1 0x0001 /* L1 */ 1360 + #define PERF_MEM_LVLNUM_L2 0x0002 /* L2 */ 1361 + #define 
PERF_MEM_LVLNUM_L3 0x0003 /* L3 */ 1362 + #define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ 1363 + #define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ 1364 + #define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ 1365 + /* 0x007 available */ 1366 + #define PERF_MEM_LVLNUM_UNC 0x0008 /* Uncached */ 1367 + #define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ 1368 + #define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ 1369 + #define PERF_MEM_LVLNUM_ANY_CACHE 0x000b /* Any cache */ 1370 + #define PERF_MEM_LVLNUM_LFB 0x000c /* LFB / L1 Miss Handling Buffer */ 1371 + #define PERF_MEM_LVLNUM_RAM 0x000d /* RAM */ 1372 + #define PERF_MEM_LVLNUM_PMEM 0x000e /* PMEM */ 1373 + #define PERF_MEM_LVLNUM_NA 0x000f /* N/A */ 1385 1374 1386 - #define PERF_MEM_LVLNUM_SHIFT 33 1375 + #define PERF_MEM_LVLNUM_SHIFT 33 1387 1376 1388 - /* snoop mode */ 1389 - #define PERF_MEM_SNOOP_NA 0x01 /* not available */ 1390 - #define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ 1391 - #define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ 1392 - #define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ 1393 - #define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ 1394 - #define PERF_MEM_SNOOP_SHIFT 19 1377 + /* Snoop mode */ 1378 + #define PERF_MEM_SNOOP_NA 0x0001 /* Not available */ 1379 + #define PERF_MEM_SNOOP_NONE 0x0002 /* No snoop */ 1380 + #define PERF_MEM_SNOOP_HIT 0x0004 /* Snoop hit */ 1381 + #define PERF_MEM_SNOOP_MISS 0x0008 /* Snoop miss */ 1382 + #define PERF_MEM_SNOOP_HITM 0x0010 /* Snoop hit modified */ 1383 + #define PERF_MEM_SNOOP_SHIFT 19 1395 1384 1396 - #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ 1397 - #define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ 1398 - #define PERF_MEM_SNOOPX_SHIFT 38 1385 + #define PERF_MEM_SNOOPX_FWD 0x0001 /* Forward */ 1386 + #define PERF_MEM_SNOOPX_PEER 0x0002 /* Transfer from peer */ 1387 + #define PERF_MEM_SNOOPX_SHIFT 38 1399 1388 1400 - /* locked instruction */ 1401 - #define PERF_MEM_LOCK_NA 0x01 /* not available */ 1402 - #define PERF_MEM_LOCK_LOCKED 0x02 
/* locked transaction */ 1403 - #define PERF_MEM_LOCK_SHIFT 24 1389 + /* Locked instruction */ 1390 + #define PERF_MEM_LOCK_NA 0x0001 /* Not available */ 1391 + #define PERF_MEM_LOCK_LOCKED 0x0002 /* Locked transaction */ 1392 + #define PERF_MEM_LOCK_SHIFT 24 1404 1393 1405 1394 /* TLB access */ 1406 - #define PERF_MEM_TLB_NA 0x01 /* not available */ 1407 - #define PERF_MEM_TLB_HIT 0x02 /* hit level */ 1408 - #define PERF_MEM_TLB_MISS 0x04 /* miss level */ 1409 - #define PERF_MEM_TLB_L1 0x08 /* L1 */ 1410 - #define PERF_MEM_TLB_L2 0x10 /* L2 */ 1411 - #define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ 1412 - #define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ 1413 - #define PERF_MEM_TLB_SHIFT 26 1395 + #define PERF_MEM_TLB_NA 0x0001 /* Not available */ 1396 + #define PERF_MEM_TLB_HIT 0x0002 /* Hit level */ 1397 + #define PERF_MEM_TLB_MISS 0x0004 /* Miss level */ 1398 + #define PERF_MEM_TLB_L1 0x0008 /* L1 */ 1399 + #define PERF_MEM_TLB_L2 0x0010 /* L2 */ 1400 + #define PERF_MEM_TLB_WK 0x0020 /* Hardware Walker*/ 1401 + #define PERF_MEM_TLB_OS 0x0040 /* OS fault handler */ 1402 + #define PERF_MEM_TLB_SHIFT 26 1414 1403 1415 1404 /* Access blocked */ 1416 - #define PERF_MEM_BLK_NA 0x01 /* not available */ 1417 - #define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ 1418 - #define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ 1419 - #define PERF_MEM_BLK_SHIFT 40 1405 + #define PERF_MEM_BLK_NA 0x0001 /* Not available */ 1406 + #define PERF_MEM_BLK_DATA 0x0002 /* Data could not be forwarded */ 1407 + #define PERF_MEM_BLK_ADDR 0x0004 /* Address conflict */ 1408 + #define PERF_MEM_BLK_SHIFT 40 1420 1409 1421 - /* hop level */ 1422 - #define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ 1423 - #define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ 1424 - #define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ 1425 - #define PERF_MEM_HOPS_3 0x04 /* remote board */ 1410 + /* Hop level */ 1411 + #define PERF_MEM_HOPS_0 0x0001 /* Remote core, same node 
*/ 1412 + #define PERF_MEM_HOPS_1 0x0002 /* Remote node, same socket */ 1413 + #define PERF_MEM_HOPS_2 0x0003 /* Remote socket, same board */ 1414 + #define PERF_MEM_HOPS_3 0x0004 /* Remote board */ 1426 1415 /* 5-7 available */ 1427 - #define PERF_MEM_HOPS_SHIFT 43 1416 + #define PERF_MEM_HOPS_SHIFT 43 1428 1417 1429 1418 #define PERF_MEM_S(a, s) \ 1430 1419 (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) 1431 1420 1432 1421 /* 1433 - * single taken branch record layout: 1422 + * Layout of single taken branch records: 1434 1423 * 1435 1424 * from: source instruction (may not always be a branch insn) 1436 1425 * to: branch target ··· 1449 1438 struct perf_branch_entry { 1450 1439 __u64 from; 1451 1440 __u64 to; 1452 - __u64 mispred:1, /* target mispredicted */ 1453 - predicted:1,/* target predicted */ 1454 - in_tx:1, /* in transaction */ 1455 - abort:1, /* transaction abort */ 1456 - cycles:16, /* cycle count to last branch */ 1457 - type:4, /* branch type */ 1458 - spec:2, /* branch speculation info */ 1459 - new_type:4, /* additional branch type */ 1460 - priv:3, /* privilege level */ 1461 - reserved:31; 1441 + __u64 mispred : 1, /* target mispredicted */ 1442 + predicted : 1, /* target predicted */ 1443 + in_tx : 1, /* in transaction */ 1444 + abort : 1, /* transaction abort */ 1445 + cycles : 16, /* cycle count to last branch */ 1446 + type : 4, /* branch type */ 1447 + spec : 2, /* branch speculation info */ 1448 + new_type : 4, /* additional branch type */ 1449 + priv : 3, /* privilege level */ 1450 + reserved : 31; 1462 1451 }; 1463 1452 1464 1453 /* Size of used info bits in struct perf_branch_entry */ 1465 1454 #define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 1466 1455 1467 1456 union perf_sample_weight { 1468 - __u64 full; 1457 + __u64 full; 1469 1458 #if defined(__LITTLE_ENDIAN_BITFIELD) 1470 1459 struct { 1471 - __u32 var1_dw; 1472 - __u16 var2_w; 1473 - __u16 var3_w; 1460 + __u32 var1_dw; 1461 + __u16 var2_w; 1462 + __u16 var3_w; 1474 1463 }; 1475 1464 
#elif defined(__BIG_ENDIAN_BITFIELD) 1476 1465 struct { 1477 - __u16 var3_w; 1478 - __u16 var2_w; 1479 - __u32 var1_dw; 1466 + __u16 var3_w; 1467 + __u16 var2_w; 1468 + __u32 var1_dw; 1480 1469 }; 1481 1470 #else 1482 - #error "Unknown endianness" 1471 + # error "Unknown endianness" 1483 1472 #endif 1484 1473 }; 1485 1474
+12
tools/testing/selftests/bpf/bench.c
··· 526 526 extern const struct bench bench_trig_uretprobe_multi_push; 527 527 extern const struct bench bench_trig_uprobe_multi_ret; 528 528 extern const struct bench bench_trig_uretprobe_multi_ret; 529 + #ifdef __x86_64__ 530 + extern const struct bench bench_trig_uprobe_nop5; 531 + extern const struct bench bench_trig_uretprobe_nop5; 532 + extern const struct bench bench_trig_uprobe_multi_nop5; 533 + extern const struct bench bench_trig_uretprobe_multi_nop5; 534 + #endif 529 535 530 536 extern const struct bench bench_rb_libbpf; 531 537 extern const struct bench bench_rb_custom; ··· 592 586 &bench_trig_uretprobe_multi_push, 593 587 &bench_trig_uprobe_multi_ret, 594 588 &bench_trig_uretprobe_multi_ret, 589 + #ifdef __x86_64__ 590 + &bench_trig_uprobe_nop5, 591 + &bench_trig_uretprobe_nop5, 592 + &bench_trig_uprobe_multi_nop5, 593 + &bench_trig_uretprobe_multi_nop5, 594 + #endif 595 595 /* ringbuf/perfbuf benchmarks */ 596 596 &bench_rb_libbpf, 597 597 &bench_rb_custom,
+42
tools/testing/selftests/bpf/benchs/bench_trigger.c
··· 333 333 return NULL; 334 334 } 335 335 336 + #ifdef __x86_64__ 337 + __nocf_check __weak void uprobe_target_nop5(void) 338 + { 339 + asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00"); 340 + } 341 + 342 + static void *uprobe_producer_nop5(void *input) 343 + { 344 + while (true) 345 + uprobe_target_nop5(); 346 + return NULL; 347 + } 348 + #endif 349 + 336 350 static void usetup(bool use_retprobe, bool use_multi, void *target_addr) 337 351 { 338 352 size_t uprobe_offset; ··· 462 448 usetup(true, true /* use_multi */, &uprobe_target_ret); 463 449 } 464 450 451 + #ifdef __x86_64__ 452 + static void uprobe_nop5_setup(void) 453 + { 454 + usetup(false, false /* !use_multi */, &uprobe_target_nop5); 455 + } 456 + 457 + static void uretprobe_nop5_setup(void) 458 + { 459 + usetup(true, false /* !use_multi */, &uprobe_target_nop5); 460 + } 461 + 462 + static void uprobe_multi_nop5_setup(void) 463 + { 464 + usetup(false, true /* use_multi */, &uprobe_target_nop5); 465 + } 466 + 467 + static void uretprobe_multi_nop5_setup(void) 468 + { 469 + usetup(true, true /* use_multi */, &uprobe_target_nop5); 470 + } 471 + #endif 472 + 465 473 const struct bench bench_trig_syscall_count = { 466 474 .name = "trig-syscall-count", 467 475 .validate = trigger_validate, ··· 542 506 BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop"); 543 507 BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push"); 544 508 BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret"); 509 + #ifdef __x86_64__ 510 + BENCH_TRIG_USERMODE(uprobe_nop5, nop5, "uprobe-nop5"); 511 + BENCH_TRIG_USERMODE(uretprobe_nop5, nop5, "uretprobe-nop5"); 512 + BENCH_TRIG_USERMODE(uprobe_multi_nop5, nop5, "uprobe-multi-nop5"); 513 + BENCH_TRIG_USERMODE(uretprobe_multi_nop5, nop5, "uretprobe-multi-nop5"); 514 + #endif
+1 -1
tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh
··· 2 2 3 3 set -eufo pipefail 4 4 5 - for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret} 5 + for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret,nop5} 6 6 do 7 7 summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-) 8 8 printf "%-15s: %s\n" $i "$summary"