Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'perf-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:

- Combine perf and BPF for fast evaluation of HW breakpoint
conditions

- Add LBR capture support outside of hardware events

- Trigger IO signals for watermark_wakeup

- Add RAPL support for Intel Arrow Lake and Lunar Lake

- Optimize frequency-throttling

- Miscellaneous cleanups & fixes

* tag 'perf-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits)
perf/bpf: Mark perf_event_set_bpf_handler() and perf_event_free_bpf_handler() as inline too
selftests/perf_events: Test FASYNC with watermark wakeups
perf/ring_buffer: Trigger IO signals for watermark_wakeup
perf: Move perf_event_fasync() to perf_event.h
perf/bpf: Change the !CONFIG_BPF_SYSCALL stubs to static inlines
selftest/bpf: Test a perf BPF program that suppresses side effects
perf/bpf: Allow a BPF program to suppress all sample side effects
perf/bpf: Remove unneeded uses_default_overflow_handler()
perf/bpf: Call BPF handler directly, not through overflow machinery
perf/bpf: Remove #ifdef CONFIG_BPF_SYSCALL from struct perf_event members
perf/bpf: Create bpf_overflow_handler() stub for !CONFIG_BPF_SYSCALL
perf/bpf: Reorder bpf_overflow_handler() ahead of __perf_event_overflow()
perf/x86/rapl: Add support for Intel Lunar Lake
perf/x86/rapl: Add support for Intel Arrow Lake
perf/core: Reduce PMU access to adjust sample freq
perf/core: Optimize perf_adjust_freq_unthr_context()
perf/x86/amd: Don't reject non-sampling events with configured LBR
perf/x86/amd: Support capturing LBR from software events
perf/x86/amd: Avoid taking branches before disabling LBR
perf/x86/amd: Ensure amd_pmu_core_disable_all() is always inlined
...

+525 -172
+4 -4
arch/arm/kernel/hw_breakpoint.c
··· 626 626 hw->address &= ~alignment_mask; 627 627 hw->ctrl.len <<= offset; 628 628 629 - if (uses_default_overflow_handler(bp)) { 629 + if (is_default_overflow_handler(bp)) { 630 630 /* 631 631 * Mismatch breakpoints are required for single-stepping 632 632 * breakpoints. ··· 798 798 * Otherwise, insert a temporary mismatch breakpoint so that 799 799 * we can single-step over the watchpoint trigger. 800 800 */ 801 - if (!uses_default_overflow_handler(wp)) 801 + if (!is_default_overflow_handler(wp)) 802 802 continue; 803 803 step: 804 804 enable_single_step(wp, instruction_pointer(regs)); ··· 811 811 info->trigger = addr; 812 812 pr_debug("watchpoint fired: address = 0x%x\n", info->trigger); 813 813 perf_bp_event(wp, regs); 814 - if (uses_default_overflow_handler(wp)) 814 + if (is_default_overflow_handler(wp)) 815 815 enable_single_step(wp, instruction_pointer(regs)); 816 816 } 817 817 ··· 886 886 info->trigger = addr; 887 887 pr_debug("breakpoint fired: address = 0x%x\n", addr); 888 888 perf_bp_event(bp, regs); 889 - if (uses_default_overflow_handler(bp)) 889 + if (is_default_overflow_handler(bp)) 890 890 enable_single_step(bp, addr); 891 891 goto unlock; 892 892 }
+2 -2
arch/arm64/kernel/hw_breakpoint.c
··· 655 655 perf_bp_event(bp, regs); 656 656 657 657 /* Do we need to handle the stepping? */ 658 - if (uses_default_overflow_handler(bp)) 658 + if (is_default_overflow_handler(bp)) 659 659 step = 1; 660 660 unlock: 661 661 rcu_read_unlock(); ··· 734 734 static int watchpoint_report(struct perf_event *wp, unsigned long addr, 735 735 struct pt_regs *regs) 736 736 { 737 - int step = uses_default_overflow_handler(wp); 737 + int step = is_default_overflow_handler(wp); 738 738 struct arch_hw_breakpoint *info = counter_arch_bp(wp); 739 739 740 740 info->trigger = addr;
+36 -1
arch/x86/events/amd/core.c
··· 647 647 } 648 648 } 649 649 650 - static inline void amd_pmu_set_global_ctl(u64 ctl) 650 + static __always_inline void amd_pmu_set_global_ctl(u64 ctl) 651 651 { 652 652 wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); 653 653 } ··· 905 905 amd_brs_enable_all(); 906 906 907 907 return amd_pmu_adjust_nmi_window(handled); 908 + } 909 + 910 + /* 911 + * AMD-specific callback invoked through perf_snapshot_branch_stack static 912 + * call, defined in include/linux/perf_event.h. See its definition for API 913 + * details. It's up to caller to provide enough space in *entries* to fit all 914 + * LBR records, otherwise returned result will be truncated to *cnt* entries. 915 + */ 916 + static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt) 917 + { 918 + struct cpu_hw_events *cpuc; 919 + unsigned long flags; 920 + 921 + /* 922 + * The sequence of steps to freeze LBR should be completely inlined 923 + * and contain no branches to minimize contamination of LBR snapshot 924 + */ 925 + local_irq_save(flags); 926 + amd_pmu_core_disable_all(); 927 + __amd_pmu_lbr_disable(); 928 + 929 + cpuc = this_cpu_ptr(&cpu_hw_events); 930 + 931 + amd_pmu_lbr_read(); 932 + cnt = min(cnt, x86_pmu.lbr_nr); 933 + memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt); 934 + 935 + amd_pmu_v2_enable_all(0); 936 + local_irq_restore(flags); 937 + 938 + return cnt; 908 939 } 909 940 910 941 static int amd_pmu_v2_handle_irq(struct pt_regs *regs) ··· 1474 1443 static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); 1475 1444 static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); 1476 1445 static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); 1446 + 1447 + /* Only support branch_stack snapshot on perfmon v2 */ 1448 + if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq) 1449 + static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack); 1477 1450 } else if (!amd_brs_init()) { 1478 1451 /* 1479 1452 * BRS requires 
special event constraints and flushing on ctxsw.
+1 -12
arch/x86/events/amd/lbr.c
··· 310 310 { 311 311 int ret = 0; 312 312 313 - /* LBR is not recommended in counting mode */ 314 - if (!is_sampling_event(event)) 315 - return -EINVAL; 316 - 317 313 ret = amd_pmu_lbr_setup_filter(event); 318 314 if (!ret) 319 315 event->attach_state |= PERF_ATTACH_SCHED_CB; ··· 410 414 void amd_pmu_lbr_disable_all(void) 411 415 { 412 416 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 413 - u64 dbg_ctl, dbg_extn_cfg; 414 417 415 418 if (!cpuc->lbr_users || !x86_pmu.lbr_nr) 416 419 return; 417 420 418 - rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); 419 - wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); 420 - 421 - if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) { 422 - rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); 423 - wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 424 - } 421 + __amd_pmu_lbr_disable(); 425 422 } 426 423 427 424 __init int amd_pmu_lbr_init(void)
+13
arch/x86/events/perf_event.h
··· 1329 1329 void amd_pmu_lbr_disable_all(void); 1330 1330 int amd_pmu_lbr_hw_config(struct perf_event *event); 1331 1331 1332 + static __always_inline void __amd_pmu_lbr_disable(void) 1333 + { 1334 + u64 dbg_ctl, dbg_extn_cfg; 1335 + 1336 + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); 1337 + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); 1338 + 1339 + if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) { 1340 + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); 1341 + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 1342 + } 1343 + } 1344 + 1332 1345 #ifdef CONFIG_PERF_EVENTS_AMD_BRS 1333 1346 1334 1347 #define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
+4 -3
arch/x86/events/rapl.c
··· 675 675 static int __init init_rapl_pmus(void) 676 676 { 677 677 int maxdie = topology_max_packages() * topology_max_dies_per_package(); 678 - size_t size; 679 678 680 - size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); 681 - rapl_pmus = kzalloc(size, GFP_KERNEL); 679 + rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, maxdie), GFP_KERNEL); 682 680 if (!rapl_pmus) 683 681 return -ENOMEM; 684 682 ··· 806 808 X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), 807 809 X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl), 808 810 X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl), 811 + X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE_H, &model_skl), 812 + X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, &model_skl), 813 + X86_MATCH_INTEL_FAM6_MODEL(LUNARLAKE_M, &model_skl), 809 814 {}, 810 815 }; 811 816 MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
+17 -20
include/linux/perf_event.h
··· 809 809 u64 (*clock)(void); 810 810 perf_overflow_handler_t overflow_handler; 811 811 void *overflow_handler_context; 812 - #ifdef CONFIG_BPF_SYSCALL 813 - perf_overflow_handler_t orig_overflow_handler; 814 812 struct bpf_prog *prog; 815 813 u64 bpf_cookie; 816 - #endif 817 814 818 815 #ifdef CONFIG_EVENT_TRACING 819 816 struct trace_event_call *tp_event; ··· 880 883 881 884 unsigned int nr_events; 882 885 unsigned int nr_cgroups; 886 + unsigned int nr_freq; 883 887 884 888 atomic_t refcount; /* event <-> epc */ 885 889 struct rcu_head rcu_head; ··· 894 896 */ 895 897 int rotate_necessary; 896 898 }; 899 + 900 + static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) 901 + { 902 + return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); 903 + } 897 904 898 905 struct perf_event_groups { 899 906 struct rb_root tree; ··· 1345 1342 struct pt_regs *regs); 1346 1343 1347 1344 static inline bool 1348 - __is_default_overflow_handler(perf_overflow_handler_t overflow_handler) 1345 + is_default_overflow_handler(struct perf_event *event) 1349 1346 { 1347 + perf_overflow_handler_t overflow_handler = event->overflow_handler; 1348 + 1350 1349 if (likely(overflow_handler == perf_event_output_forward)) 1351 1350 return true; 1352 1351 if (unlikely(overflow_handler == perf_event_output_backward)) 1353 1352 return true; 1354 1353 return false; 1355 1354 } 1356 - 1357 - #define is_default_overflow_handler(event) \ 1358 - __is_default_overflow_handler((event)->overflow_handler) 1359 - 1360 - #ifdef CONFIG_BPF_SYSCALL 1361 - static inline bool uses_default_overflow_handler(struct perf_event *event) 1362 - { 1363 - if (likely(is_default_overflow_handler(event))) 1364 - return true; 1365 - 1366 - return __is_default_overflow_handler(event->orig_overflow_handler); 1367 - } 1368 - #else 1369 - #define uses_default_overflow_handler(event) \ 1370 - is_default_overflow_handler(event) 1371 - #endif 1372 1355 1373 1356 extern void 1374 1357 
perf_event_header__init_id(struct perf_event_header *header, ··· 1684 1695 ifh = &event->parent->addr_filters; 1685 1696 1686 1697 return ifh; 1698 + } 1699 + 1700 + static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) 1701 + { 1702 + /* Only the parent has fasync state */ 1703 + if (event->parent) 1704 + event = event->parent; 1705 + return &event->fasync; 1687 1706 } 1688 1707 1689 1708 extern void perf_event_addr_filters_sync(struct perf_event *event);
+144 -129
kernel/events/core.c
··· 2302 2302 2303 2303 if (!is_software_event(event)) 2304 2304 cpc->active_oncpu--; 2305 - if (event->attr.freq && event->attr.sample_freq) 2305 + if (event->attr.freq && event->attr.sample_freq) { 2306 2306 ctx->nr_freq--; 2307 + epc->nr_freq--; 2308 + } 2307 2309 if (event->attr.exclusive || !cpc->active_oncpu) 2308 2310 cpc->exclusive = 0; 2309 2311 ··· 2560 2558 2561 2559 if (!is_software_event(event)) 2562 2560 cpc->active_oncpu++; 2563 - if (event->attr.freq && event->attr.sample_freq) 2561 + if (event->attr.freq && event->attr.sample_freq) { 2564 2562 ctx->nr_freq++; 2565 - 2563 + epc->nr_freq++; 2564 + } 2566 2565 if (event->attr.exclusive) 2567 2566 cpc->exclusive = 1; 2568 2567 ··· 4126 4123 } 4127 4124 } 4128 4125 4129 - /* 4130 - * combine freq adjustment with unthrottling to avoid two passes over the 4131 - * events. At the same time, make sure, having freq events does not change 4132 - * the rate of unthrottling as that would introduce bias. 4133 - */ 4134 - static void 4135 - perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) 4126 + static void perf_adjust_freq_unthr_events(struct list_head *event_list) 4136 4127 { 4137 4128 struct perf_event *event; 4138 4129 struct hw_perf_event *hwc; 4139 4130 u64 now, period = TICK_NSEC; 4140 4131 s64 delta; 4141 4132 4142 - /* 4143 - * only need to iterate over all events iff: 4144 - * - context have events in frequency mode (needs freq adjust) 4145 - * - there are events to unthrottle on this cpu 4146 - */ 4147 - if (!(ctx->nr_freq || unthrottle)) 4148 - return; 4149 - 4150 - raw_spin_lock(&ctx->lock); 4151 - 4152 - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4133 + list_for_each_entry(event, event_list, active_list) { 4153 4134 if (event->state != PERF_EVENT_STATE_ACTIVE) 4154 4135 continue; 4155 4136 ··· 4141 4154 if (!event_filter_match(event)) 4142 4155 continue; 4143 4156 4144 - perf_pmu_disable(event->pmu); 4145 - 4146 4157 hwc = &event->hw; 4147 4158 
4148 4159 if (hwc->interrupts == MAX_INTERRUPTS) { 4149 4160 hwc->interrupts = 0; 4150 4161 perf_log_throttle(event, 1); 4151 - event->pmu->start(event, 0); 4162 + if (!event->attr.freq || !event->attr.sample_freq) 4163 + event->pmu->start(event, 0); 4152 4164 } 4153 4165 4154 4166 if (!event->attr.freq || !event->attr.sample_freq) 4155 - goto next; 4167 + continue; 4156 4168 4157 4169 /* 4158 4170 * stop the event and update event->count ··· 4173 4187 perf_adjust_period(event, period, delta, false); 4174 4188 4175 4189 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); 4176 - next: 4177 - perf_pmu_enable(event->pmu); 4190 + } 4191 + } 4192 + 4193 + /* 4194 + * combine freq adjustment with unthrottling to avoid two passes over the 4195 + * events. At the same time, make sure, having freq events does not change 4196 + * the rate of unthrottling as that would introduce bias. 4197 + */ 4198 + static void 4199 + perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) 4200 + { 4201 + struct perf_event_pmu_context *pmu_ctx; 4202 + 4203 + /* 4204 + * only need to iterate over all events iff: 4205 + * - context have events in frequency mode (needs freq adjust) 4206 + * - there are events to unthrottle on this cpu 4207 + */ 4208 + if (!(ctx->nr_freq || unthrottle)) 4209 + return; 4210 + 4211 + raw_spin_lock(&ctx->lock); 4212 + 4213 + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { 4214 + if (!(pmu_ctx->nr_freq || unthrottle)) 4215 + continue; 4216 + if (!perf_pmu_ctx_is_active(pmu_ctx)) 4217 + continue; 4218 + if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) 4219 + continue; 4220 + 4221 + perf_pmu_disable(pmu_ctx->pmu); 4222 + perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active); 4223 + perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active); 4224 + perf_pmu_enable(pmu_ctx->pmu); 4178 4225 } 4179 4226 4180 4227 raw_spin_unlock(&ctx->lock); ··· 6702 6683 * If there's data, ensure we set the poll() state and 
publish everything 6703 6684 * to user-space before waking everybody up. 6704 6685 */ 6705 - 6706 - static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) 6707 - { 6708 - /* only the parent has fasync state */ 6709 - if (event->parent) 6710 - event = event->parent; 6711 - return &event->fasync; 6712 - } 6713 6686 6714 6687 void perf_event_wakeup(struct perf_event *event) 6715 6688 { ··· 9555 9544 return true; 9556 9545 } 9557 9546 9547 + #ifdef CONFIG_BPF_SYSCALL 9548 + static int bpf_overflow_handler(struct perf_event *event, 9549 + struct perf_sample_data *data, 9550 + struct pt_regs *regs) 9551 + { 9552 + struct bpf_perf_event_data_kern ctx = { 9553 + .data = data, 9554 + .event = event, 9555 + }; 9556 + struct bpf_prog *prog; 9557 + int ret = 0; 9558 + 9559 + ctx.regs = perf_arch_bpf_user_pt_regs(regs); 9560 + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) 9561 + goto out; 9562 + rcu_read_lock(); 9563 + prog = READ_ONCE(event->prog); 9564 + if (prog) { 9565 + perf_prepare_sample(data, event, regs); 9566 + ret = bpf_prog_run(prog, &ctx); 9567 + } 9568 + rcu_read_unlock(); 9569 + out: 9570 + __this_cpu_dec(bpf_prog_active); 9571 + 9572 + return ret; 9573 + } 9574 + 9575 + static inline int perf_event_set_bpf_handler(struct perf_event *event, 9576 + struct bpf_prog *prog, 9577 + u64 bpf_cookie) 9578 + { 9579 + if (event->overflow_handler_context) 9580 + /* hw breakpoint or kernel counter */ 9581 + return -EINVAL; 9582 + 9583 + if (event->prog) 9584 + return -EEXIST; 9585 + 9586 + if (prog->type != BPF_PROG_TYPE_PERF_EVENT) 9587 + return -EINVAL; 9588 + 9589 + if (event->attr.precise_ip && 9590 + prog->call_get_stack && 9591 + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || 9592 + event->attr.exclude_callchain_kernel || 9593 + event->attr.exclude_callchain_user)) { 9594 + /* 9595 + * On perf_event with precise_ip, calling bpf_get_stack() 9596 + * may trigger unwinder warnings and occasional crashes. 
9597 + * bpf_get_[stack|stackid] works around this issue by using 9598 + * callchain attached to perf_sample_data. If the 9599 + * perf_event does not full (kernel and user) callchain 9600 + * attached to perf_sample_data, do not allow attaching BPF 9601 + * program that calls bpf_get_[stack|stackid]. 9602 + */ 9603 + return -EPROTO; 9604 + } 9605 + 9606 + event->prog = prog; 9607 + event->bpf_cookie = bpf_cookie; 9608 + return 0; 9609 + } 9610 + 9611 + static inline void perf_event_free_bpf_handler(struct perf_event *event) 9612 + { 9613 + struct bpf_prog *prog = event->prog; 9614 + 9615 + if (!prog) 9616 + return; 9617 + 9618 + event->prog = NULL; 9619 + bpf_prog_put(prog); 9620 + } 9621 + #else 9622 + static inline int bpf_overflow_handler(struct perf_event *event, 9623 + struct perf_sample_data *data, 9624 + struct pt_regs *regs) 9625 + { 9626 + return 1; 9627 + } 9628 + 9629 + static inline int perf_event_set_bpf_handler(struct perf_event *event, 9630 + struct bpf_prog *prog, 9631 + u64 bpf_cookie) 9632 + { 9633 + return -EOPNOTSUPP; 9634 + } 9635 + 9636 + static inline void perf_event_free_bpf_handler(struct perf_event *event) 9637 + { 9638 + } 9639 + #endif 9640 + 9558 9641 /* 9559 9642 * Generic event overflow handling, sampling. 
9560 9643 */ ··· 9668 9563 return 0; 9669 9564 9670 9565 ret = __perf_event_account_interrupt(event, throttle); 9566 + 9567 + if (event->prog && !bpf_overflow_handler(event, data, regs)) 9568 + return ret; 9671 9569 9672 9570 /* 9673 9571 * XXX event_limit might not quite work as expected on inherited ··· 10529 10421 { 10530 10422 ftrace_profile_free_filter(event); 10531 10423 } 10532 - 10533 - #ifdef CONFIG_BPF_SYSCALL 10534 - static void bpf_overflow_handler(struct perf_event *event, 10535 - struct perf_sample_data *data, 10536 - struct pt_regs *regs) 10537 - { 10538 - struct bpf_perf_event_data_kern ctx = { 10539 - .data = data, 10540 - .event = event, 10541 - }; 10542 - struct bpf_prog *prog; 10543 - int ret = 0; 10544 - 10545 - ctx.regs = perf_arch_bpf_user_pt_regs(regs); 10546 - if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) 10547 - goto out; 10548 - rcu_read_lock(); 10549 - prog = READ_ONCE(event->prog); 10550 - if (prog) { 10551 - perf_prepare_sample(data, event, regs); 10552 - ret = bpf_prog_run(prog, &ctx); 10553 - } 10554 - rcu_read_unlock(); 10555 - out: 10556 - __this_cpu_dec(bpf_prog_active); 10557 - if (!ret) 10558 - return; 10559 - 10560 - event->orig_overflow_handler(event, data, regs); 10561 - } 10562 - 10563 - static int perf_event_set_bpf_handler(struct perf_event *event, 10564 - struct bpf_prog *prog, 10565 - u64 bpf_cookie) 10566 - { 10567 - if (event->overflow_handler_context) 10568 - /* hw breakpoint or kernel counter */ 10569 - return -EINVAL; 10570 - 10571 - if (event->prog) 10572 - return -EEXIST; 10573 - 10574 - if (prog->type != BPF_PROG_TYPE_PERF_EVENT) 10575 - return -EINVAL; 10576 - 10577 - if (event->attr.precise_ip && 10578 - prog->call_get_stack && 10579 - (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || 10580 - event->attr.exclude_callchain_kernel || 10581 - event->attr.exclude_callchain_user)) { 10582 - /* 10583 - * On perf_event with precise_ip, calling bpf_get_stack() 10584 - * may trigger unwinder warnings 
and occasional crashes. 10585 - * bpf_get_[stack|stackid] works around this issue by using 10586 - * callchain attached to perf_sample_data. If the 10587 - * perf_event does not full (kernel and user) callchain 10588 - * attached to perf_sample_data, do not allow attaching BPF 10589 - * program that calls bpf_get_[stack|stackid]. 10590 - */ 10591 - return -EPROTO; 10592 - } 10593 - 10594 - event->prog = prog; 10595 - event->bpf_cookie = bpf_cookie; 10596 - event->orig_overflow_handler = READ_ONCE(event->overflow_handler); 10597 - WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); 10598 - return 0; 10599 - } 10600 - 10601 - static void perf_event_free_bpf_handler(struct perf_event *event) 10602 - { 10603 - struct bpf_prog *prog = event->prog; 10604 - 10605 - if (!prog) 10606 - return; 10607 - 10608 - WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); 10609 - event->prog = NULL; 10610 - bpf_prog_put(prog); 10611 - } 10612 - #else 10613 - static int perf_event_set_bpf_handler(struct perf_event *event, 10614 - struct bpf_prog *prog, 10615 - u64 bpf_cookie) 10616 - { 10617 - return -EOPNOTSUPP; 10618 - } 10619 - static void perf_event_free_bpf_handler(struct perf_event *event) 10620 - { 10621 - } 10622 - #endif 10623 10424 10624 10425 /* 10625 10426 * returns true if the event is a tracepoint, or a kprobe/upprobe created ··· 11988 11971 overflow_handler = parent_event->overflow_handler; 11989 11972 context = parent_event->overflow_handler_context; 11990 11973 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) 11991 - if (overflow_handler == bpf_overflow_handler) { 11974 + if (parent_event->prog) { 11992 11975 struct bpf_prog *prog = parent_event->prog; 11993 11976 11994 11977 bpf_prog_inc(prog); 11995 11978 event->prog = prog; 11996 - event->orig_overflow_handler = 11997 - parent_event->orig_overflow_handler; 11998 11979 } 11999 11980 #endif 12000 11981 }
+4
kernel/events/ring_buffer.c
··· 22 22 atomic_set(&handle->rb->poll, EPOLLIN); 23 23 24 24 handle->event->pending_wakeup = 1; 25 + 26 + if (*perf_event_fasync(handle->event) && !handle->event->pending_kill) 27 + handle->event->pending_kill = POLL_IN; 28 + 25 29 irq_work_queue(&handle->event->pending_irq); 26 30 } 27 31
+137
tools/testing/selftests/bpf/prog_tests/perf_skip.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + 4 + #include <test_progs.h> 5 + #include "test_perf_skip.skel.h" 6 + #include <linux/compiler.h> 7 + #include <linux/hw_breakpoint.h> 8 + #include <sys/mman.h> 9 + 10 + #ifndef TRAP_PERF 11 + #define TRAP_PERF 6 12 + #endif 13 + 14 + int sigio_count, sigtrap_count; 15 + 16 + static void handle_sigio(int sig __always_unused) 17 + { 18 + ++sigio_count; 19 + } 20 + 21 + static void handle_sigtrap(int signum __always_unused, 22 + siginfo_t *info, 23 + void *ucontext __always_unused) 24 + { 25 + ASSERT_EQ(info->si_code, TRAP_PERF, "si_code"); 26 + ++sigtrap_count; 27 + } 28 + 29 + static noinline int test_function(void) 30 + { 31 + asm volatile (""); 32 + return 0; 33 + } 34 + 35 + void serial_test_perf_skip(void) 36 + { 37 + struct sigaction action = {}; 38 + struct sigaction previous_sigtrap; 39 + sighandler_t previous_sigio = SIG_ERR; 40 + struct test_perf_skip *skel = NULL; 41 + struct perf_event_attr attr = {}; 42 + int perf_fd = -1; 43 + int err; 44 + struct f_owner_ex owner; 45 + struct bpf_link *prog_link = NULL; 46 + 47 + action.sa_flags = SA_SIGINFO | SA_NODEFER; 48 + action.sa_sigaction = handle_sigtrap; 49 + sigemptyset(&action.sa_mask); 50 + if (!ASSERT_OK(sigaction(SIGTRAP, &action, &previous_sigtrap), "sigaction")) 51 + return; 52 + 53 + previous_sigio = signal(SIGIO, handle_sigio); 54 + if (!ASSERT_NEQ(previous_sigio, SIG_ERR, "signal")) 55 + goto cleanup; 56 + 57 + skel = test_perf_skip__open_and_load(); 58 + if (!ASSERT_OK_PTR(skel, "skel_load")) 59 + goto cleanup; 60 + 61 + attr.type = PERF_TYPE_BREAKPOINT; 62 + attr.size = sizeof(attr); 63 + attr.bp_type = HW_BREAKPOINT_X; 64 + attr.bp_addr = (uintptr_t)test_function; 65 + attr.bp_len = sizeof(long); 66 + attr.sample_period = 1; 67 + attr.sample_type = PERF_SAMPLE_IP; 68 + attr.pinned = 1; 69 + attr.exclude_kernel = 1; 70 + attr.exclude_hv = 1; 71 + attr.precise_ip = 3; 72 + attr.sigtrap = 1; 73 + attr.remove_on_exec = 1; 74 + 75 
+ perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); 76 + if (perf_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) { 77 + printf("SKIP:no PERF_TYPE_BREAKPOINT/HW_BREAKPOINT_X\n"); 78 + test__skip(); 79 + goto cleanup; 80 + } 81 + if (!ASSERT_OK(perf_fd < 0, "perf_event_open")) 82 + goto cleanup; 83 + 84 + /* Configure the perf event to signal on sample. */ 85 + err = fcntl(perf_fd, F_SETFL, O_ASYNC); 86 + if (!ASSERT_OK(err, "fcntl(F_SETFL, O_ASYNC)")) 87 + goto cleanup; 88 + 89 + owner.type = F_OWNER_TID; 90 + owner.pid = syscall(__NR_gettid); 91 + err = fcntl(perf_fd, F_SETOWN_EX, &owner); 92 + if (!ASSERT_OK(err, "fcntl(F_SETOWN_EX)")) 93 + goto cleanup; 94 + 95 + /* Allow at most one sample. A sample rejected by bpf should 96 + * not count against this. 97 + */ 98 + err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1); 99 + if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)")) 100 + goto cleanup; 101 + 102 + prog_link = bpf_program__attach_perf_event(skel->progs.handler, perf_fd); 103 + if (!ASSERT_OK_PTR(prog_link, "bpf_program__attach_perf_event")) 104 + goto cleanup; 105 + 106 + /* Configure the bpf program to suppress the sample. */ 107 + skel->bss->ip = (uintptr_t)test_function; 108 + test_function(); 109 + 110 + ASSERT_EQ(sigio_count, 0, "sigio_count"); 111 + ASSERT_EQ(sigtrap_count, 0, "sigtrap_count"); 112 + 113 + /* Configure the bpf program to allow the sample. 
*/ 114 + skel->bss->ip = 0; 115 + test_function(); 116 + 117 + ASSERT_EQ(sigio_count, 1, "sigio_count"); 118 + ASSERT_EQ(sigtrap_count, 1, "sigtrap_count"); 119 + 120 + /* Test that the sample above is the only one allowed (by perf, not 121 + * by bpf) 122 + */ 123 + test_function(); 124 + 125 + ASSERT_EQ(sigio_count, 1, "sigio_count"); 126 + ASSERT_EQ(sigtrap_count, 1, "sigtrap_count"); 127 + 128 + cleanup: 129 + bpf_link__destroy(prog_link); 130 + if (perf_fd >= 0) 131 + close(perf_fd); 132 + test_perf_skip__destroy(skel); 133 + 134 + if (previous_sigio != SIG_ERR) 135 + signal(SIGIO, previous_sigio); 136 + sigaction(SIGTRAP, &previous_sigtrap, NULL); 137 + }
+15
tools/testing/selftests/bpf/progs/test_perf_skip.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include "vmlinux.h" 3 + #include <bpf/bpf_helpers.h> 4 + #include <bpf/bpf_tracing.h> 5 + 6 + uintptr_t ip; 7 + 8 + SEC("perf_event") 9 + int handler(struct bpf_perf_event_data *data) 10 + { 11 + /* Skip events that have the correct ip. */ 12 + return ip != PT_REGS_IP(&data->regs); 13 + } 14 + 15 + char _license[] SEC("license") = "GPL";
+1
tools/testing/selftests/perf_events/.gitignore
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 sigtrap_threads 3 3 remove_on_exec 4 + watermark_signal
+1 -1
tools/testing/selftests/perf_events/Makefile
··· 2 2 CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES) 3 3 LDFLAGS += -lpthread 4 4 5 - TEST_GEN_PROGS := sigtrap_threads remove_on_exec 5 + TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal 6 6 include ../lib.mk
+146
tools/testing/selftests/perf_events/watermark_signal.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + 4 + #include <errno.h> 5 + #include <fcntl.h> 6 + #include <linux/perf_event.h> 7 + #include <stddef.h> 8 + #include <sched.h> 9 + #include <signal.h> 10 + #include <stdlib.h> 11 + #include <string.h> 12 + #include <sys/ioctl.h> 13 + #include <sys/mman.h> 14 + #include <sys/syscall.h> 15 + #include <sys/wait.h> 16 + #include <unistd.h> 17 + 18 + #include "../kselftest_harness.h" 19 + 20 + #define __maybe_unused __attribute__((__unused__)) 21 + 22 + static int sigio_count; 23 + 24 + static void handle_sigio(int signum __maybe_unused, 25 + siginfo_t *oh __maybe_unused, 26 + void *uc __maybe_unused) 27 + { 28 + ++sigio_count; 29 + } 30 + 31 + static void do_child(void) 32 + { 33 + raise(SIGSTOP); 34 + 35 + for (int i = 0; i < 20; ++i) 36 + sleep(1); 37 + 38 + raise(SIGSTOP); 39 + 40 + exit(0); 41 + } 42 + 43 + TEST(watermark_signal) 44 + { 45 + struct perf_event_attr attr; 46 + struct perf_event_mmap_page *p = NULL; 47 + struct sigaction previous_sigio, sigio = { 0 }; 48 + pid_t child = -1; 49 + int child_status; 50 + int fd = -1; 51 + long page_size = sysconf(_SC_PAGE_SIZE); 52 + 53 + sigio.sa_sigaction = handle_sigio; 54 + EXPECT_EQ(sigaction(SIGIO, &sigio, &previous_sigio), 0); 55 + 56 + memset(&attr, 0, sizeof(attr)); 57 + attr.size = sizeof(attr); 58 + attr.type = PERF_TYPE_SOFTWARE; 59 + attr.config = PERF_COUNT_SW_DUMMY; 60 + attr.sample_period = 1; 61 + attr.disabled = 1; 62 + attr.watermark = 1; 63 + attr.context_switch = 1; 64 + attr.wakeup_watermark = 1; 65 + 66 + child = fork(); 67 + EXPECT_GE(child, 0); 68 + if (child == 0) 69 + do_child(); 70 + else if (child < 0) { 71 + perror("fork()"); 72 + goto cleanup; 73 + } 74 + 75 + if (waitpid(child, &child_status, WSTOPPED) != child || 76 + !(WIFSTOPPED(child_status) && WSTOPSIG(child_status) == SIGSTOP)) { 77 + fprintf(stderr, 78 + "failed to sycnhronize with child errno=%d status=%x\n", 79 + errno, 80 + child_status); 81 + goto cleanup; 
82 + } 83 + 84 + fd = syscall(__NR_perf_event_open, &attr, child, -1, -1, 85 + PERF_FLAG_FD_CLOEXEC); 86 + if (fd < 0) { 87 + fprintf(stderr, "failed opening event %llx\n", attr.config); 88 + goto cleanup; 89 + } 90 + 91 + if (fcntl(fd, F_SETFL, FASYNC)) { 92 + perror("F_SETFL FASYNC"); 93 + goto cleanup; 94 + } 95 + 96 + if (fcntl(fd, F_SETOWN, getpid())) { 97 + perror("F_SETOWN getpid()"); 98 + goto cleanup; 99 + } 100 + 101 + if (fcntl(fd, F_SETSIG, SIGIO)) { 102 + perror("F_SETSIG SIGIO"); 103 + goto cleanup; 104 + } 105 + 106 + p = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 107 + if (p == NULL) { 108 + perror("mmap"); 109 + goto cleanup; 110 + } 111 + 112 + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) { 113 + perror("PERF_EVENT_IOC_ENABLE"); 114 + goto cleanup; 115 + } 116 + 117 + if (kill(child, SIGCONT) < 0) { 118 + perror("SIGCONT"); 119 + goto cleanup; 120 + } 121 + 122 + if (waitpid(child, &child_status, WSTOPPED) != -1 || errno != EINTR) 123 + fprintf(stderr, 124 + "expected SIGIO to terminate wait errno=%d status=%x\n%d", 125 + errno, 126 + child_status, 127 + sigio_count); 128 + 129 + EXPECT_GE(sigio_count, 1); 130 + 131 + cleanup: 132 + if (p != NULL) 133 + munmap(p, 2 * page_size); 134 + 135 + if (fd >= 0) 136 + close(fd); 137 + 138 + if (child > 0) { 139 + kill(child, SIGKILL); 140 + waitpid(child, NULL, 0); 141 + } 142 + 143 + sigaction(SIGIO, &previous_sigio, NULL); 144 + } 145 + 146 + TEST_HARNESS_MAIN