Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf stat: Support inherit events during fork() for bperf

bperf has a nice ability to share PMUs, but it still does not support
inheriting events across fork(), resulting in some deviations in its
stat results compared with perf.

perf stat result:
$ ./perf stat -e cycles,instructions -- ./perf test -w sqrtloop
Performance counter stats for './perf test -w sqrtloop':

2,316,038,116 cycles
2,859,350,725 instructions

1.009603637 seconds time elapsed

1.004196000 seconds user
0.003950000 seconds sys

bperf stat result:
$ ./perf stat --bpf-counters -e cycles,instructions -- \
./perf test -w sqrtloop

Performance counter stats for './perf test -w sqrtloop':

18,762,093 cycles
23,487,766 instructions

1.008913769 seconds time elapsed

1.003248000 seconds user
0.004069000 seconds sys

In order to support event inheritance, two new bpf programs are added
to monitor the fork and exit of tasks respectively. When a task is
created, add it to the filter map to enable counting, and reuse the
`accum_key` of its parent task to count together with the parent task.
When a task exits, remove it from the filter map to disable counting.

After support:
$ ./perf stat --bpf-counters -e cycles,instructions -- \
./perf test -w sqrtloop

Performance counter stats for './perf test -w sqrtloop':

2,316,252,189 cycles
2,859,946,547 instructions

1.009422314 seconds time elapsed

1.003597000 seconds user
0.004270000 seconds sys

Signed-off-by: Tengda Wu <wutengda@huaweicloud.com>
Cc: song@kernel.org
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20241021110201.325617-2-wutengda@huaweicloud.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>

authored by

Tengda Wu and committed by
Namhyung Kim
07dc3a6d ba993e5a

+126 -14
+1
tools/perf/builtin-stat.c
··· 2641 2641 } else if (big_num_opt == 0) /* User passed --no-big-num */ 2642 2642 stat_config.big_num = false; 2643 2643 2644 + target.inherit = !stat_config.no_inherit; 2644 2645 err = target__validate(&target); 2645 2646 if (err) { 2646 2647 target__strerror(&target, err, errbuf, BUFSIZ);
+28 -7
tools/perf/util/bpf_counter.c
··· 394 394 } 395 395 396 396 static struct perf_cpu_map *all_cpu_map; 397 + static __u32 filter_entry_cnt; 397 398 398 399 static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd, 399 400 struct perf_event_attr_map_entry *entry) ··· 445 444 return err; 446 445 } 447 446 447 + static int bperf_attach_follower_program(struct bperf_follower_bpf *skel, 448 + enum bperf_filter_type filter_type, 449 + bool inherit) 450 + { 451 + struct bpf_link *link; 452 + int err = 0; 453 + 454 + if ((filter_type == BPERF_FILTER_PID || 455 + filter_type == BPERF_FILTER_TGID) && inherit) 456 + /* attach all follower bpf progs to enable event inheritance */ 457 + err = bperf_follower_bpf__attach(skel); 458 + else { 459 + link = bpf_program__attach(skel->progs.fexit_XXX); 460 + if (IS_ERR(link)) 461 + err = PTR_ERR(link); 462 + } 463 + 464 + return err; 465 + } 466 + 448 467 static int bperf__load(struct evsel *evsel, struct target *target) 449 468 { 450 469 struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff}; 451 470 int attr_map_fd, diff_map_fd = -1, err; 452 471 enum bperf_filter_type filter_type; 453 - __u32 filter_entry_cnt, i; 472 + __u32 i; 454 473 455 474 if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt)) 456 475 return -1; ··· 550 529 /* set up reading map */ 551 530 bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings, 552 531 filter_entry_cnt); 553 - /* set up follower filter based on target */ 554 - bpf_map__set_max_entries(evsel->follower_skel->maps.filter, 555 - filter_entry_cnt); 556 532 err = bperf_follower_bpf__load(evsel->follower_skel); 557 533 if (err) { 558 534 pr_err("Failed to load follower skeleton\n"); ··· 561 543 for (i = 0; i < filter_entry_cnt; i++) { 562 544 int filter_map_fd; 563 545 __u32 key; 546 + struct bperf_filter_value fval = { i, 0 }; 564 547 565 548 if (filter_type == BPERF_FILTER_PID || 566 549 filter_type == BPERF_FILTER_TGID) ··· 572 553 break; 573 554 574 555 filter_map_fd = 
bpf_map__fd(evsel->follower_skel->maps.filter); 575 - bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY); 556 + bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY); 576 557 } 577 558 578 559 evsel->follower_skel->bss->type = filter_type; 560 + evsel->follower_skel->bss->inherit = target->inherit; 579 561 580 - err = bperf_follower_bpf__attach(evsel->follower_skel); 562 + err = bperf_attach_follower_program(evsel->follower_skel, filter_type, 563 + target->inherit); 581 564 582 565 out: 583 566 if (err && evsel->bperf_leader_link_fd >= 0) ··· 644 623 bperf_sync_counters(evsel); 645 624 reading_map_fd = bpf_map__fd(skel->maps.accum_readings); 646 625 647 - for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) { 626 + for (i = 0; i < filter_entry_cnt; i++) { 648 627 struct perf_cpu entry; 649 628 __u32 cpu; 650 629
+91 -7
tools/perf/util/bpf_skel/bperf_follower.bpf.c
··· 5 5 #include <bpf/bpf_tracing.h> 6 6 #include "bperf_u.h" 7 7 8 + #define MAX_ENTRIES 102400 9 + 8 10 struct { 9 11 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 10 12 __uint(key_size, sizeof(__u32)); ··· 24 22 struct { 25 23 __uint(type, BPF_MAP_TYPE_HASH); 26 24 __uint(key_size, sizeof(__u32)); 27 - __uint(value_size, sizeof(__u32)); 25 + __uint(value_size, sizeof(struct bperf_filter_value)); 26 + __uint(max_entries, MAX_ENTRIES); 27 + __uint(map_flags, BPF_F_NO_PREALLOC); 28 28 } filter SEC(".maps"); 29 29 30 30 enum bperf_filter_type type = 0; 31 31 int enabled = 0; 32 + int inherit; 32 33 33 34 SEC("fexit/XXX") 34 35 int BPF_PROG(fexit_XXX) 35 36 { 36 37 struct bpf_perf_event_value *diff_val, *accum_val; 37 38 __u32 filter_key, zero = 0; 38 - __u32 *accum_key; 39 + __u32 accum_key; 40 + struct bperf_filter_value *fval; 39 41 40 42 if (!enabled) 41 43 return 0; 42 44 43 45 switch (type) { 44 46 case BPERF_FILTER_GLOBAL: 45 - accum_key = &zero; 47 + accum_key = zero; 46 48 goto do_add; 47 49 case BPERF_FILTER_CPU: 48 50 filter_key = bpf_get_smp_processor_id(); ··· 55 49 filter_key = bpf_get_current_pid_tgid() & 0xffffffff; 56 50 break; 57 51 case BPERF_FILTER_TGID: 58 - filter_key = bpf_get_current_pid_tgid() >> 32; 52 + /* Use pid as the filter_key to exclude new task counts 53 + * when inherit is disabled. Don't worry about the existing 54 + * children in TGID losing their counts, bpf_counter has 55 + * already added them to the filter map via perf_thread_map 56 + * before this bpf prog runs. 57 + */ 58 + filter_key = inherit ? 
59 + bpf_get_current_pid_tgid() >> 32 : 60 + bpf_get_current_pid_tgid() & 0xffffffff; 59 61 break; 60 62 default: 61 63 return 0; 62 64 } 63 65 64 - accum_key = bpf_map_lookup_elem(&filter, &filter_key); 65 - if (!accum_key) 66 + fval = bpf_map_lookup_elem(&filter, &filter_key); 67 + if (!fval) 66 68 return 0; 69 + 70 + accum_key = fval->accum_key; 71 + if (fval->exited) 72 + bpf_map_delete_elem(&filter, &filter_key); 67 73 68 74 do_add: 69 75 diff_val = bpf_map_lookup_elem(&diff_readings, &zero); 70 76 if (!diff_val) 71 77 return 0; 72 78 73 - accum_val = bpf_map_lookup_elem(&accum_readings, accum_key); 79 + accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key); 74 80 if (!accum_val) 75 81 return 0; 76 82 77 83 accum_val->counter += diff_val->counter; 78 84 accum_val->enabled += diff_val->enabled; 79 85 accum_val->running += diff_val->running; 86 + 87 + return 0; 88 + } 89 + 90 + /* The program is only used for PID or TGID filter types. */ 91 + SEC("tp_btf/task_newtask") 92 + int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags) 93 + { 94 + __u32 parent_key, child_key; 95 + struct bperf_filter_value *parent_fval; 96 + struct bperf_filter_value child_fval = { 0 }; 97 + 98 + if (!enabled) 99 + return 0; 100 + 101 + switch (type) { 102 + case BPERF_FILTER_PID: 103 + parent_key = bpf_get_current_pid_tgid() & 0xffffffff; 104 + child_key = task->pid; 105 + break; 106 + case BPERF_FILTER_TGID: 107 + parent_key = bpf_get_current_pid_tgid() >> 32; 108 + child_key = task->tgid; 109 + if (child_key == parent_key) 110 + return 0; 111 + break; 112 + default: 113 + return 0; 114 + } 115 + 116 + /* Check if the current task is one of the target tasks to be counted */ 117 + parent_fval = bpf_map_lookup_elem(&filter, &parent_key); 118 + if (!parent_fval) 119 + return 0; 120 + 121 + /* Start counting for the new task by adding it into filter map, 122 + * inherit the accum key of its parent task so that they can be 123 + * counted together. 
124 + */ 125 + child_fval.accum_key = parent_fval->accum_key; 126 + child_fval.exited = 0; 127 + bpf_map_update_elem(&filter, &child_key, &child_fval, BPF_NOEXIST); 128 + 129 + return 0; 130 + } 131 + 132 + /* The program is only used for PID or TGID filter types. */ 133 + SEC("tp_btf/sched_process_exit") 134 + int BPF_PROG(on_exittask, struct task_struct *task) 135 + { 136 + __u32 pid; 137 + struct bperf_filter_value *fval; 138 + 139 + if (!enabled) 140 + return 0; 141 + 142 + /* Stop counting for this task by removing it from filter map. 143 + * For TGID type, if the pid can be found in the map, it means that 144 + * this pid belongs to the leader task. After the task exits, the 145 + * tgid of its child tasks (if any) will be 1, so the pid can be 146 + * safely removed. 147 + */ 148 + pid = task->pid; 149 + fval = bpf_map_lookup_elem(&filter, &pid); 150 + if (fval) 151 + fval->exited = 1; 80 152 81 153 return 0; 82 154 }
+5
tools/perf/util/bpf_skel/bperf_u.h
··· 11 11 BPERF_FILTER_TGID, 12 12 }; 13 13 14 + struct bperf_filter_value { 15 + __u32 accum_key; 16 + __u8 exited; 17 + }; 18 + 14 19 #endif /* __BPERF_STAT_U_H */
+1
tools/perf/util/target.h
··· 17 17 bool default_per_cpu; 18 18 bool per_thread; 19 19 bool use_bpf; 20 + bool inherit; 20 21 int initial_delay; 21 22 const char *attr_map; 22 23 };