Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

perf stat: Enable BPF counter with --for-each-cgroup

Recently bperf was added to use BPF to count perf events for various
purposes.  This is an extension of that approach, targeting cgroup
usage.
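
(For context, bperf counters are requested with the --bpf-counters
option, e.g. "perf stat --bpf-counters -e cycles -a -- sleep 1"; the
event choice here is purely illustrative.)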

Unlike the other bperf modes, it does not share the events with other
processes; instead it reduces the number of unnecessary events (and the
overhead of multiplexing) for each monitored cgroup within the perf
session.

When --for-each-cgroup is used together with --bpf-counters, perf opens
a cgroup-switches event per cpu internally and attaches the new BPF
program to read the given perf_events and aggregate the results per
cgroup.  The program runs only when a task is switched to a task in a
different cgroup.
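
As a rough illustration (the cgroup names and event list below are made
up, not part of this change), the two options are combined like:

  $ perf stat -a -e cycles,instructions --bpf-counters \
        --for-each-cgroup cgrpA,cgrpB -- sleep 1

so a single cycles/instructions pair is opened per cpu and the BPF
program aggregates the per-cgroup deltas, instead of opening a separate
set of events for every cgroup.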

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Link: http://lore.kernel.org/lkml/20210701211227.1403788-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

Authored by Namhyung Kim, committed by Arnaldo Carvalho de Melo
944138f0 892ba7f1

+523 -1
+16 -1
tools/perf/Makefile.perf
···
  SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
  SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
  SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
+ SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h

  ifdef BUILD_BPF_SKEL
    BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool
···
  	CFLAGS= $(MAKE) -C ../bpf/bpftool \
  		OUTPUT=$(SKEL_TMP_OUT)/ bootstrap

- $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
+ VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \
+ 		     $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
+ 		     ../../vmlinux \
+ 		     /sys/kernel/btf/vmlinux \
+ 		     /boot/vmlinux-$(shell uname -r)
+ VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+
+ $(SKEL_OUT)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL)
+ ifeq ($(VMLINUX_H),)
+ 	$(QUIET_GEN)$(BPFTOOL) btf dump file $< format c > $@
+ else
+ 	$(Q)cp "$(VMLINUX_H)" $@
+ endif
+
+ $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) $(SKEL_OUT)/vmlinux.h | $(SKEL_TMP_OUT)
  	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \
  	  -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
+1
tools/perf/util/Build
···
  perf-$(CONFIG_LIBBPF) += bpf-loader.o
  perf-$(CONFIG_LIBBPF) += bpf_map.o
  perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
+ perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
  perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
  perf-$(CONFIG_LIBELF) += symbol-elf.o
  perf-$(CONFIG_LIBELF) += probe-file.o
+5
tools/perf/util/bpf_counter.c
···
  #include "evsel.h"
  #include "evlist.h"
  #include "target.h"
+ #include "cgroup.h"
  #include "cpumap.h"
  #include "thread_map.h"

···
  	.destroy    = bperf__destroy,
  };

+ extern struct bpf_counter_ops bperf_cgrp_ops;
+
  static inline bool bpf_counter_skip(struct evsel *evsel)
  {
  	return list_empty(&evsel->bpf_counter_list) &&
···
  {
  	if (target->bpf_str)
  		evsel->bpf_counter_ops = &bpf_program_profiler_ops;
+ 	else if (cgrp_event_expanded && target->use_bpf)
+ 		evsel->bpf_counter_ops = &bperf_cgrp_ops;
  	else if (target->use_bpf || evsel->bpf_counter ||
  		 evsel__match_bpf_counter_events(evsel->name))
  		evsel->bpf_counter_ops = &bperf_ops;
+307
tools/perf/util/bpf_counter_cgroup.c
+ // SPDX-License-Identifier: GPL-2.0
+
+ /* Copyright (c) 2021 Facebook */
+ /* Copyright (c) 2021 Google */
+
+ #include <assert.h>
+ #include <limits.h>
+ #include <unistd.h>
+ #include <sys/file.h>
+ #include <sys/time.h>
+ #include <sys/resource.h>
+ #include <linux/err.h>
+ #include <linux/zalloc.h>
+ #include <linux/perf_event.h>
+ #include <api/fs/fs.h>
+ #include <perf/bpf_perf.h>
+
+ #include "affinity.h"
+ #include "bpf_counter.h"
+ #include "cgroup.h"
+ #include "counts.h"
+ #include "debug.h"
+ #include "evsel.h"
+ #include "evlist.h"
+ #include "target.h"
+ #include "cpumap.h"
+ #include "thread_map.h"
+
+ #include "bpf_skel/bperf_cgroup.skel.h"
+
+ static struct perf_event_attr cgrp_switch_attr = {
+ 	.type = PERF_TYPE_SOFTWARE,
+ 	.config = PERF_COUNT_SW_CGROUP_SWITCHES,
+ 	.size = sizeof(cgrp_switch_attr),
+ 	.sample_period = 1,
+ 	.disabled = 1,
+ };
+
+ static struct evsel *cgrp_switch;
+ static struct bperf_cgroup_bpf *skel;
+
+ #define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0))
+
+ static int bperf_load_program(struct evlist *evlist)
+ {
+ 	struct bpf_link *link;
+ 	struct evsel *evsel;
+ 	struct cgroup *cgrp, *leader_cgrp;
+ 	__u32 i, cpu;
+ 	__u32 nr_cpus = evlist->core.all_cpus->nr;
+ 	int total_cpus = cpu__max_cpu();
+ 	int map_size, map_fd;
+ 	int prog_fd, err;
+
+ 	skel = bperf_cgroup_bpf__open();
+ 	if (!skel) {
+ 		pr_err("Failed to open cgroup skeleton\n");
+ 		return -1;
+ 	}
+
+ 	skel->rodata->num_cpus = total_cpus;
+ 	skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups;
+
+ 	BUG_ON(evlist->core.nr_entries % nr_cgroups != 0);
+
+ 	/* we need one copy of events per cpu for reading */
+ 	map_size = total_cpus * evlist->core.nr_entries / nr_cgroups;
+ 	bpf_map__resize(skel->maps.events, map_size);
+ 	bpf_map__resize(skel->maps.cgrp_idx, nr_cgroups);
+ 	/* previous result is saved in a per-cpu array */
+ 	map_size = evlist->core.nr_entries / nr_cgroups;
+ 	bpf_map__resize(skel->maps.prev_readings, map_size);
+ 	/* cgroup result needs all events (per-cpu) */
+ 	map_size = evlist->core.nr_entries;
+ 	bpf_map__resize(skel->maps.cgrp_readings, map_size);
+
+ 	set_max_rlimit();
+
+ 	err = bperf_cgroup_bpf__load(skel);
+ 	if (err) {
+ 		pr_err("Failed to load cgroup skeleton\n");
+ 		goto out;
+ 	}
+
+ 	if (cgroup_is_v2("perf_event") > 0)
+ 		skel->bss->use_cgroup_v2 = 1;
+
+ 	err = -1;
+
+ 	cgrp_switch = evsel__new(&cgrp_switch_attr);
+ 	if (evsel__open_per_cpu(cgrp_switch, evlist->core.all_cpus, -1) < 0) {
+ 		pr_err("Failed to open cgroup switches event\n");
+ 		goto out;
+ 	}
+
+ 	for (i = 0; i < nr_cpus; i++) {
+ 		link = bpf_program__attach_perf_event(skel->progs.on_cgrp_switch,
+ 						      FD(cgrp_switch, i));
+ 		if (IS_ERR(link)) {
+ 			pr_err("Failed to attach cgroup program\n");
+ 			err = PTR_ERR(link);
+ 			goto out;
+ 		}
+ 	}
+
+ 	/*
+ 	 * Update cgrp_idx map from cgroup-id to event index.
+ 	 */
+ 	cgrp = NULL;
+ 	i = 0;
+
+ 	evlist__for_each_entry(evlist, evsel) {
+ 		if (cgrp == NULL || evsel->cgrp == leader_cgrp) {
+ 			leader_cgrp = evsel->cgrp;
+ 			evsel->cgrp = NULL;
+
+ 			/* open single copy of the events w/o cgroup */
+ 			err = evsel__open_per_cpu(evsel, evlist->core.all_cpus, -1);
+ 			if (err) {
+ 				pr_err("Failed to open first cgroup events\n");
+ 				goto out;
+ 			}
+
+ 			map_fd = bpf_map__fd(skel->maps.events);
+ 			for (cpu = 0; cpu < nr_cpus; cpu++) {
+ 				int fd = FD(evsel, cpu);
+ 				__u32 idx = evsel->idx * total_cpus +
+ 					evlist->core.all_cpus->map[cpu];
+
+ 				err = bpf_map_update_elem(map_fd, &idx, &fd,
+ 							  BPF_ANY);
+ 				if (err < 0) {
+ 					pr_err("Failed to update perf_event fd\n");
+ 					goto out;
+ 				}
+ 			}
+
+ 			evsel->cgrp = leader_cgrp;
+ 		}
+ 		evsel->supported = true;
+
+ 		if (evsel->cgrp == cgrp)
+ 			continue;
+
+ 		cgrp = evsel->cgrp;
+
+ 		if (read_cgroup_id(cgrp) < 0) {
+ 			pr_err("Failed to get cgroup id\n");
+ 			err = -1;
+ 			goto out;
+ 		}
+
+ 		map_fd = bpf_map__fd(skel->maps.cgrp_idx);
+ 		err = bpf_map_update_elem(map_fd, &cgrp->id, &i, BPF_ANY);
+ 		if (err < 0) {
+ 			pr_err("Failed to update cgroup index map\n");
+ 			goto out;
+ 		}
+
+ 		i++;
+ 	}
+
+ 	/*
+ 	 * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check
+ 	 * whether the kernel supports it.
+ 	 */
+ 	prog_fd = bpf_program__fd(skel->progs.trigger_read);
+ 	err = bperf_trigger_reading(prog_fd, 0);
+ 	if (err) {
+ 		pr_warning("The kernel does not support test_run for raw_tp BPF programs.\n"
+ 			   "Therefore, --for-each-cgroup might show inaccurate readings\n");
+ 		err = 0;
+ 	}
+
+ out:
+ 	return err;
+ }
+
+ static int bperf_cgrp__load(struct evsel *evsel,
+ 			    struct target *target __maybe_unused)
+ {
+ 	static bool bperf_loaded = false;
+
+ 	evsel->bperf_leader_prog_fd = -1;
+ 	evsel->bperf_leader_link_fd = -1;
+
+ 	if (!bperf_loaded && bperf_load_program(evsel->evlist))
+ 		return -1;
+
+ 	bperf_loaded = true;
+ 	/* just to bypass bpf_counter_skip() */
+ 	evsel->follower_skel = (struct bperf_follower_bpf *)skel;
+
+ 	return 0;
+ }
+
+ static int bperf_cgrp__install_pe(struct evsel *evsel __maybe_unused,
+ 				  int cpu __maybe_unused, int fd __maybe_unused)
+ {
+ 	/* nothing to do */
+ 	return 0;
+ }
+
+ /*
+  * trigger the leader prog on each cpu, so the cgrp_reading map could get
+  * the latest results.
+  */
+ static int bperf_cgrp__sync_counters(struct evlist *evlist)
+ {
+ 	int i, cpu;
+ 	int nr_cpus = evlist->core.all_cpus->nr;
+ 	int prog_fd = bpf_program__fd(skel->progs.trigger_read);
+
+ 	for (i = 0; i < nr_cpus; i++) {
+ 		cpu = evlist->core.all_cpus->map[i];
+ 		bperf_trigger_reading(prog_fd, cpu);
+ 	}
+
+ 	return 0;
+ }
+
+ static int bperf_cgrp__enable(struct evsel *evsel)
+ {
+ 	if (evsel->idx)
+ 		return 0;
+
+ 	bperf_cgrp__sync_counters(evsel->evlist);
+
+ 	skel->bss->enabled = 1;
+ 	return 0;
+ }
+
+ static int bperf_cgrp__disable(struct evsel *evsel)
+ {
+ 	if (evsel->idx)
+ 		return 0;
+
+ 	bperf_cgrp__sync_counters(evsel->evlist);
+
+ 	skel->bss->enabled = 0;
+ 	return 0;
+ }
+
+ static int bperf_cgrp__read(struct evsel *evsel)
+ {
+ 	struct evlist *evlist = evsel->evlist;
+ 	int i, cpu, nr_cpus = evlist->core.all_cpus->nr;
+ 	int total_cpus = cpu__max_cpu();
+ 	struct perf_counts_values *counts;
+ 	struct bpf_perf_event_value *values;
+ 	int reading_map_fd, err = 0;
+ 	__u32 idx;
+
+ 	if (evsel->idx)
+ 		return 0;
+
+ 	bperf_cgrp__sync_counters(evsel->evlist);
+
+ 	values = calloc(total_cpus, sizeof(*values));
+ 	if (values == NULL)
+ 		return -ENOMEM;
+
+ 	reading_map_fd = bpf_map__fd(skel->maps.cgrp_readings);
+
+ 	evlist__for_each_entry(evlist, evsel) {
+ 		idx = evsel->idx;
+ 		err = bpf_map_lookup_elem(reading_map_fd, &idx, values);
+ 		if (err) {
+ 			pr_err("bpf map lookup failed: idx=%u, event=%s, cgrp=%s\n",
+ 			       idx, evsel__name(evsel), evsel->cgrp->name);
+ 			goto out;
+ 		}
+
+ 		for (i = 0; i < nr_cpus; i++) {
+ 			cpu = evlist->core.all_cpus->map[i];
+
+ 			counts = perf_counts(evsel->counts, i, 0);
+ 			counts->val = values[cpu].counter;
+ 			counts->ena = values[cpu].enabled;
+ 			counts->run = values[cpu].running;
+ 		}
+ 	}
+
+ out:
+ 	free(values);
+ 	return err;
+ }
+
+ static int bperf_cgrp__destroy(struct evsel *evsel)
+ {
+ 	if (evsel->idx)
+ 		return 0;
+
+ 	bperf_cgroup_bpf__destroy(skel);
+ 	evsel__delete(cgrp_switch); // it'll destroy on_switch progs too
+
+ 	return 0;
+ }
+
+ struct bpf_counter_ops bperf_cgrp_ops = {
+ 	.load       = bperf_cgrp__load,
+ 	.enable     = bperf_cgrp__enable,
+ 	.disable    = bperf_cgrp__disable,
+ 	.read       = bperf_cgrp__read,
+ 	.install_pe = bperf_cgrp__install_pe,
+ 	.destroy    = bperf_cgrp__destroy,
+ };
+191
tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
+ // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+ // Copyright (c) 2021 Facebook
+ // Copyright (c) 2021 Google
+ #include "vmlinux.h"
+ #include <bpf/bpf_helpers.h>
+ #include <bpf/bpf_tracing.h>
+ #include <bpf/bpf_core_read.h>
+
+ #define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
+ #define MAX_EVENTS  32  // max events per cgroup: arbitrary
+
+ // NOTE: many of map and global data will be modified before loading
+ //       from the userspace (perf tool) using the skeleton helpers.
+
+ // single set of global perf events to measure
+ struct {
+ 	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ 	__uint(key_size, sizeof(__u32));
+ 	__uint(value_size, sizeof(int));
+ 	__uint(max_entries, 1);
+ } events SEC(".maps");
+
+ // from cgroup id to event index
+ struct {
+ 	__uint(type, BPF_MAP_TYPE_HASH);
+ 	__uint(key_size, sizeof(__u64));
+ 	__uint(value_size, sizeof(__u32));
+ 	__uint(max_entries, 1);
+ } cgrp_idx SEC(".maps");
+
+ // per-cpu event snapshots to calculate delta
+ struct {
+ 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ 	__uint(key_size, sizeof(__u32));
+ 	__uint(value_size, sizeof(struct bpf_perf_event_value));
+ } prev_readings SEC(".maps");
+
+ // aggregated event values for each cgroup (per-cpu)
+ // will be read from the user-space
+ struct {
+ 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ 	__uint(key_size, sizeof(__u32));
+ 	__uint(value_size, sizeof(struct bpf_perf_event_value));
+ } cgrp_readings SEC(".maps");
+
+ const volatile __u32 num_events = 1;
+ const volatile __u32 num_cpus = 1;
+
+ int enabled = 0;
+ int use_cgroup_v2 = 0;
+
+ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
+ {
+ 	struct task_struct *p = (void *)bpf_get_current_task();
+ 	struct cgroup *cgrp;
+ 	register int i = 0;
+ 	__u32 *elem;
+ 	int level;
+ 	int cnt;
+
+ 	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_event_cgrp_id], cgroup);
+ 	level = BPF_CORE_READ(cgrp, level);
+
+ 	for (cnt = 0; i < MAX_LEVELS; i++) {
+ 		__u64 cgrp_id;
+
+ 		if (i > level)
+ 			break;
+
+ 		// convert cgroup-id to a map index
+ 		cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
+ 		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
+ 		if (!elem)
+ 			continue;
+
+ 		cgrps[cnt++] = *elem;
+ 		if (cnt == size)
+ 			break;
+ 	}
+
+ 	return cnt;
+ }
+
+ static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
+ {
+ 	register int i = 0;
+ 	__u32 *elem;
+ 	int cnt;
+
+ 	for (cnt = 0; i < MAX_LEVELS; i++) {
+ 		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);
+
+ 		if (cgrp_id == 0)
+ 			break;
+
+ 		// convert cgroup-id to a map index
+ 		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
+ 		if (!elem)
+ 			continue;
+
+ 		cgrps[cnt++] = *elem;
+ 		if (cnt == size)
+ 			break;
+ 	}
+
+ 	return cnt;
+ }
+
+ static int bperf_cgroup_count(void)
+ {
+ 	register __u32 idx = 0; // to have it in a register to pass BPF verifier
+ 	register int c = 0;
+ 	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
+ 	__u32 cpu = bpf_get_smp_processor_id();
+ 	__u32 cgrp_idx[MAX_LEVELS];
+ 	int cgrp_cnt;
+ 	__u32 key, cgrp;
+ 	long err;
+
+ 	if (use_cgroup_v2)
+ 		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
+ 	else
+ 		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);
+
+ 	for ( ; idx < MAX_EVENTS; idx++) {
+ 		if (idx == num_events)
+ 			break;
+
+ 		// XXX: do not pass idx directly (for verifier)
+ 		key = idx;
+ 		// this is per-cpu array for diff
+ 		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
+ 		if (!prev_val) {
+ 			val.counter = val.enabled = val.running = 0;
+ 			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);
+
+ 			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
+ 			if (!prev_val)
+ 				continue;
+ 		}
+
+ 		// read from global perf_event array
+ 		key = idx * num_cpus + cpu;
+ 		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
+ 		if (err)
+ 			continue;
+
+ 		if (enabled) {
+ 			delta.counter = val.counter - prev_val->counter;
+ 			delta.enabled = val.enabled - prev_val->enabled;
+ 			delta.running = val.running - prev_val->running;
+
+ 			for (c = 0; c < MAX_LEVELS; c++) {
+ 				if (c == cgrp_cnt)
+ 					break;
+
+ 				cgrp = cgrp_idx[c];
+
+ 				// aggregate the result by cgroup
+ 				key = cgrp * num_events + idx;
+ 				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
+ 				if (cgrp_val) {
+ 					cgrp_val->counter += delta.counter;
+ 					cgrp_val->enabled += delta.enabled;
+ 					cgrp_val->running += delta.running;
+ 				} else {
+ 					bpf_map_update_elem(&cgrp_readings, &key,
+ 							    &delta, BPF_ANY);
+ 				}
+ 			}
+ 		}
+
+ 		*prev_val = val;
+ 	}
+ 	return 0;
+ }
+
+ // This will be attached to cgroup-switches event for each cpu
+ SEC("perf_events")
+ int BPF_PROG(on_cgrp_switch)
+ {
+ 	return bperf_cgroup_count();
+ }
+
+ SEC("raw_tp/sched_switch")
+ int BPF_PROG(trigger_read)
+ {
+ 	return bperf_cgroup_count();
+ }
+
+ char LICENSE[] SEC("license") = "Dual BSD/GPL";
+2
tools/perf/util/cgroup.c
···
  #include <regex.h>

  int nr_cgroups;
+ bool cgrp_event_expanded;

  /* used to match cgroup name with patterns */
  struct cgroup_name {
···
  	}

  	ret = 0;
+ 	cgrp_event_expanded = true;

  out_err:
  	evlist__delete(orig_list);
+1
tools/perf/util/cgroup.h
···
  };

  extern int nr_cgroups; /* number of explicit cgroups defined */
+ extern bool cgrp_event_expanded;

  struct cgroup *cgroup__get(struct cgroup *cgroup);
  void cgroup__put(struct cgroup *cgroup);