
perf record: Enable off-cpu analysis with BPF

Add the --off-cpu option to enable off-cpu profiling with BPF. It uses
a bpf_output event renamed to "offcpu-time". Samples are synthesized
at the end of the record session using data from a BPF map which
contains the off-cpu time aggregated at context switches. Because of
this, off-cpu profiling requires root privileges.

Each sample has a separate user stacktrace, so kernel threads are
skipped. The sample ip is set from the stacktrace and other sample
fields are updated accordingly. Currently only some basic sample
types are handled.

The sample timestamp is set to a dummy value so the samples do not
interfere with other events during sorting: it starts from a very
large initial value and is incremented for each sample processed.
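This scheme can be sketched as a tiny standalone C helper (the
OFF_CPU_TIMESTAMP constant matches the one in bpf_off_cpu.c below, but
next_dummy_tstamp() is an illustrative name, not a perf function):

```c
#include <stdint.h>

/* no real timestamp needed; just place off-cpu samples after everything else */
#define OFF_CPU_TIMESTAMP (~0ull << 32)

/* hand out monotonically increasing dummy timestamps, one per synthesized sample */
static uint64_t next_dummy_tstamp(void)
{
	static uint64_t tstamp = OFF_CPU_TIMESTAMP;

	return tstamp++;
}
```

Starting from ~0ull << 32 keeps every dummy timestamp far above any
realistic nanosecond clock value, so sorted output places these samples
last while preserving their relative order.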

A nice property is that it can be used together with regular profiling
like cpu cycles. If you don't want that, you can use a dummy event to
enable off-cpu profiling only.
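For example, an off-cpu-only recording might look like this (the exact
event spec is shown as an assumption; the dummy software event merely
avoids adding a real sampling event such as cycles):

```shell
# off-cpu profiling only: pair --off-cpu with a dummy event
# (requires root and a perf built with BUILD_BPF_SKEL=1)
sudo perf record --off-cpu -e dummy -- perf bench sched messaging -l 1000
```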

Example output:
$ sudo perf record --off-cpu perf bench sched messaging -l 1000

$ sudo perf report --stdio --call-graph=no
# Total Lost Samples: 0
#
# Samples: 41K of event 'cycles'
# Event count (approx.): 42137343851
...

# Samples: 1K of event 'offcpu-time'
# Event count (approx.): 587990831640
#
# Children Self Command Shared Object Symbol
# ........ ........ ............... .................. .........................
#
81.66% 0.00% sched-messaging libc-2.33.so [.] __libc_start_main
81.66% 0.00% sched-messaging perf [.] cmd_bench
81.66% 0.00% sched-messaging perf [.] main
81.66% 0.00% sched-messaging perf [.] run_builtin
81.43% 0.00% sched-messaging perf [.] bench_sched_messaging
40.86% 40.86% sched-messaging libpthread-2.33.so [.] __read
37.66% 37.66% sched-messaging libpthread-2.33.so [.] __write
2.91% 2.91% sched-messaging libc-2.33.so [.] __poll
...

As you can see, most of the off-cpu time was spent in read and write
within bench_sched_messaging(). The --call-graph=no option was added
just to keep the output concise here.

It uses the perf hooks facility to control the BPF program during the
record session rather than adding new BPF/off-cpu specific calls.
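The hook facility boils down to named callback slots that the record
code invokes at fixed points ("record_start", "record_end"). A minimal
self-contained model of that idea (set_hook()/invoke_hook() are
illustrative names; the real declarations live in util/perf-hooks.h)
looks like:

```c
#include <stddef.h>
#include <string.h>

/* minimal model of named hook slots, as used for "record_start"/"record_end" */
typedef void (*hook_fn)(void *arg);

struct hook {
	const char *name;
	hook_fn fn;
	void *arg;
};

static struct hook hooks[] = {
	{ "record_start", NULL, NULL },
	{ "record_end",   NULL, NULL },
};

/* register a callback in a named slot; -1 if the slot doesn't exist */
static int set_hook(const char *name, hook_fn fn, void *arg)
{
	for (size_t i = 0; i < sizeof(hooks) / sizeof(hooks[0]); i++) {
		if (!strcmp(hooks[i].name, name)) {
			hooks[i].fn = fn;
			hooks[i].arg = arg;
			return 0;
		}
	}
	return -1;
}

/* called by the record code at the matching point in the session */
static void invoke_hook(const char *name)
{
	for (size_t i = 0; i < sizeof(hooks) / sizeof(hooks[0]); i++) {
		if (!strcmp(hooks[i].name, name) && hooks[i].fn)
			hooks[i].fn(hooks[i].arg);
	}
}

/* example callback, mirroring how off_cpu_start() flips skel->bss->enabled */
static int record_started;
static void on_record_start(void *arg)
{
	(void)arg;
	record_started = 1;
}
```

This is why the BPF side only needs an `enabled` flag: the generic hook
points toggle it without any off-cpu-specific plumbing in the record
loop.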

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20220518224725.742882-3-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

Authored by Namhyung Kim, committed by Arnaldo Carvalho de Melo (edc41a10, 303ead45)

404 additions

tools/perf/Documentation/perf-record.txt (+10)

 If the URLs is not specified, the value of DEBUGINFOD_URLS
 system environment variable is used.

+--off-cpu::
+	Enable off-cpu profiling with BPF.  The BPF program will collect
+	task scheduling information with (user) stacktrace and save them
+	as sample data of a software event named "offcpu-time".  The
+	sample period will have the time the task slept in nanoseconds.
+
+	Note that BPF can collect stack traces using frame pointer ("fp")
+	only, as of now.  So the applications built without the frame
+	pointer might see bogus addresses.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1]
tools/perf/Makefile.perf (+1)

 SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
 SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
 SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
+SKELETONS += $(SKEL_OUT)/off_cpu.skel.h

 $(SKEL_TMP_OUT) $(LIBBPF_OUTPUT):
 	$(Q)$(MKDIR) -p $@
tools/perf/builtin-record.c (+25)

 #include "util/clockid.h"
 #include "util/pmu-hybrid.h"
 #include "util/evlist-hybrid.h"
+#include "util/off_cpu.h"
 #include "asm/bug.h"
 #include "perf.h"
 #include "cputopo.h"
···
 	bool buildid_mmap;
 	bool timestamp_filename;
 	bool timestamp_boundary;
+	bool off_cpu;
 	struct switch_output switch_output;
 	unsigned long long samples;
 	unsigned long output_max_size; /* = 0: unlimited */
···
 	evsel__set_sample_bit(evsel, TIME);

 	return 0;
+}
+
+static int record__config_off_cpu(struct record *rec)
+{
+	return off_cpu_prepare(rec->evlist);
 }

 static bool record__kcore_readable(struct machine *machine)
···
 	} else
 		status = err;

+	if (rec->off_cpu)
+		rec->bytes_written += off_cpu_write(rec->session);
+
 	record__synthesize(rec, true);
 	/* this will be recalculated during process_buildids() */
 	rec->samples = 0;
···
 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
 			    "write collected trace data into several data files using parallel threads",
 			    record__parse_threads),
+	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
 	OPT_END()
 };
···
 # undef REASON
 #endif

+#ifndef HAVE_BPF_SKEL
+# define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
+	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
+# undef set_nobuild
+#endif
+
 	rec->opts.affinity = PERF_AFFINITY_SYS;

 	rec->evlist = evlist__new();
···
 		err = record__config_text_poke(rec->evlist);
 		if (err) {
 			pr_err("record__config_text_poke failed, error %d\n", err);
+			goto out;
+		}
+	}
+
+	if (rec->off_cpu) {
+		err = record__config_off_cpu(rec);
+		if (err) {
+			pr_err("record__config_off_cpu failed, error %d\n", err);
 			goto out;
 		}
 	}
tools/perf/util/Build (+1)

 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_ftrace.o
+perf-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
 perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
 perf-$(CONFIG_LIBELF) += symbol-elf.o
 perf-$(CONFIG_LIBELF) += probe-file.o
tools/perf/util/bpf_off_cpu.c (+204, new file)

// SPDX-License-Identifier: GPL-2.0
#include "util/bpf_counter.h"
#include "util/debug.h"
#include "util/evsel.h"
#include "util/evlist.h"
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/session.h"
#include <bpf/bpf.h>

#include "bpf_skel/off_cpu.skel.h"

#define MAX_STACKS  32
/* we don't need actual timestamp, just want to put the samples at last */
#define OFF_CPU_TIMESTAMP  (~0ull << 32)

static struct off_cpu_bpf *skel;

struct off_cpu_key {
	u32 pid;
	u32 tgid;
	u32 stack_id;
	u32 state;
};

union off_cpu_data {
	struct perf_event_header hdr;
	u64 array[1024 / sizeof(u64)];
};

static int off_cpu_config(struct evlist *evlist)
{
	struct evsel *evsel;
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_BPF_OUTPUT,
		.size	= sizeof(attr), /* to capture ABI version */
	};
	char *evname = strdup(OFFCPU_EVENT);

	if (evname == NULL)
		return -ENOMEM;

	evsel = evsel__new(&attr);
	if (!evsel) {
		free(evname);
		return -ENOMEM;
	}

	evsel->core.attr.freq = 1;
	evsel->core.attr.sample_period = 1;
	/* off-cpu analysis depends on stack trace */
	evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;

	evlist__add(evlist, evsel);

	free(evsel->name);
	evsel->name = evname;

	return 0;
}

static void off_cpu_start(void *arg __maybe_unused)
{
	skel->bss->enabled = 1;
}

static void off_cpu_finish(void *arg __maybe_unused)
{
	skel->bss->enabled = 0;
	off_cpu_bpf__destroy(skel);
}

int off_cpu_prepare(struct evlist *evlist)
{
	int err;

	if (off_cpu_config(evlist) < 0) {
		pr_err("Failed to config off-cpu BPF event\n");
		return -1;
	}

	set_max_rlimit();

	skel = off_cpu_bpf__open_and_load();
	if (!skel) {
		pr_err("Failed to open off-cpu BPF skeleton\n");
		return -1;
	}

	err = off_cpu_bpf__attach(skel);
	if (err) {
		pr_err("Failed to attach off-cpu BPF skeleton\n");
		goto out;
	}

	if (perf_hooks__set_hook("record_start", off_cpu_start, NULL) ||
	    perf_hooks__set_hook("record_end", off_cpu_finish, NULL)) {
		pr_err("Failed to attach off-cpu skeleton\n");
		goto out;
	}

	return 0;

out:
	off_cpu_bpf__destroy(skel);
	return -1;
}

int off_cpu_write(struct perf_session *session)
{
	int bytes = 0, size;
	int fd, stack;
	u64 sample_type, val, sid = 0;
	struct evsel *evsel;
	struct perf_data_file *file = &session->data->file;
	struct off_cpu_key prev, key;
	union off_cpu_data data = {
		.hdr = {
			.type = PERF_RECORD_SAMPLE,
			.misc = PERF_RECORD_MISC_USER,
		},
	};
	u64 tstamp = OFF_CPU_TIMESTAMP;

	skel->bss->enabled = 0;

	evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return 0;
	}

	sample_type = evsel->core.attr.sample_type;

	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
		if (evsel->core.id)
			sid = evsel->core.id[0];
	}

	fd = bpf_map__fd(skel->maps.off_cpu);
	stack = bpf_map__fd(skel->maps.stacks);
	memset(&prev, 0, sizeof(prev));

	while (!bpf_map_get_next_key(fd, &prev, &key)) {
		int n = 1;  /* start from perf_event_header */
		int ip_pos = -1;

		bpf_map_lookup_elem(fd, &key, &val);

		if (sample_type & PERF_SAMPLE_IDENTIFIER)
			data.array[n++] = sid;
		if (sample_type & PERF_SAMPLE_IP) {
			ip_pos = n;
			data.array[n++] = 0;  /* will be updated */
		}
		if (sample_type & PERF_SAMPLE_TID)
			data.array[n++] = (u64)key.pid << 32 | key.tgid;
		if (sample_type & PERF_SAMPLE_TIME)
			data.array[n++] = tstamp;
		if (sample_type & PERF_SAMPLE_ID)
			data.array[n++] = sid;
		if (sample_type & PERF_SAMPLE_CPU)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_PERIOD)
			data.array[n++] = val;
		if (sample_type & PERF_SAMPLE_CALLCHAIN) {
			int len = 0;

			/* data.array[n] is callchain->nr (updated later) */
			data.array[n + 1] = PERF_CONTEXT_USER;
			data.array[n + 2] = 0;

			bpf_map_lookup_elem(stack, &key.stack_id, &data.array[n + 2]);
			while (data.array[n + 2 + len])
				len++;

			/* update length of callchain */
			data.array[n] = len + 1;

			/* update sample ip with the first callchain entry */
			if (ip_pos >= 0)
				data.array[ip_pos] = data.array[n + 2];

			/* calculate sample callchain data array length */
			n += len + 2;
		}
		/* TODO: handle more sample types */

		size = n * sizeof(u64);
		data.hdr.size = size;
		bytes += size;

		if (perf_data_file__write(file, &data, size) < 0) {
			pr_err("failed to write perf data, error: %m\n");
			return bytes;
		}

		prev = key;
		/* increase dummy timestamp to sort later samples */
		tstamp++;
	}
	return bytes;
}
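One detail worth calling out in off_cpu_write() is the PERF_SAMPLE_TID
slot, which packs pid and tgid into a single u64. A standalone sketch
of that packing (the helper names here are ours, not perf's):

```c
#include <stdint.h>

/* pack pid/tgid the way the PERF_SAMPLE_TID slot does: pid in the high half */
static uint64_t pack_tid(uint32_t pid, uint32_t tgid)
{
	return (uint64_t)pid << 32 | tgid;
}

static uint32_t unpack_pid(uint64_t v)
{
	return v >> 32;
}

static uint32_t unpack_tgid(uint64_t v)
{
	return (uint32_t)v;
}
```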
tools/perf/util/bpf_skel/off_cpu.bpf.c (+139, new file)

// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
};

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

int enabled = 0;

/*
 * Old kernel used to call it task_struct->state and now it's '__state'.
 * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	if (bpf_core_field_exists(t->__state))
		return BPF_CORE_READ(t, __state);

	/* recast pointer to capture task_struct___old type for compiler */
	struct task_struct___old *t_old = (void *)t;

	/* now use old "state" name of the field */
	return BPF_CORE_READ(t_old, state);
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	__u64 ts;
	int state;
	__u32 stack_id;
	struct task_struct *prev, *next;
	struct tstamp_data *pelem;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];
	state = get_task_state(prev);

	ts = bpf_ktime_get_ns();

	if (prev->flags & PF_KTHREAD)
		goto next;
	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent to reuse the timestamp later */
		pelem->timestamp = 0;
	}

	return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
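The accounting in on_switch() boils down to: stamp a task when it is
switched out in a sleeping state, and on switch-in add the elapsed
delta into its aggregated total. A plain-C model of that bookkeeping
(struct task_acct and the two helpers are illustrative only; the real
code keys the totals by pid/tgid/stack_id/state in the off_cpu map):

```c
#include <stdint.h>

struct task_acct {
	uint64_t sleep_start;  /* 0 means "no pending off-cpu interval" */
	uint64_t total_off;    /* aggregated off-cpu time in ns */
};

/* task switched out while sleeping: remember when */
static void switch_out(struct task_acct *t, uint64_t now)
{
	t->sleep_start = now;
}

/* task switched back in: accumulate the delta, clear the stamp */
static void switch_in(struct task_acct *t, uint64_t now)
{
	if (t->sleep_start) {
		t->total_off += now - t->sleep_start;
		t->sleep_start = 0;  /* prevent reusing the timestamp later */
	}
}
```

Clearing the stamp after accumulation is what prevents a stale
timestamp from being counted twice when the task is scheduled in again
without an intervening sleep.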
tools/perf/util/off_cpu.h (+24, new file)

#ifndef PERF_UTIL_OFF_CPU_H
#define PERF_UTIL_OFF_CPU_H

struct evlist;
struct perf_session;

#define OFFCPU_EVENT  "offcpu-time"

#ifdef HAVE_BPF_SKEL
int off_cpu_prepare(struct evlist *evlist);
int off_cpu_write(struct perf_session *session);
#else
static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused)
{
	return -1;
}

static inline int off_cpu_write(struct perf_session *session __maybe_unused)
{
	return -1;
}
#endif

#endif  /* PERF_UTIL_OFF_CPU_H */