Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf trace: Do not hardcode the size of the tracepoint common_ fields

We shouldn't hardcode the size of the tracepoint common_ fields, use the
offset of the 'id'/'__syscall_nr' field in the sys_enter event instead.

This caused the augmented syscalls code to fail on a particular build of a
PREEMPT_RT_FULL kernel where these extra 'common_migrate_disable' and
'common_padding' fields were before the syscall id one:

# cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/format
name: sys_enter
ID: 22
format:
field:unsigned short common_type; offset:0; size:2; signed:0;
field:unsigned char common_flags; offset:2; size:1; signed:0;
field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
field:int common_pid; offset:4; size:4; signed:1;
field:unsigned short common_migrate_disable; offset:8; size:2; signed:0;
field:unsigned short common_padding; offset:10; size:2; signed:0;

field:long id; offset:16; size:8; signed:1;
field:unsigned long args[6]; offset:24; size:48; signed:0;

print fmt: "NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)", REC->id, REC->args[0], REC->args[1], REC->args[2], REC->args[3], REC->args[4], REC->args[5]
#

All those 'common_' prefixed fields are zeroed when they hit a BPF tracepoint
hook, we better just discard those, i.e. somehow pass an offset to the
BPF program from the start of the ctx and make adjustments in the 'perf trace'
handlers to adjust the offset of the syscall arg offsets obtained from tracefs.

Till then, fix it the quick way and add this to the augmented_raw_syscalls.c to
get it to work in such kernels:

diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
index 53c233370fae..1f746f931e13 100644
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
@@ -38,12 +38,14 @@ struct bpf_map SEC("maps") syscalls = {

struct syscall_enter_args {
unsigned long long common_tp_fields;
+ long rt_common_tp_fields;
long syscall_nr;
unsigned long args[6];
};

struct syscall_exit_args {
unsigned long long common_tp_fields;
+ long rt_common_tp_fields;
long syscall_nr;
long ret;
};

Just to check that this was the case. Fix it properly later, for now remove the
hardcoding of the offset in the 'perf trace' side and document the situation
with this patch.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Luis Cláudio Gonçalves <lclaudio@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-2pqavrktqkliu5b9nzouio21@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>

+52 -21
+52 -21
tools/perf/builtin-trace.c
··· 112 112 } stats; 113 113 unsigned int max_stack; 114 114 unsigned int min_stack; 115 - bool sort_events; 115 + int raw_augmented_syscalls_args_size; 116 116 bool raw_augmented_syscalls; 117 + bool sort_events; 117 118 bool not_ev_qualifier; 118 119 bool live; 119 120 bool full_time; ··· 284 283 return -ENOENT; 285 284 } 286 285 287 - static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel) 286 + static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel, struct perf_evsel *tp) 288 287 { 289 288 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp)); 290 289 291 - if (evsel->priv != NULL) { /* field, sizeof_field, offsetof_field */ 292 - if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap)) 290 + if (evsel->priv != NULL) { 291 + struct tep_format_field *syscall_id = perf_evsel__field(tp, "id"); 292 + if (syscall_id == NULL) 293 + syscall_id = perf_evsel__field(tp, "__syscall_nr"); 294 + if (syscall_id == NULL) 295 + goto out_delete; 296 + if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap)) 293 297 goto out_delete; 294 298 295 299 return 0; ··· 1774 1768 return printed; 1775 1769 } 1776 1770 1777 - static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, bool raw_augmented) 1771 + static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size) 1778 1772 { 1779 1773 void *augmented_args = NULL; 1780 1774 /* 1781 1775 * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter 1782 - * and there we get all 6 syscall args plus the tracepoint common 1783 - * fields (sizeof(long)) and the syscall_nr (another long). So we check 1784 - * if that is the case and if so don't look after the sc->args_size, 1785 - * but always after the full raw_syscalls:sys_enter payload, which is 1786 - * fixed. 
1776 + * and there we get all 6 syscall args plus the tracepoint common fields 1777 + * that gets calculated at the start and the syscall_nr (another long). 1778 + * So we check if that is the case and if so don't look after the 1779 + * sc->args_size but always after the full raw_syscalls:sys_enter payload, 1780 + * which is fixed. 1787 1781 * 1788 1782 * We'll revisit this later to pass s->args_size to the BPF augmenter 1789 1783 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it ··· 1791 1785 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace 1792 1786 * traffic to just what is needed for each syscall. 1793 1787 */ 1794 - int args_size = raw_augmented ? (8 * (int)sizeof(long)) : sc->args_size; 1788 + int args_size = raw_augmented_args_size ?: sc->args_size; 1795 1789 1796 1790 *augmented_args_size = sample->raw_size - args_size; 1797 1791 if (*augmented_args_size > 0) ··· 1845 1839 * here and avoid using augmented syscalls when the evsel is the raw_syscalls one. 
1846 1840 */ 1847 1841 if (evsel != trace->syscalls.events.sys_enter) 1848 - augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls); 1842 + augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size); 1849 1843 ttrace->entry_time = sample->time; 1850 1844 msg = ttrace->entry_str; 1851 1845 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); ··· 1903 1897 goto out_put; 1904 1898 1905 1899 args = perf_evsel__sc_tp_ptr(evsel, args, sample); 1906 - augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls); 1900 + augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size); 1907 1901 syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); 1908 1902 fprintf(trace->output, "%s", msg); 1909 1903 err = 0; ··· 3820 3814 * syscall. 
3821 3815 */ 3822 3816 if (trace.syscalls.events.augmented) { 3823 - evsel = trace.syscalls.events.augmented; 3824 - 3825 - if (perf_evsel__init_augmented_syscall_tp(evsel) || 3826 - perf_evsel__init_augmented_syscall_tp_args(evsel)) 3827 - goto out; 3828 - evsel->handler = trace__sys_enter; 3829 - 3830 3817 evlist__for_each_entry(trace.evlist, evsel) { 3831 3818 bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0; 3832 3819 ··· 3828 3829 goto init_augmented_syscall_tp; 3829 3830 } 3830 3831 3832 + if (strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_enter") == 0) { 3833 + struct perf_evsel *augmented = trace.syscalls.events.augmented; 3834 + if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) || 3835 + perf_evsel__init_augmented_syscall_tp_args(augmented)) 3836 + goto out; 3837 + augmented->handler = trace__sys_enter; 3838 + } 3839 + 3831 3840 if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) { 3841 + struct syscall_tp *sc; 3832 3842 init_augmented_syscall_tp: 3833 - perf_evsel__init_augmented_syscall_tp(evsel); 3843 + if (perf_evsel__init_augmented_syscall_tp(evsel, evsel)) 3844 + goto out; 3845 + sc = evsel->priv; 3846 + /* 3847 + * For now with BPF raw_augmented we hook into 3848 + * raw_syscalls:sys_enter and there we get all 3849 + * 6 syscall args plus the tracepoint common 3850 + * fields and the syscall_nr (another long). 3851 + * So we check if that is the case and if so 3852 + * don't look after the sc->args_size but 3853 + * always after the full raw_syscalls:sys_enter 3854 + * payload, which is fixed. 3855 + * 3856 + * We'll revisit this later to pass 3857 + * s->args_size to the BPF augmenter (now 3858 + * tools/perf/examples/bpf/augmented_raw_syscalls.c, 3859 + * so that it copies only what we need for each 3860 + * syscall, like what happens when we use 3861 + * syscalls:sys_enter_NAME, so that we reduce 3862 + * the kernel/userspace traffic to just what is 3863 + * needed for each syscall. 
3864 + */ 3865 + if (trace.raw_augmented_syscalls) 3866 + trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset; 3834 3867 perf_evsel__init_augmented_syscall_tp_ret(evsel); 3835 3868 evsel->handler = trace__sys_exit; 3836 3869 }