Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bpf: Change syscall_nr type to int in struct syscall_tp_t

The linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add support
for lazy preemption")) that adds an extra member to struct trace_entry.
This causes the offset of the args field in struct trace_event_raw_sys_enter
to be different from the one in struct syscall_trace_enter:

struct trace_event_raw_sys_enter {
struct trace_entry ent; /* 0 12 */

/* XXX last struct has 3 bytes of padding */
/* XXX 4 bytes hole, try to pack */

long int id; /* 16 8 */
long unsigned int args[6]; /* 24 48 */
/* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
char __data[]; /* 72 0 */

/* size: 72, cachelines: 2, members: 4 */
/* sum members: 68, holes: 1, sum holes: 4 */
/* paddings: 1, sum paddings: 3 */
/* last cacheline: 8 bytes */
};

struct syscall_trace_enter {
struct trace_entry ent; /* 0 12 */

/* XXX last struct has 3 bytes of padding */

int nr; /* 12 4 */
long unsigned int args[]; /* 16 0 */

/* size: 16, cachelines: 1, members: 3 */
/* paddings: 1, sum paddings: 3 */
/* last cacheline: 16 bytes */
};

This, in turn, causes perf_event_set_bpf_prog() to fail while running the bpf
test_profiler testcase, because max_ctx_offset is calculated based on the
former struct, while off is calculated based on the latter:

10488 if (is_tracepoint || is_syscall_tp) {
10489 int off = trace_event_get_offsets(event->tp_event);
10490
10491 if (prog->aux->max_ctx_offset > off)
10492 return -EACCES;
10493 }

What the bpf program actually gets is a pointer to struct
syscall_tp_t, defined in kernel/trace/trace_syscalls.c. This patch fixes
the problem by aligning struct syscall_tp_t with struct
syscall_trace_(enter|exit) and changing the tests to use these structs
to dereference the context.

Signed-off-by: Artem Savkov <asavkov@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/bpf/20231013054219.172920-1-asavkov@redhat.com

authored by

Artem Savkov and committed by
Andrii Nakryiko
ba8ea723 9c1292ec

+5 -5
+2 -2
kernel/trace/trace_syscalls.c
··· 556 556 { 557 557 struct syscall_tp_t { 558 558 struct trace_entry ent; 559 - unsigned long syscall_nr; 559 + int syscall_nr; 560 560 unsigned long args[SYSCALL_DEFINE_MAXARGS]; 561 561 } __aligned(8) param; 562 562 int i; ··· 661 661 { 662 662 struct syscall_tp_t { 663 663 struct trace_entry ent; 664 - unsigned long syscall_nr; 664 + int syscall_nr; 665 665 unsigned long ret; 666 666 } __aligned(8) param; 667 667
+1 -1
tools/testing/selftests/bpf/progs/profiler.inc.h
··· 609 609 } 610 610 611 611 SEC("tracepoint/syscalls/sys_enter_kill") 612 - int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx) 612 + int tracepoint__syscalls__sys_enter_kill(struct syscall_trace_enter* ctx) 613 613 { 614 614 struct bpf_func_stats_ctx stats_ctx; 615 615
+2 -2
tools/testing/selftests/bpf/progs/test_vmlinux.c
··· 16 16 bool fentry_called = false; 17 17 18 18 SEC("tp/syscalls/sys_enter_nanosleep") 19 - int handle__tp(struct trace_event_raw_sys_enter *args) 19 + int handle__tp(struct syscall_trace_enter *args) 20 20 { 21 21 struct __kernel_timespec *ts; 22 22 long tv_nsec; 23 23 24 - if (args->id != __NR_nanosleep) 24 + if (args->nr != __NR_nanosleep) 25 25 return 0; 26 26 27 27 ts = (void *)args->args[0];