Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf: tracing: Have perf system calls read user space

Allow some of the system call events to read user space buffers. Instead
of just showing the pointer into user space, allow perf events to also
record the user space data that those pointers refer to. For example:

# perf record -e syscalls:sys_enter_openat ls /usr/bin
[..]
# perf script
ls 1024 [005] 52.902721: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbae321c "/etc/ld.so.cache", flags: 0x00080000, mode: 0x00000000
ls 1024 [005] 52.902899: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaae140 "/lib/x86_64-linux-gnu/libselinux.so.1", flags: 0x00080000, mode: 0x00000000
ls 1024 [005] 52.903471: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaae690 "/lib/x86_64-linux-gnu/libcap.so.2", flags: 0x00080000, mode: 0x00000000
ls 1024 [005] 52.903946: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaaebe0 "/lib/x86_64-linux-gnu/libc.so.6", flags: 0x00080000, mode: 0x00000000
ls 1024 [005] 52.904629: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaaf110 "/lib/x86_64-linux-gnu/libpcre2-8.so.0", flags: 0x00080000, mode: 0x00000000
ls 1024 [005] 52.906985: syscalls:sys_enter_openat: dfd: 0xffffffffffffff9c, filename: 0x7fc1dba92904 "/proc/filesystems", flags: 0x00080000, mode: 0x00000000
ls 1024 [005] 52.907323: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dba19490 "/usr/lib/locale/locale-archive", flags: 0x00080000, mode: 0x00000000
ls 1024 [005] 52.907746: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x556fb888dcd0 "/usr/bin", flags: 0x00090800, mode: 0x00000000

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.593925979@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

+90 -46
+90 -46
kernel/trace/trace_syscalls.c
··· 468 468 return buf; 469 469 } 470 470 471 + static int 472 + syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, 473 + char **buffer, int *size, int *user_size) 474 + { 475 + struct syscall_user_buffer *sbuf; 476 + 477 + /* If the syscall_buffer is NULL, tracing is being shutdown */ 478 + sbuf = READ_ONCE(syscall_buffer); 479 + if (!sbuf) 480 + return -1; 481 + 482 + *buffer = sys_fault_user(sys_data, sbuf, args, user_size); 483 + /* 484 + * user_size is the amount of data to append. 485 + * Need to add 4 for the meta field that points to 486 + * the user memory at the end of the event and also 487 + * stores its size. 488 + */ 489 + *size = 4 + *user_size; 490 + return 0; 491 + } 492 + 493 + static void syscall_put_data(struct syscall_metadata *sys_data, 494 + struct syscall_trace_enter *entry, 495 + char *buffer, int size) 496 + { 497 + void *ptr; 498 + int val; 499 + 500 + /* 501 + * Set the pointer to point to the meta data of the event 502 + * that has information about the stored user space memory. 503 + */ 504 + ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; 505 + 506 + /* 507 + * The meta data will store the offset of the user data from 508 + * the beginning of the event. 
509 + */ 510 + val = (ptr - (void *)entry) + 4; 511 + 512 + /* Store the offset and the size into the meta data */ 513 + *(int *)ptr = val | (size << 16); 514 + 515 + /* Nothing to do if the user space was empty or faulted */ 516 + if (size) { 517 + /* Now store the user space data into the event */ 518 + ptr += 4; 519 + memcpy(ptr, buffer, size); 520 + } 521 + } 522 + 471 523 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) 472 524 { 473 525 struct trace_array *tr = data; ··· 563 511 syscall_get_arguments(current, regs, args); 564 512 565 513 if (mayfault) { 566 - struct syscall_user_buffer *sbuf; 567 - 568 - /* If the syscall_buffer is NULL, tracing is being shutdown */ 569 - sbuf = READ_ONCE(syscall_buffer); 570 - if (!sbuf) 514 + if (syscall_get_data(sys_data, args, &user_ptr, 515 + &size, &user_size) < 0) 571 516 return; 572 - 573 - user_ptr = sys_fault_user(sys_data, sbuf, args, &user_size); 574 - /* 575 - * user_size is the amount of data to append. 576 - * Need to add 4 for the meta field that points to 577 - * the user memory at the end of the event and also 578 - * stores its size. 579 - */ 580 - size = 4 + user_size; 581 517 } 582 518 583 519 size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; ··· 579 539 580 540 memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); 581 541 582 - if (mayfault) { 583 - void *ptr; 584 - int val; 585 - 586 - /* 587 - * Set the pointer to point to the meta data of the event 588 - * that has information about the stored user space memory. 589 - */ 590 - ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args; 591 - 592 - /* 593 - * The meta data will store the offset of the user data from 594 - * the beginning of the event. 
595 - */ 596 - val = (ptr - (void *)entry) + 4; 597 - 598 - /* Store the offset and the size into the meta data */ 599 - *(int *)ptr = val | (user_size << 16); 600 - 601 - /* Nothing to do if the user space was empty or faulted */ 602 - if (user_size) { 603 - /* Now store the user space data into the event */ 604 - ptr += 4; 605 - memcpy(ptr, user_ptr, user_size); 606 - } 607 - } 542 + if (mayfault) 543 + syscall_put_data(sys_data, entry, user_ptr, user_size); 608 544 609 545 trace_event_buffer_commit(&fbuffer); 610 546 } ··· 1012 996 struct hlist_head *head; 1013 997 unsigned long args[6]; 1014 998 bool valid_prog_array; 999 + bool mayfault; 1000 + char *user_ptr; 1015 1001 int syscall_nr; 1002 + int user_size; 1016 1003 int rctx; 1017 - int size; 1004 + int size = 0; 1018 1005 1019 1006 /* 1020 1007 * Syscall probe called with preemption enabled, but the ring ··· 1036 1017 if (!sys_data) 1037 1018 return; 1038 1019 1020 + syscall_get_arguments(current, regs, args); 1021 + 1022 + /* Check if this syscall event faults in user space memory */ 1023 + mayfault = sys_data->user_mask != 0; 1024 + 1025 + if (mayfault) { 1026 + if (syscall_get_data(sys_data, args, &user_ptr, 1027 + &size, &user_size) < 0) 1028 + return; 1029 + } 1030 + 1039 1031 head = this_cpu_ptr(sys_data->enter_event->perf_events); 1040 1032 valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); 1041 1033 if (!valid_prog_array && hlist_empty(head)) 1042 1034 return; 1043 1035 1044 1036 /* get the size after alignment with the u32 buffer size field */ 1045 - size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 1037 + size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 1046 1038 size = ALIGN(size + sizeof(u32), sizeof(u64)); 1047 1039 size -= sizeof(u32); 1048 1040 ··· 1062 1032 return; 1063 1033 1064 1034 rec->nr = syscall_nr; 1065 - syscall_get_arguments(current, regs, args); 1066 1035 memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); 1036 + 1037 + 
if (mayfault) 1038 + syscall_put_data(sys_data, rec, user_ptr, user_size); 1067 1039 1068 1040 if ((valid_prog_array && 1069 1041 !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || ··· 1081 1049 1082 1050 static int perf_sysenter_enable(struct trace_event_call *call) 1083 1051 { 1052 + struct syscall_metadata *sys_data = call->data; 1084 1053 int num; 1054 + int ret; 1085 1055 1086 - num = ((struct syscall_metadata *)call->data)->syscall_nr; 1056 + num = sys_data->syscall_nr; 1087 1057 1088 1058 guard(mutex)(&syscall_trace_lock); 1059 + if (sys_data->user_mask) { 1060 + ret = syscall_fault_buffer_enable(); 1061 + if (ret < 0) 1062 + return ret; 1063 + } 1089 1064 if (!sys_perf_refcount_enter) { 1090 - int ret = register_trace_sys_enter(perf_syscall_enter, NULL); 1065 + ret = register_trace_sys_enter(perf_syscall_enter, NULL); 1091 1066 if (ret) { 1092 1067 pr_info("event trace: Could not activate syscall entry trace point"); 1068 + if (sys_data->user_mask) 1069 + syscall_fault_buffer_disable(); 1093 1070 return ret; 1094 1071 } 1095 1072 } ··· 1109 1068 1110 1069 static void perf_sysenter_disable(struct trace_event_call *call) 1111 1070 { 1071 + struct syscall_metadata *sys_data = call->data; 1112 1072 int num; 1113 1073 1114 - num = ((struct syscall_metadata *)call->data)->syscall_nr; 1074 + num = sys_data->syscall_nr; 1115 1075 1116 1076 guard(mutex)(&syscall_trace_lock); 1117 1077 sys_perf_refcount_enter--; 1118 1078 clear_bit(num, enabled_perf_enter_syscalls); 1119 1079 if (!sys_perf_refcount_enter) 1120 1080 unregister_trace_sys_enter(perf_syscall_enter, NULL); 1081 + if (sys_data->user_mask) 1082 + syscall_fault_buffer_disable(); 1121 1083 } 1122 1084 1123 1085 static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,