Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tracing: Have system call events record user array data

For system call events that have a length field, add a "user_arg_size"
field to the system call meta data. It holds the index into the args
array of the argument that gives the size of the user space data pointed
to by the argument whose bit is set in the user_mask field.

The "user_mask" has a bit set for the arg that points to an array in the
user space address space. If a system call event has both the user_mask
field and the user_arg_size field set, it will record the content at that
address into the trace event, up to the size defined by
SYSCALL_FAULT_BUF_SZ - 1.

This allows the output to look like:

sys_write(fd: 0xa, buf: 0x5646978d13c0 (01:00:05:00:00:00:00:00:01:87:55:89:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00), count: 0x20)

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.763528474@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

+90 -35
+3 -1
include/trace/syscall.h
··· 16 16 * @name: name of the syscall 17 17 * @syscall_nr: number of the syscall 18 18 * @nb_args: number of parameters it takes 19 + * @user_arg_size: holds @arg that has size of the user space to read 19 20 * @user_mask: mask of @args that will read user space 20 21 * @types: list of types as strings 21 22 * @args: list of args as strings (args[i] matches types[i]) ··· 27 26 struct syscall_metadata { 28 27 const char *name; 29 28 int syscall_nr; 30 - short nb_args; 29 + u8 nb_args; 30 + s8 user_arg_size; 31 31 short user_mask; 32 32 const char **types; 33 33 const char **args;
+87 -34
kernel/trace/trace_syscalls.c
··· 124 124 return entry->name; 125 125 } 126 126 127 - /* Added to user strings when max limit is reached */ 127 + /* Added to user strings or arrays when max limit is reached */ 128 128 #define EXTRA "..." 129 129 130 130 static enum print_line_t ··· 136 136 struct trace_entry *ent = iter->ent; 137 137 struct syscall_trace_enter *trace; 138 138 struct syscall_metadata *entry; 139 - int i, syscall, val; 139 + int i, syscall, val, len; 140 140 unsigned char *ptr; 141 - int len; 142 141 143 142 trace = (typeof(trace))ent; 144 143 syscall = trace->nr; ··· 184 185 ptr = (void *)ent + (val & 0xffff); 185 186 len = val >> 16; 186 187 187 - trace_seq_printf(s, " \"%.*s\"", len, ptr); 188 + if (entry->user_arg_size < 0) { 189 + trace_seq_printf(s, " \"%.*s\"", len, ptr); 190 + continue; 191 + } 192 + 193 + val = trace->args[entry->user_arg_size]; 194 + 195 + trace_seq_puts(s, " ("); 196 + for (int x = 0; x < len; x++, ptr++) { 197 + if (x) 198 + trace_seq_putc(s, ':'); 199 + trace_seq_printf(s, "%02x", *ptr); 200 + } 201 + if (len < val) 202 + trace_seq_printf(s, ", %s", EXTRA); 203 + 204 + trace_seq_putc(s, ')'); 188 205 } 189 206 190 207 trace_seq_putc(s, ')'); ··· 265 250 if (!(BIT(i) & entry->user_mask)) 266 251 continue; 267 252 268 - /* Add the format for the user space string */ 269 - pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); 253 + /* Add the format for the user space string or array */ 254 + if (entry->user_arg_size < 0) 255 + pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\""); 256 + else 257 + pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)"); 270 258 } 271 259 pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); 272 260 ··· 278 260 ", ((unsigned long)(REC->%s))", entry->args[i]); 279 261 if (!(BIT(i) & entry->user_mask)) 280 262 continue; 281 - /* The user space string for arg has name __<arg>_val */ 282 - pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", 283 - entry->args[i]); 263 + /* The user space data for arg has name 
__<arg>_val */ 264 + if (entry->user_arg_size < 0) { 265 + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)", 266 + entry->args[i]); 267 + } else { 268 + pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)", 269 + entry->args[i]); 270 + } 284 271 } 285 272 286 273 #undef LEN_OR_ZERO ··· 356 333 idx = ffs(mask) - 1; 357 334 358 335 /* 359 - * User space strings are faulted into a temporary buffer and then 360 - * added as a dynamic string to the end of the event. 361 - * The user space string name for the arg pointer is "__<arg>_val". 336 + * User space data is faulted into a temporary buffer and then 337 + * added as a dynamic string or array to the end of the event. 338 + * The user space data name for the arg pointer is "__<arg>_val". 362 339 */ 363 340 len = strlen(meta->args[idx]) + sizeof("___val"); 364 341 arg = kmalloc(len, GFP_KERNEL); ··· 454 431 struct syscall_user_buffer *sbuf, 455 432 unsigned long *args, unsigned int *data_size) 456 433 { 434 + trace_user_buf_copy syscall_copy = syscall_copy_user; 457 435 unsigned long size = SYSCALL_FAULT_BUF_SZ - 1; 458 436 unsigned long mask = sys_data->user_mask; 459 437 int idx = ffs(mask) - 1; 438 + bool array = false; 460 439 char *ptr; 461 440 char *buf; 462 441 ··· 466 441 ptr = (char *)args[idx]; 467 442 *data_size = 0; 468 443 444 + /* 445 + * If this system call event has a size argument, use 446 + * it to define how much of user space memory to read, 447 + * and read it as an array and not a string. 
448 + */ 449 + if (sys_data->user_arg_size >= 0) { 450 + array = true; 451 + size = args[sys_data->user_arg_size]; 452 + if (size > SYSCALL_FAULT_BUF_SZ - 1) 453 + size = SYSCALL_FAULT_BUF_SZ - 1; 454 + /* use normal copy_from_user() */ 455 + syscall_copy = NULL; 456 + } 457 + 469 458 buf = trace_user_fault_read(&sbuf->buf, ptr, size, 470 - syscall_copy_user, &size); 459 + syscall_copy, &size); 471 460 if (!buf) 472 461 return NULL; 473 462 474 - /* Replace any non-printable characters with '.' */ 475 - for (int i = 0; i < size; i++) { 476 - if (!isprint(buf[i])) 477 - buf[i] = '.'; 478 - } 463 + /* For strings, replace any non-printable characters with '.' */ 464 + if (!array) { 465 + for (int i = 0; i < size; i++) { 466 + if (!isprint(buf[i])) 467 + buf[i] = '.'; 468 + } 479 469 480 - /* 481 - * If the text was truncated due to our max limit, add "..." to 482 - * the string. 483 - */ 484 - if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) { 485 - strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA), 486 - EXTRA, sizeof(EXTRA)); 487 - size = SYSCALL_FAULT_BUF_SZ; 488 - } else { 489 - buf[size++] = '\0'; 470 + /* 471 + * If the text was truncated due to our max limit, add "..." to 472 + * the string. 
473 + */ 474 + if (size > SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA)) { 475 + strscpy(buf + SYSCALL_FAULT_BUF_SZ - sizeof(EXTRA), 476 + EXTRA, sizeof(EXTRA)); 477 + size = SYSCALL_FAULT_BUF_SZ; 478 + } else { 479 + buf[size++] = '\0'; 480 + } 490 481 } 491 482 492 483 *data_size = size; ··· 533 492 534 493 static void syscall_put_data(struct syscall_metadata *sys_data, 535 494 struct syscall_trace_enter *entry, 536 - char *buffer, int size) 495 + char *buffer, int size, int user_size) 537 496 { 538 497 void *ptr; 539 498 int val; ··· 551 510 val = (ptr - (void *)entry) + 4; 552 511 553 512 /* Store the offset and the size into the meta data */ 554 - *(int *)ptr = val | (size << 16); 513 + *(int *)ptr = val | (user_size << 16); 514 + 515 + if (WARN_ON_ONCE((ptr - (void *)entry + user_size) > size)) 516 + user_size = 0; 555 517 556 518 /* Nothing to do if the user space was empty or faulted */ 557 - if (size) { 519 + if (user_size) { 558 520 /* Now store the user space data into the event */ 559 521 ptr += 4; 560 - memcpy(ptr, buffer, size); 522 + memcpy(ptr, buffer, user_size); 561 523 } 562 524 } 563 525 ··· 624 580 memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); 625 581 626 582 if (mayfault) 627 - syscall_put_data(sys_data, entry, user_ptr, user_size); 583 + syscall_put_data(sys_data, entry, user_ptr, size, user_size); 628 584 629 585 trace_event_buffer_commit(&fbuffer); 630 586 } ··· 771 727 if (sys_data->enter_event != call) 772 728 return; 773 729 730 + sys_data->user_arg_size = -1; 731 + 774 732 switch (nr) { 733 + /* user arg 1 with size arg at 2 */ 734 + case __NR_write: 735 + case __NR_mq_timedsend: 736 + case __NR_pwrite64: 737 + sys_data->user_mask = BIT(1); 738 + sys_data->user_arg_size = 2; 739 + break; 775 740 /* user arg at position 0 */ 776 741 #ifdef __NR_access 777 742 case __NR_access: ··· 1118 1065 memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); 1119 1066 1120 1067 if (mayfault) 1121 - 
syscall_put_data(sys_data, rec, user_ptr, user_size); 1068 + syscall_put_data(sys_data, rec, user_ptr, size, user_size); 1122 1069 1123 1070 if ((valid_prog_array && 1124 1071 !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||