Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tracing: Add a config and syscall_user_buf_size file to limit amount written

When a system call event copies data from a user space address into the
ring buffer, it can copy up to 511 bytes of data. This can waste precious
ring buffer space if the user isn't interested in the output. Add a new
file "syscall_user_buf_size" that gets initialized to a new config
CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT that defaults to 63.

The config also is used to limit how much perf can read from user space.

Also lower the max down to 165, as this isn't meant to record everything
that a system call may be passing through to the kernel. 165 is more than
enough.

The reason for 165 is that adding one byte for the nul terminating byte,
plus four more bytes for possibly appending the "..." string, turns it
into 170 bytes (165 + 1 + 4). This needs to save up to 3 arguments, and
3 * 170 is 510, which fits nicely in 512 bytes (a power of 2).

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231148.260068913@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

+105 -22
+8
Documentation/trace/ftrace.rst
··· 366 366 for each function. The displayed address is the patch-site address 367 367 and can differ from /proc/kallsyms address. 368 368 369 + syscall_user_buf_size: 370 + 371 + Some system call trace events will record the data from a user 372 + space address that one of the parameters point to. The amount of 373 + data per event is limited. This file holds the max number of bytes 374 + that will be recorded into the ring buffer to hold this data. 375 + The max value is currently 165. 376 + 369 377 dyn_ftrace_total_info: 370 378 371 379 This file is for debugging purposes. The number of functions that
+14
kernel/trace/Kconfig
··· 575 575 help 576 576 Basic tracer to catch the syscall entry and exit events. 577 577 578 + config TRACE_SYSCALL_BUF_SIZE_DEFAULT 579 + int "System call user read max size" 580 + range 0 165 581 + default 63 582 + depends on FTRACE_SYSCALLS 583 + help 584 + Some system call trace events will record the data from a user 585 + space address that one of the parameters point to. The amount of 586 + data per event is limited. That limit is set by this config and 587 + this config also affects how much user space data perf can read. 588 + 589 + For a tracing instance, this size may be changed by writing into 590 + its syscall_user_buf_size file. 591 + 578 592 config TRACER_SNAPSHOT 579 593 bool "Create a snapshot trace buffer" 580 594 select TRACER_MAX_TRACE
+52
kernel/trace/trace.c
··· 6912 6912 } 6913 6913 6914 6914 static ssize_t 6915 + tracing_syscall_buf_read(struct file *filp, char __user *ubuf, 6916 + size_t cnt, loff_t *ppos) 6917 + { 6918 + struct inode *inode = file_inode(filp); 6919 + struct trace_array *tr = inode->i_private; 6920 + char buf[64]; 6921 + int r; 6922 + 6923 + r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz); 6924 + 6925 + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 6926 + } 6927 + 6928 + static ssize_t 6929 + tracing_syscall_buf_write(struct file *filp, const char __user *ubuf, 6930 + size_t cnt, loff_t *ppos) 6931 + { 6932 + struct inode *inode = file_inode(filp); 6933 + struct trace_array *tr = inode->i_private; 6934 + unsigned long val; 6935 + int ret; 6936 + 6937 + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 6938 + if (ret) 6939 + return ret; 6940 + 6941 + if (val > SYSCALL_FAULT_USER_MAX) 6942 + val = SYSCALL_FAULT_USER_MAX; 6943 + 6944 + tr->syscall_buf_sz = val; 6945 + 6946 + *ppos += cnt; 6947 + 6948 + return cnt; 6949 + } 6950 + 6951 + static ssize_t 6915 6952 tracing_entries_read(struct file *filp, char __user *ubuf, 6916 6953 size_t cnt, loff_t *ppos) 6917 6954 { ··· 8076 8039 .open = tracing_open_generic_tr, 8077 8040 .read = tracing_entries_read, 8078 8041 .write = tracing_entries_write, 8042 + .llseek = generic_file_llseek, 8043 + .release = tracing_release_generic_tr, 8044 + }; 8045 + 8046 + static const struct file_operations tracing_syscall_buf_fops = { 8047 + .open = tracing_open_generic_tr, 8048 + .read = tracing_syscall_buf_read, 8049 + .write = tracing_syscall_buf_write, 8079 8050 .llseek = generic_file_llseek, 8080 8051 .release = tracing_release_generic_tr, 8081 8052 }; ··· 10190 10145 10191 10146 raw_spin_lock_init(&tr->start_lock); 10192 10147 10148 + tr->syscall_buf_sz = global_trace.syscall_buf_sz; 10149 + 10193 10150 tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 10194 10151 #ifdef CONFIG_TRACER_MAX_TRACE 10195 10152 spin_lock_init(&tr->snapshot_trigger_lock); 
··· 10507 10460 10508 10461 trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, 10509 10462 tr, &buffer_subbuf_size_fops); 10463 + 10464 + trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer, 10465 + tr, &tracing_syscall_buf_fops); 10510 10466 10511 10467 create_trace_options_dir(tr); 10512 10468 ··· 11435 11385 register_die_notifier(&trace_die_notifier); 11436 11386 11437 11387 global_trace.flags = TRACE_ARRAY_FL_GLOBAL; 11388 + 11389 + global_trace.syscall_buf_sz = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; 11438 11390 11439 11391 INIT_LIST_HEAD(&global_trace.systems); 11440 11392 INIT_LIST_HEAD(&global_trace.events);
+3
kernel/trace/trace.h
··· 131 131 #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) 132 132 #define HIST_STACKTRACE_SKIP 5 133 133 134 + #define SYSCALL_FAULT_USER_MAX 165 135 + 134 136 /* 135 137 * syscalls are special, and need special handling, this is why 136 138 * they are not included in trace_entries.h ··· 432 430 int function_enabled; 433 431 #endif 434 432 int no_filter_buffering_ref; 433 + unsigned int syscall_buf_sz; 435 434 struct list_head hist_vars; 436 435 #ifdef CONFIG_TRACER_SNAPSHOT 437 436 struct cond_snapshot *cond_snapshot;
+28 -22
kernel/trace/trace_syscalls.c
··· 390 390 /* 391 391 * Create a per CPU temporary buffer to copy user space pointers into. 392 392 * 393 + * SYSCALL_FAULT_USER_MAX is the amount to copy from user space. 394 + * (defined in kernel/trace/trace.h) 395 + 396 + * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the 397 + * nul terminating byte and possibly appended EXTRA (4 bytes). 398 + * 393 399 * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use 394 - * to copy memory from user space addresses into. 395 - * 396 - * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space. 397 - * 398 - * SYSCALL_FAULT_USER_MAX is the amount to copy into the ring buffer. 399 - * It's slightly smaller than SYSCALL_FAULT_ARG_SZ to know if it 400 - * needs to append the EXTRA or not. 401 - * 402 - * This only allows up to 3 args from system calls. 400 + * to copy memory from user space addresses into that will hold 401 + * 3 args as only 3 args are allowed to be copied from system calls. 403 402 */ 404 - #define SYSCALL_FAULT_BUF_SZ 512 405 - #define SYSCALL_FAULT_ARG_SZ 168 406 - #define SYSCALL_FAULT_USER_MAX 128 403 + #define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4) 407 404 #define SYSCALL_FAULT_MAX_CNT 3 405 + #define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT) 408 406 409 407 /* Use the tracing per CPU buffer infrastructure to copy from user space */ 410 408 struct syscall_user_buffer { ··· 496 498 return 0; 497 499 } 498 500 499 - static char *sys_fault_user(struct syscall_metadata *sys_data, 501 + static char *sys_fault_user(unsigned int buf_size, 502 + struct syscall_metadata *sys_data, 500 503 struct syscall_user_buffer *sbuf, 501 504 unsigned long *args, 502 505 unsigned int data_size[SYSCALL_FAULT_MAX_CNT]) ··· 547 548 data_size[i] = -1; /* Denotes no pointer */ 548 549 } 549 550 551 + /* A zero size means do not even try */ 552 + if (!buf_size) 553 + return NULL; 554 + 550 555 buffer = trace_user_fault_read(&sbuf->buf, NULL, size, 551 
556 syscall_copy, &sargs); 552 557 if (!buffer) ··· 571 568 buf[x] = '.'; 572 569 } 573 570 571 + size = min(buf_size, SYSCALL_FAULT_USER_MAX); 572 + 574 573 /* 575 574 * If the text was truncated due to our max limit, 576 575 * add "..." to the string. 577 576 */ 578 - if (ret > SYSCALL_FAULT_USER_MAX) { 579 - strscpy(buf + SYSCALL_FAULT_USER_MAX, EXTRA, 580 - sizeof(EXTRA)); 581 - ret = SYSCALL_FAULT_USER_MAX + sizeof(EXTRA); 577 + if (ret > size) { 578 + strscpy(buf + size, EXTRA, sizeof(EXTRA)); 579 + ret = size + sizeof(EXTRA); 582 580 } else { 583 581 buf[ret++] = '\0'; 584 582 } 585 583 } else { 586 - ret = min(ret, SYSCALL_FAULT_USER_MAX); 584 + ret = min((unsigned int)ret, buf_size); 587 585 } 588 586 data_size[i] = ret; 589 587 } ··· 594 590 595 591 static int 596 592 syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args, 597 - char **buffer, int *size, int *user_sizes, int *uargs) 593 + char **buffer, int *size, int *user_sizes, int *uargs, 594 + int buf_size) 598 595 { 599 596 struct syscall_user_buffer *sbuf; 600 597 int i; ··· 605 600 if (!sbuf) 606 601 return -1; 607 602 608 - *buffer = sys_fault_user(sys_data, sbuf, args, user_sizes); 603 + *buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes); 609 604 /* 610 605 * user_size is the amount of data to append. 
611 606 * Need to add 4 for the meta field that points to ··· 710 705 711 706 if (mayfault) { 712 707 if (syscall_get_data(sys_data, args, &user_ptr, 713 - &size, user_sizes, &uargs) < 0) 708 + &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0) 714 709 return; 715 710 } 716 711 ··· 1209 1204 bool mayfault; 1210 1205 char *user_ptr; 1211 1206 int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; 1207 + int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; 1212 1208 int syscall_nr; 1213 1209 int rctx; 1214 1210 int size = 0; ··· 1239 1233 1240 1234 if (mayfault) { 1241 1235 if (syscall_get_data(sys_data, args, &user_ptr, 1242 - &size, user_sizes, &uargs) < 0) 1236 + &size, user_sizes, &uargs, buf_size) < 0) 1243 1237 return; 1244 1238 } 1245 1239