Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tracing: Allow the top level trace_marker to write into other instances

There are applications that are hard coded to write into the top level
trace_marker instance (/sys/kernel/tracing/trace_marker). This can be
annoying if a profiler is using that instance for other work, or if it
needs all writes to go into a new instance.

A new option called "copy_trace_marker" is created. By default, the top
level has this set, as that is the default buffer that writing into the
top level trace_marker file will go to. But now if an instance is created
and sets this option, all writes into the top level trace_marker will also
be written into that instance's buffer just as if an application were to
write into the instance's trace_marker file.

If the top level instance disables this option, then writes to its own
trace_marker and trace_marker_raw files will not go into its buffer.

If no instance has this option set, then the write will return an error
and errno will contain ENODEV.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250508095639.39f84eda@gandalf.local.home
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

+128 -31
+13
Documentation/trace/ftrace.rst
··· 1205 1205 default instance. The only way the top level instance has this flag 1206 1206 cleared, is by it being set in another instance. 1207 1207 1208 + copy_trace_marker 1209 + If there are applications that hard code writing into the top level 1210 + trace_marker file (/sys/kernel/tracing/trace_marker or trace_marker_raw), 1211 + and the tooling would like it to go into an instance, this option can 1212 + be used. Create an instance and set this option, and then all writes 1213 + into the top level trace_marker file will also be redirected into this 1214 + instance. 1215 + 1216 + Note, by default this option is set for the top level instance. If it 1217 + is disabled, then writes to the trace_marker or trace_marker_raw files 1218 + will not be written into the top level file. If no instance has this 1219 + option set, then a write will error with the errno of ENODEV. 1220 + 1208 1221 annotate 1209 1222 It is sometimes confusing when the CPU buffers are full 1210 1223 and one CPU buffer had a lot of events recently, thus
+113 -31
kernel/trace/trace.c
··· 493 493 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ 494 494 TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ 495 495 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ 496 - TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK) 496 + TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK | \ 497 + TRACE_ITER_COPY_MARKER) 497 498 498 499 /* trace_options that are only supported by global_trace */ 499 500 #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ ··· 502 501 503 502 /* trace_flags that are default zero for instances */ 504 503 #define ZEROED_TRACE_FLAGS \ 505 - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK) 504 + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \ 505 + TRACE_ITER_COPY_MARKER) 506 506 507 507 /* 508 508 * The global_trace is the descriptor that holds the top-level tracing ··· 514 512 }; 515 513 516 514 static struct trace_array *printk_trace = &global_trace; 515 + 516 + /* List of trace_arrays interested in the top level trace_marker */ 517 + static LIST_HEAD(marker_copies); 517 518 518 519 static __always_inline bool printk_binsafe(struct trace_array *tr) 519 520 { ··· 537 532 printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK; 538 533 printk_trace = tr; 539 534 tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; 535 + } 536 + 537 + /* Returns true if the status of tr changed */ 538 + static bool update_marker_trace(struct trace_array *tr, int enabled) 539 + { 540 + lockdep_assert_held(&event_mutex); 541 + 542 + if (enabled) { 543 + if (!list_empty(&tr->marker_list)) 544 + return false; 545 + 546 + list_add_rcu(&tr->marker_list, &marker_copies); 547 + tr->trace_flags |= TRACE_ITER_COPY_MARKER; 548 + return true; 549 + } 550 + 551 + if (list_empty(&tr->marker_list)) 552 + return false; 553 + 554 + list_del_init(&tr->marker_list); 555 + tr->trace_flags &= ~TRACE_ITER_COPY_MARKER; 556 + return true; 540 557 } 541 558 542 559 void trace_set_ring_buffer_expanded(struct trace_array *tr) ··· 5247 5220 { 5248 5221 
if ((mask == TRACE_ITER_RECORD_TGID) || 5249 5222 (mask == TRACE_ITER_RECORD_CMD) || 5250 - (mask == TRACE_ITER_TRACE_PRINTK)) 5223 + (mask == TRACE_ITER_TRACE_PRINTK) || 5224 + (mask == TRACE_ITER_COPY_MARKER)) 5251 5225 lockdep_assert_held(&event_mutex); 5252 5226 5253 5227 /* do nothing if flag is already set */ ··· 5278 5250 update_printk_trace(&global_trace); 5279 5251 } 5280 5252 } 5253 + 5254 + if (mask == TRACE_ITER_COPY_MARKER) 5255 + update_marker_trace(tr, enabled); 5281 5256 5282 5257 if (enabled) 5283 5258 tr->trace_flags |= mask; ··· 7165 7134 7166 7135 #define TRACE_MARKER_MAX_SIZE 4096 7167 7136 7168 - static ssize_t 7169 - tracing_mark_write(struct file *filp, const char __user *ubuf, 7170 - size_t cnt, loff_t *fpos) 7137 + static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf, 7138 + size_t cnt, unsigned long ip) 7171 7139 { 7172 - struct trace_array *tr = filp->private_data; 7173 7140 struct ring_buffer_event *event; 7174 7141 enum event_trigger_type tt = ETT_NONE; 7175 7142 struct trace_buffer *buffer; ··· 7180 7151 /* Used in tracing_mark_raw_write() as well */ 7181 7152 #define FAULTED_STR "<faulted>" 7182 7153 #define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ 7183 - 7184 - if (tracing_disabled) 7185 - return -EINVAL; 7186 - 7187 - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) 7188 - return -EINVAL; 7189 - 7190 - if ((ssize_t)cnt < 0) 7191 - return -EINVAL; 7192 - 7193 - if (cnt > TRACE_MARKER_MAX_SIZE) 7194 - cnt = TRACE_MARKER_MAX_SIZE; 7195 7154 7196 7155 meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ 7197 7156 again: ··· 7213 7196 } 7214 7197 7215 7198 entry = ring_buffer_event_data(event); 7216 - entry->ip = _THIS_IP_; 7199 + entry->ip = ip; 7217 7200 7218 7201 len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); 7219 7202 if (len) { ··· 7246 7229 } 7247 7230 7248 7231 static ssize_t 7249 - tracing_mark_raw_write(struct file *filp, const char __user 
*ubuf, 7232 + tracing_mark_write(struct file *filp, const char __user *ubuf, 7250 7233 size_t cnt, loff_t *fpos) 7251 7234 { 7252 7235 struct trace_array *tr = filp->private_data; 7236 + ssize_t written = -ENODEV; 7237 + unsigned long ip; 7238 + 7239 + if (tracing_disabled) 7240 + return -EINVAL; 7241 + 7242 + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) 7243 + return -EINVAL; 7244 + 7245 + if ((ssize_t)cnt < 0) 7246 + return -EINVAL; 7247 + 7248 + if (cnt > TRACE_MARKER_MAX_SIZE) 7249 + cnt = TRACE_MARKER_MAX_SIZE; 7250 + 7251 + /* The selftests expect this function to be the IP address */ 7252 + ip = _THIS_IP_; 7253 + 7254 + /* The global trace_marker can go to multiple instances */ 7255 + if (tr == &global_trace) { 7256 + guard(rcu)(); 7257 + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { 7258 + written = write_marker_to_buffer(tr, ubuf, cnt, ip); 7259 + if (written < 0) 7260 + break; 7261 + } 7262 + } else { 7263 + written = write_marker_to_buffer(tr, ubuf, cnt, ip); 7264 + } 7265 + 7266 + return written; 7267 + } 7268 + 7269 + static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, 7270 + const char __user *ubuf, size_t cnt) 7271 + { 7253 7272 struct ring_buffer_event *event; 7254 7273 struct trace_buffer *buffer; 7255 7274 struct raw_data_entry *entry; ··· 7294 7241 int len; 7295 7242 7296 7243 #define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) 7297 - 7298 - if (tracing_disabled) 7299 - return -EINVAL; 7300 - 7301 - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) 7302 - return -EINVAL; 7303 - 7304 - /* The marker must at least have a tag id */ 7305 - if (cnt < sizeof(unsigned int)) 7306 - return -EINVAL; 7307 7244 7308 7245 size = sizeof(*entry) + cnt; 7309 7246 if (cnt < FAULT_SIZE_ID) ··· 7321 7278 written = cnt; 7322 7279 7323 7280 __buffer_unlock_commit(buffer, event); 7281 + 7282 + return written; 7283 + } 7284 + 7285 + static ssize_t 7286 + tracing_mark_raw_write(struct file *filp, const char __user *ubuf, 7287 + size_t cnt, loff_t 
*fpos) 7288 + { 7289 + struct trace_array *tr = filp->private_data; 7290 + ssize_t written = -ENODEV; 7291 + 7292 + #define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) 7293 + 7294 + if (tracing_disabled) 7295 + return -EINVAL; 7296 + 7297 + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) 7298 + return -EINVAL; 7299 + 7300 + /* The marker must at least have a tag id */ 7301 + if (cnt < sizeof(unsigned int)) 7302 + return -EINVAL; 7303 + 7304 + /* The global trace_marker_raw can go to multiple instances */ 7305 + if (tr == &global_trace) { 7306 + guard(rcu)(); 7307 + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { 7308 + written = write_raw_marker_to_buffer(tr, ubuf, cnt); 7309 + if (written < 0) 7310 + break; 7311 + } 7312 + } else { 7313 + written = write_raw_marker_to_buffer(tr, ubuf, cnt); 7314 + } 7324 7315 7325 7316 return written; 7326 7317 } ··· 9852 9775 INIT_LIST_HEAD(&tr->events); 9853 9776 INIT_LIST_HEAD(&tr->hist_vars); 9854 9777 INIT_LIST_HEAD(&tr->err_log); 9778 + INIT_LIST_HEAD(&tr->marker_list); 9855 9779 9856 9780 #ifdef CONFIG_MODULES 9857 9781 INIT_LIST_HEAD(&tr->mod_events); ··· 10011 9933 10012 9934 if (printk_trace == tr) 10013 9935 update_printk_trace(&global_trace); 9936 + 9937 + if (update_marker_trace(tr, 0)) 9938 + synchronize_rcu(); 10014 9939 10015 9940 tracing_set_nop(tr); 10016 9941 clear_ftrace_function_probes(tr); ··· 11080 10999 INIT_LIST_HEAD(&global_trace.events); 11081 11000 INIT_LIST_HEAD(&global_trace.hist_vars); 11082 11001 INIT_LIST_HEAD(&global_trace.err_log); 11002 + list_add(&global_trace.marker_list, &marker_copies); 11083 11003 list_add(&global_trace.list, &ftrace_trace_arrays); 11084 11004 11085 11005 apply_trace_boot_options();
+2
kernel/trace/trace.h
··· 403 403 struct trace_options *topts; 404 404 struct list_head systems; 405 405 struct list_head events; 406 + struct list_head marker_list; 406 407 struct trace_event_file *trace_marker_file; 407 408 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ 408 409 /* one per_cpu trace_pipe can be opened by only one user */ ··· 1385 1384 C(MARKERS, "markers"), \ 1386 1385 C(EVENT_FORK, "event-fork"), \ 1387 1386 C(TRACE_PRINTK, "trace_printk_dest"), \ 1387 + C(COPY_MARKER, "copy_trace_marker"),\ 1388 1388 C(PAUSE_ON_TRACE, "pause-on-trace"), \ 1389 1389 C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ 1390 1390 FUNCTION_FLAGS \